// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Implementation details of various Bloom filter implementations used in
// RocksDB. (DynamicBloom is in a separate file for now because it
// supports concurrent write.)

#pragma once
#include <stddef.h>
#include <stdint.h>
#include <cmath>

#include "rocksdb/slice.h"
#include "util/hash.h"

#ifdef HAVE_AVX2
#include <immintrin.h>
#endif

namespace rocksdb {

class BloomMath {
 public:
  // False positive rate of a standard Bloom filter, for given ratio of
  // filter memory bits to added keys, and number of probes per operation.
  // (The false positive rate is effectively independent of scale, assuming
  // the implementation scales OK.)
  static double StandardFpRate(double bits_per_key, int num_probes) {
    // Standard very-good-estimate formula. See
    // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
    return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
  }

  // False positive rate of a "blocked"/"sharded"/"cache-local" Bloom filter,
  // for given ratio of filter memory bits to added keys, number of probes per
  // operation (all within the given block or cache line size), and block or
  // cache line size.
  static double CacheLocalFpRate(double bits_per_key, int num_probes,
                                 int cache_line_bits) {
    double keys_per_cache_line = cache_line_bits / bits_per_key;
    // A reasonable estimate is the average of the FP rates for one standard
    // deviation above and below the mean bucket occupancy. See
    // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math
    double keys_stddev = std::sqrt(keys_per_cache_line);
    double crowded_fp = StandardFpRate(
        cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes);
    double uncrowded_fp = StandardFpRate(
        cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes);
    return (crowded_fp + uncrowded_fp) / 2;
  }
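
  // Worked example for the two estimates above (illustrative numbers,
  // rounded): at 10 bits/key with num_probes = 6, StandardFpRate gives
  //   (1 - e^(-6/10))^6 ≈ 0.0084, i.e. about 0.84% FP rate.
  // For CacheLocalFpRate with 512-bit cache lines, keys_per_cache_line =
  // 512 / 10 = 51.2 and keys_stddev ≈ 7.2, so the estimate averages the
  // standard rates for ~8.8 and ~11.6 effective bits/key:
  //   (0.0148 + 0.0043) / 2 ≈ 0.0095, i.e. about 0.95% FP rate,
  // in line with the ~0.9535% theoretical best quoted for
  // FastLocalBloomImpl below.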

  // False positive rate of querying a new item against `num_keys` items, all
  // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes
  // themselves are stored losslessly. See Section 4 of
  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
  static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) {
    double inv_fingerprint_space = std::pow(0.5, fingerprint_bits);
    // Base estimate assumes each key maps to a unique fingerprint.
    // Could be > 1 in extreme cases.
    double base_estimate = num_keys * inv_fingerprint_space;
    // To account for potential overlap, we choose between two formulas
    if (base_estimate > 0.0001) {
      // A very good formula assuming we don't construct a floating point
      // number extremely close to 1. Always produces a probability < 1.
      return 1.0 - std::exp(-base_estimate);
    } else {
      // A very good formula when base_estimate is far below 1. (Subtract
      // away the integral-approximated sum that some key has same hash as
      // one coming before it in a list.)
      return base_estimate - (base_estimate * base_estimate * 0.5);
    }
  }

  // Returns the probability of either of two independent(-ish) events
  // happening, given their probabilities. (This is useful for combining
  // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate
  // for a hash-efficient Bloom filter's FP rate. See Section 4 of
  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
  static double IndependentProbabilitySum(double rate1, double rate2) {
    // Use formula that avoids floating point extremely close to 1 if
    // rates are extremely small.
    return rate1 + rate2 - (rate1 * rate2);
  }
};
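
// Worked example of combining the BloomMath helpers (illustrative, with a
// hypothetical key count): querying against 40 million keys hashed to 32-bit
// values gives FingerprintFpRate(40000000, 32) = 1 - e^(-4e7 / 2^32) ≈ 0.0093,
// about the same as the ~0.0095 cache-local filter rate at 10 bits/key from
// the example above. IndependentProbabilitySum(0.0095, 0.0093) ≈ 0.019, so a
// 32-bit hash roughly doubles the overall FP rate at that scale. This is the
// ~40 million key "inflection point" mentioned for FastLocalBloomImpl below.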

// A fast, flexible, and accurate cache-local Bloom implementation with
// SIMD-optimized query performance (currently using AVX2 on Intel). Write
// performance and non-SIMD read are very good, benefiting from fastrange32
// used in place of % and single-cycle multiplication on recent processors.
//
// Most other SIMD Bloom implementations sacrifice flexibility and/or
// accuracy by requiring num_probes to be a power of two and restricting
// where each probe can occur in a cache line. This implementation sacrifices
// SIMD-optimization for add (might still be possible, especially with AVX512)
// in favor of allowing any num_probes, not crossing cache line boundary,
// and accuracy close to theoretical best accuracy for a cache-local Bloom.
// E.g. theoretical best for 10 bits/key, num_probes=6, and 512-bit bucket
// (Intel cache line size) is 0.9535% FP rate. This implementation yields
// about 0.957%. (Compare to LegacyLocalityBloomImpl<false> at 1.138%, or
// about 0.951% for 1024-bit buckets, cache line size for some ARM CPUs.)
//
// This implementation can use a 32-bit hash (let h2 be h1 * 0x9e3779b9) or
// a 64-bit hash (split into two uint32s). With many millions of keys, the
// false positive rate associated with using a 32-bit hash can dominate the
// false positive rate of the underlying filter. At 10 bits/key setting, the
// inflection point is about 40 million keys, so 32-bit hash is a bad idea
// with 10s of millions of keys or more.
//
// Despite accepting a 64-bit hash, this implementation uses 32-bit fastrange
// to pick a cache line, which can be faster than 64-bit in some cases.
// This only hurts accuracy as you get into 10s of GB for a single filter,
// and accuracy abruptly breaks down at 256GB (2^32 cache lines). Switch to
// 64-bit fastrange if you need filters so big. ;)
//
// Using only a 32-bit input hash within each cache line has negligible
// impact for any reasonable cache line / bucket size, for arbitrary filter
// size, and potentially saves intermediate data size in some cases vs.
// tracking full 64 bits. (Even in an implementation using 64-bit arithmetic
// to generate indices, I might do the same, as a single multiplication
// suffices to generate a sufficiently mixed 64 bits from 32 bits.)
//
// This implementation is currently tied to Intel cache line size, 64 bytes ==
// 512 bits. If there's sufficient demand for other cache line sizes, this is
// a pretty good implementation to extend, but slight performance enhancements
// are possible with an alternate implementation (probably not very compatible
// with SIMD):
// (1) Use rotation in addition to multiplication for remixing
// (like murmur hash). (Using multiplication alone *slightly* hurts accuracy
// because lower bits never depend on original upper bits.)
// (2) Extract more than one bit index from each re-mix. (Only if rotation
// or similar is part of remix, because otherwise you're making the
// multiplication-only problem worse.)
// (3) Re-mix full 64 bit hash, to get maximum number of bit indices per
// re-mix.
//
class FastLocalBloomImpl {
 public:
  // NOTE: this has only been validated to enough accuracy for producing
  // reasonable warnings / user feedback, not for making functional decisions.
  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes,
                                int hash_bits) {
    return BloomMath::IndependentProbabilitySum(
        BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes,
                                    /*cache line bits*/ 512),
        BloomMath::FingerprintFpRate(keys, hash_bits));
  }

  static inline int ChooseNumProbes(int millibits_per_key) {
    // Since this implementation can (with AVX2) make up to 8 probes
    // for the same cost, we pick the most accurate num_probes, based
    // on actual tests of the implementation. Note that for higher
    // bits/key, the best choice for cache-local Bloom can be notably
    // smaller than standard Bloom, e.g. 9 instead of 11 @ 16 b/k.
    if (millibits_per_key <= 2080) {
      return 1;
    } else if (millibits_per_key <= 3580) {
      return 2;
    } else if (millibits_per_key <= 5100) {
      return 3;
    } else if (millibits_per_key <= 6640) {
      return 4;
    } else if (millibits_per_key <= 8300) {
      return 5;
    } else if (millibits_per_key <= 10070) {
      return 6;
    } else if (millibits_per_key <= 11720) {
      return 7;
    } else if (millibits_per_key <= 14001) {
      // Would be something like <= 13800 but sacrificing *slightly* for
      // more settings using <= 8 probes.
      return 8;
    } else if (millibits_per_key <= 16050) {
      return 9;
    } else if (millibits_per_key <= 18300) {
      return 10;
    } else if (millibits_per_key <= 22001) {
      return 11;
    } else if (millibits_per_key <= 25501) {
      return 12;
    } else if (millibits_per_key > 50000) {
      // Top out at 24 probes (three sets of 8)
      return 24;
    } else {
      // Roughly optimal choices for remaining range
      // e.g.
      // 28000 -> 12, 28001 -> 13
      // 50000 -> 23, 50001 -> 24
      return (millibits_per_key - 1) / 2000 - 1;
    }
  }
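
  // Worked examples for the thresholds above (illustrative): the common
  // 10 bits/key setting is millibits_per_key = 10000, which falls in the
  // <= 10070 bucket and selects 6 probes; 16 bits/key (16000 millibits)
  // falls in the <= 16050 bucket and selects 9 probes (the "9 instead of
  // 11 @ 16 b/k" case noted above); 30 bits/key lands in the formula
  // branch: (30000 - 1) / 2000 - 1 == 13 by integer division.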

  static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes,
                             int num_probes, char *data) {
    uint32_t bytes_to_cache_line = fastrange32(len_bytes >> 6, h1) << 6;
    AddHashPrepared(h2, num_probes, data + bytes_to_cache_line);
  }

  static inline void AddHashPrepared(uint32_t h2, int num_probes,
                                     char *data_at_cache_line) {
    uint32_t h = h2;
    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
      // 9-bit address within 512 bit cache line
      int bitpos = h >> (32 - 9);
      data_at_cache_line[bitpos >> 3] |= (uint8_t{1} << (bitpos & 7));
    }
  }
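
  // Example usage (a minimal sketch; GetSliceHash64, Lower32of64, and
  // Upper32of64 are assumed to come from util/hash.h, and any good 64-bit
  // hash split into two uint32s would do):
  //
  //   char filter[1024] = {};  // filter size a multiple of 64 bytes
  //   uint64_t h = GetSliceHash64(key);
  //   FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h),
  //                               /*len_bytes*/ 1024, /*num_probes*/ 6,
  //                               filter);
  //   bool may_match = FastLocalBloomImpl::HashMayMatch(
  //       Lower32of64(h), Upper32of64(h), 1024, 6, filter);
  //   assert(may_match);  // an added hash always matches; FPs are possible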

  static inline void PrepareHash(uint32_t h1, uint32_t len_bytes,
                                 const char *data,
                                 uint32_t /*out*/ *byte_offset) {
    uint32_t bytes_to_cache_line = fastrange32(len_bytes >> 6, h1) << 6;
    PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */);
    PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */);
    *byte_offset = bytes_to_cache_line;
  }

  static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes,
                                  int num_probes, const char *data) {
    uint32_t bytes_to_cache_line = fastrange32(len_bytes >> 6, h1) << 6;
    return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line);
  }

  static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes,
                                          const char *data_at_cache_line) {
    uint32_t h = h2;
#ifdef HAVE_AVX2
    int rem_probes = num_probes;

    // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18,
    // etc.} one can insert specialized code for rem_probes <= 2, bypassing
    // the SIMD code in those cases. There is a detectable but minor overhead
    // applied to other values of num_probes (when not statically determined),
    // but smoother performance curve vs. num_probes. But for now, when
    // in doubt, don't add unnecessary code.

    // Powers of 32-bit golden ratio, mod 2**32.
    const __m256i multipliers =
        _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9,
                          0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749);

    for (;;) {
      // Eight copies of hash
      __m256i hash_vector = _mm256_set1_epi32(h);

      // Same effect as repeated multiplication by 0x9e3779b9 thanks to
      // associativity of multiplication.
      hash_vector = _mm256_mullo_epi32(hash_vector, multipliers);

      // Now the top 9 bits of each of the eight 32-bit values in
      // hash_vector are bit addresses for probes within the cache line.
      // While the platform-independent code uses byte addressing (6 bits
      // to pick a byte + 3 bits to pick a bit within a byte), here we work
      // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit
      // within a word) because that works well with AVX2 and is equivalent
      // under little-endian.
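
      // Worked example of that equivalence (illustrative): a hash whose top
      // 9 bits are 0b010001001 denotes bit address 137 in the cache line.
      // Byte addressing reads byte 137 >> 3 == 17, bit 137 & 7 == 1; word
      // addressing reads 32-bit word 137 >> 5 == 4, bit 137 & 31 == 9.
      // Under little-endian, bit 9 of word 4 (which spans bytes 16..19) is
      // exactly bit 1 of byte 17, so both schemes test the same bit.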

      // Shift each right by 28 bits to get 4-bit word addresses.
      const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28);

      // Gather 32-bit values spread over 512 bits by 4-bit address. In
      // essence, we are dereferencing eight pointers within the cache
      // line.
      //
      // Option 1: AVX2 gather (seems to be a little slow - understandable)
      // const __m256i value_vector =
      //     _mm256_i32gather_epi32(static_cast<const int *>(data_at_cache_line),
      //                            word_addresses,
      //                            /*bytes / i32*/ 4);
      // END Option 1
      // Potentially unaligned as we're not *always* cache-aligned -> loadu
      const __m256i *mm_data =
          reinterpret_cast<const __m256i *>(data_at_cache_line);
      __m256i lower = _mm256_loadu_si256(mm_data);
      __m256i upper = _mm256_loadu_si256(mm_data + 1);
      // Option 2: AVX512VL permute hack
      // Only negligibly faster than Option 3, so not yet worth supporting
      // const __m256i value_vector =
      //     _mm256_permutex2var_epi32(lower, word_addresses, upper);
      // END Option 2
      // Option 3: AVX2 permute+blend hack
      // Use lowest three bits to order probing values, as if all from same
      // 256 bit piece.
      lower = _mm256_permutevar8x32_epi32(lower, word_addresses);
      upper = _mm256_permutevar8x32_epi32(upper, word_addresses);
      // Just top 1 bit of address, to select between lower and upper.
      const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31);
      // Finally: the next 8 probed 32-bit values, in probing sequence order.
      const __m256i value_vector =
          _mm256_blendv_epi8(lower, upper, upper_lower_selector);
      // END Option 3

      // We might not need to probe all 8, so build a mask for selecting only
      // what we need. (The k_selector(s) could be pre-computed but that
      // doesn't seem to make a noticeable performance difference.)
      const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      // Subtract rem_probes from each of those constants
      __m256i k_selector =
          _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes));
      // Negative after subtract -> use/select
      // Keep only high bit (logical shift right each by 31).
      k_selector = _mm256_srli_epi32(k_selector, 31);

      // Strip off the 4 bit word address (shift left)
      __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4);
      // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses.
      bit_addresses = _mm256_srli_epi32(bit_addresses, 27);
      // Build a bit mask
      const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses);

      // Like ((~value_vector) & bit_mask) == 0
      bool match = _mm256_testc_si256(value_vector, bit_mask) != 0;

      // This check first so that it's easy for branch predictor to optimize
      // num_probes <= 8 case, making it free of unpredictable branches.
      if (rem_probes <= 8) {
        return match;
      } else if (!match) {
        return false;
      }
      // otherwise
      // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power
      h *= 0xab25f4c1;
      rem_probes -= 8;
    }
#else
    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
      // 9-bit address within 512 bit cache line
      int bitpos = h >> (32 - 9);
      if ((data_at_cache_line[bitpos >> 3] & (char(1) << (bitpos & 7))) == 0) {
        return false;
      }
    }
    return true;
#endif
  }
};

// A legacy Bloom filter implementation with no locality of probes (slow).
// It uses double hashing to generate a sequence of hash values.
// Asymptotic analysis is in [Kirsch,Mitzenmacher 2006], but known to have
// subtle accuracy flaws for practical sizes [Dillinger,Manolios 2004].
//
// DO NOT REUSE
//
class LegacyNoLocalityBloomImpl {
 public:
  static inline int ChooseNumProbes(int bits_per_key) {
    // We intentionally round down to reduce probing cost a little bit
    int num_probes = static_cast<int>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
    if (num_probes < 1) num_probes = 1;
    if (num_probes > 30) num_probes = 30;
    return num_probes;
  }
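
  // For example (illustrative): bits_per_key = 10 gives
  // static_cast<int>(10 * 0.69) == 6 probes, and bits_per_key = 16 gives
  // 11 probes (the standard-Bloom "11 @ 16 b/k" that
  // FastLocalBloomImpl::ChooseNumProbes improves to 9).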

  static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes,
                             char *data) {
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (int i = 0; i < num_probes; i++) {
      const uint32_t bitpos = h % total_bits;
      data[bitpos / 8] |= (1 << (bitpos % 8));
      h += delta;
    }
  }

  static inline bool HashMayMatch(uint32_t h, uint32_t total_bits,
                                  int num_probes, const char *data) {
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (int i = 0; i < num_probes; i++) {
      const uint32_t bitpos = h % total_bits;
      if ((data[bitpos / 8] & (1 << (bitpos % 8))) == 0) {
        return false;
      }
      h += delta;
    }
    return true;
  }
};

// A legacy Bloom filter implementation with probes local to a single
// cache line (fast). Because SST files might be transported between
// platforms, the cache line size is a parameter rather than hard coded.
// (But if specified as a constant parameter, an optimizing compiler
// should take advantage of that.)
//
// When ExtraRotates is false, this implementation is notably deficient in
// accuracy. Specifically, it uses double hashing with a 1/512 chance of the
// increment being zero modulo the cache line size (with 512-bit cache
// lines). Thus, there's a 1/512 chance of probing only one index, which
// we'd expect to incur about a 1/2 * 1/512 or absolute 0.1% FP rate
// penalty. More detail at
// https://github.com/facebook/rocksdb/issues/4120
//
// DO NOT REUSE
//
template <bool ExtraRotates>
class LegacyLocalityBloomImpl {
 private:
  static inline uint32_t GetLine(uint32_t h, uint32_t num_lines) {
    uint32_t offset_h = ExtraRotates ? (h >> 11) | (h << 21) : h;
    return offset_h % num_lines;
  }

 public:
  // NOTE: this has only been validated to enough accuracy for producing
  // reasonable warnings / user feedback, not for making functional decisions.
  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) {
    double bits_per_key = 8.0 * bytes / keys;
    double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes,
                                                     /*cache line bits*/ 512);
    if (!ExtraRotates) {
      // Good estimate of impact of flaw in index computation.
      // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key.
      // The + 22 shifts it nicely to fit for lower bits/key.
      filter_rate += 0.1 / (bits_per_key * 0.75 + 22);
    } else {
      // Not yet validated
      assert(false);
    }
    // Always uses 32-bit hash
    double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32);
    return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate);
  }

  static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes,
                             char *data, int log2_cache_line_bytes) {
    const int log2_cache_line_bits = log2_cache_line_bytes + 3;

    char *data_at_offset =
        data + (GetLine(h, num_lines) << log2_cache_line_bytes);
    const uint32_t delta = (h >> 17) | (h << 15);
    for (int i = 0; i < num_probes; ++i) {
      // Mask to bit-within-cache-line address
      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
      data_at_offset[bitpos / 8] |= (1 << (bitpos % 8));
      if (ExtraRotates) {
        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
      }
      h += delta;
    }
  }

  static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines,
                                         const char *data,
                                         uint32_t /*out*/ *byte_offset,
                                         int log2_cache_line_bytes) {
    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
    PREFETCH(data + b, 0 /* rw */, 1 /* locality */);
    PREFETCH(data + b + ((1 << log2_cache_line_bytes) - 1), 0 /* rw */,
             1 /* locality */);
    *byte_offset = b;
  }

  static inline bool HashMayMatch(uint32_t h, uint32_t num_lines,
                                  int num_probes, const char *data,
                                  int log2_cache_line_bytes) {
    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
    return HashMayMatchPrepared(h, num_probes, data + b, log2_cache_line_bytes);
  }

  static inline bool HashMayMatchPrepared(uint32_t h, int num_probes,
                                          const char *data_at_offset,
                                          int log2_cache_line_bytes) {
    const int log2_cache_line_bits = log2_cache_line_bytes + 3;

    const uint32_t delta = (h >> 17) | (h << 15);
    for (int i = 0; i < num_probes; ++i) {
      // Mask to bit-within-cache-line address
      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
      if (((data_at_offset[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
        return false;
      }
      if (ExtraRotates) {
        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
      }
      h += delta;
    }
    return true;
  }
};

}  // namespace rocksdb