//  Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Implementation details of various Bloom filter implementations used in
// RocksDB. (DynamicBloom is in a separate file for now because it
// supports concurrent writes.)

#pragma once
#include <stddef.h>
#include <stdint.h>
#include <cassert>
#include <cmath>

#include "rocksdb/slice.h"
#include "util/hash.h"

#ifdef HAVE_AVX2
#include <immintrin.h>
#endif

namespace rocksdb {

class BloomMath {
 public:
  // False positive rate of a standard Bloom filter, for given ratio of
  // filter memory bits to added keys, and number of probes per operation.
  // (The false positive rate is effectively independent of scale, assuming
  // the implementation scales OK.)
  static double StandardFpRate(double bits_per_key, int num_probes) {
    // Standard very-good-estimate formula. See
    // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
    return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
  }
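  // Example (illustrative): StandardFpRate(10.0, 6)
  //   = (1 - e^(-6/10))^6 =~ 0.0084, i.e. about a 0.84% FP rate.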

  // False positive rate of a "blocked"/"sharded"/"cache-local" Bloom filter,
  // for given ratio of filter memory bits to added keys, number of probes per
  // operation (all within the given block or cache line size), and block or
  // cache line size.
  static double CacheLocalFpRate(double bits_per_key, int num_probes,
                                 int cache_line_bits) {
    double keys_per_cache_line = cache_line_bits / bits_per_key;
    // A reasonable estimate is the average of the FP rates for one standard
    // deviation above and below the mean bucket occupancy. See
    // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math
    double keys_stddev = std::sqrt(keys_per_cache_line);
    double crowded_fp = StandardFpRate(
        cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes);
    double uncrowded_fp = StandardFpRate(
        cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes);
    return (crowded_fp + uncrowded_fp) / 2;
  }
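  // Example (illustrative): CacheLocalFpRate(10.0, 6, 512) =~ 0.0095
  // (about 0.95% FP rate), vs. about 0.84% for the unblocked
  // StandardFpRate above; cache locality costs a little accuracy.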

  // False positive rate of querying a new item against `num_keys` items, all
  // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes
  // themselves are stored losslessly. See Section 4 of
  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
  static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) {
    double inv_fingerprint_space = std::pow(0.5, fingerprint_bits);
    // Base estimate assumes each key maps to a unique fingerprint.
    // Could be > 1 in extreme cases.
    double base_estimate = num_keys * inv_fingerprint_space;
    // To account for potential overlap, we choose between two formulas.
    if (base_estimate > 0.0001) {
      // A very good formula assuming we don't construct a floating point
      // number extremely close to 1. Always produces a probability < 1.
      return 1.0 - std::exp(-base_estimate);
    } else {
      // A very good formula when base_estimate is far below 1. (Subtract
      // away the integral-approximated sum that some key has the same hash
      // as one coming before it in a list.)
      return base_estimate - (base_estimate * base_estimate * 0.5);
    }
  }
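  // Example (illustrative): FingerprintFpRate(1000000, 32) has
  // base_estimate = 1000000 / 2^32 =~ 0.000233 > 0.0001, so it returns
  // 1 - e^(-0.000233) =~ 0.000233 (about 0.023%).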

  // Returns the probability of either of two independent(-ish) events
  // happening, given their probabilities. (This is useful for combining
  // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate
  // for a hash-efficient Bloom filter's FP rate. See Section 4 of
  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
  static double IndependentProbabilitySum(double rate1, double rate2) {
    // Use formula that avoids floating point extremely close to 1 if
    // rates are extremely small.
    return rate1 + rate2 - (rate1 * rate2);
  }
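  // Example (illustrative): IndependentProbabilitySum(0.0095, 0.0002)
  //   = 0.0095 + 0.0002 - 0.0095 * 0.0002 =~ 0.0097.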
};

// A fast, flexible, and accurate cache-local Bloom implementation with
// SIMD-optimized query performance (currently using AVX2 on Intel). Write
// performance and non-SIMD read are very good, benefiting from fastrange32
// used in place of % and single-cycle multiplication on recent processors.
//
// Most other SIMD Bloom implementations sacrifice flexibility and/or
// accuracy by requiring num_probes to be a power of two and restricting
// where each probe can occur in a cache line. This implementation sacrifices
// SIMD-optimization for add (might still be possible, especially with AVX512)
// in favor of allowing any num_probes, not crossing cache line boundary,
// and accuracy close to theoretical best accuracy for a cache-local Bloom.
// E.g. theoretical best for 10 bits/key, num_probes=6, and 512-bit bucket
// (Intel cache line size) is 0.9535% FP rate. This implementation yields
// about 0.957%. (Compare to LegacyLocalityBloomImpl<false> at 1.138%, or
// about 0.951% for 1024-bit buckets, cache line size for some ARM CPUs.)
//
// This implementation can use a 32-bit hash (let h2 be h1 * 0x9e3779b9) or
// a 64-bit hash (split into two uint32s). With many millions of keys, the
// false positive rate associated with using a 32-bit hash can dominate the
// false positive rate of the underlying filter. At 10 bits/key setting, the
// inflection point is about 40 million keys, so 32-bit hash is a bad idea
// with 10s of millions of keys or more.
//
// Despite accepting a 64-bit hash, this implementation uses 32-bit fastrange
// to pick a cache line, which can be faster than 64-bit in some cases.
// This only hurts accuracy as you get into 10s of GB for a single filter,
// and accuracy abruptly breaks down at 256GB (2^32 cache lines). Switch to
// 64-bit fastrange if you need filters so big. ;)
//
// Using only a 32-bit input hash within each cache line has negligible
// impact for any reasonable cache line / bucket size, for arbitrary filter
// size, and potentially saves intermediate data size in some cases vs.
// tracking full 64 bits. (Even in an implementation using 64-bit arithmetic
// to generate indices, I might do the same, as a single multiplication
// suffices to generate a sufficiently mixed 64 bits from 32 bits.)
//
// This implementation is currently tied to Intel cache line size, 64 bytes ==
// 512 bits. If there's sufficient demand for other cache line sizes, this is
// a pretty good implementation to extend, but slight performance enhancements
// are possible with an alternate implementation (probably not very compatible
// with SIMD):
// (1) Use rotation in addition to multiplication for remixing
// (like murmur hash). (Using multiplication alone *slightly* hurts accuracy
// because lower bits never depend on original upper bits.)
// (2) Extract more than one bit index from each re-mix. (Only if rotation
// or similar is part of remix, because otherwise you're making the
// multiplication-only problem worse.)
// (3) Re-mix full 64 bit hash, to get maximum number of bit indices per
// re-mix.
//
class FastLocalBloomImpl {
 public:
  // NOTE: this has only been validated to enough accuracy for producing
  // reasonable warnings / user feedback, not for making functional decisions.
  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes,
                                int hash_bits) {
    return BloomMath::IndependentProbabilitySum(
        BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes,
                                    /*cache line bits*/ 512),
        BloomMath::FingerprintFpRate(keys, hash_bits));
  }
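  // Example (illustrative): EstimatedFpRate(1000000, 1250000, 6, 64),
  // i.e. 10 bits/key with a 64-bit hash, is dominated by the cache-local
  // term at about 0.95%; the 64-bit fingerprint term is negligible here.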

  static inline int ChooseNumProbes(int millibits_per_key) {
    // Since this implementation can (with AVX2) make up to 8 probes
    // for the same cost, we pick the most accurate num_probes, based
    // on actual tests of the implementation. Note that for higher
    // bits/key, the best choice for cache-local Bloom can be notably
    // smaller than standard Bloom, e.g. 9 instead of 11 @ 16 b/k.
    if (millibits_per_key <= 2080) {
      return 1;
    } else if (millibits_per_key <= 3580) {
      return 2;
    } else if (millibits_per_key <= 5100) {
      return 3;
    } else if (millibits_per_key <= 6640) {
      return 4;
    } else if (millibits_per_key <= 8300) {
      return 5;
    } else if (millibits_per_key <= 10070) {
      return 6;
    } else if (millibits_per_key <= 11720) {
      return 7;
    } else if (millibits_per_key <= 14001) {
      // Would be something like <= 13800 but sacrificing *slightly* for
      // more settings using <= 8 probes.
      return 8;
    } else if (millibits_per_key <= 16050) {
      return 9;
    } else if (millibits_per_key <= 18300) {
      return 10;
    } else if (millibits_per_key <= 22001) {
      return 11;
    } else if (millibits_per_key <= 25501) {
      return 12;
    } else if (millibits_per_key > 50000) {
      // Top out at 24 probes (three sets of 8)
      return 24;
    } else {
      // Roughly optimal choices for remaining range
      // e.g.
      // 28000 -> 12, 28001 -> 13
      // 50000 -> 23, 50001 -> 24
      return (millibits_per_key - 1) / 2000 - 1;
    }
  }

  static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes,
                             int num_probes, char *data) {
    uint32_t bytes_to_cache_line = fastrange32(len_bytes >> 6, h1) << 6;
    AddHashPrepared(h2, num_probes, data + bytes_to_cache_line);
  }

  static inline void AddHashPrepared(uint32_t h2, int num_probes,
                                     char *data_at_cache_line) {
    uint32_t h = h2;
    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
      // 9-bit address within 512 bit cache line
      int bitpos = h >> (32 - 9);
      data_at_cache_line[bitpos >> 3] |= (uint8_t{1} << (bitpos & 7));
    }
  }
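  // Example (illustrative): h2 = 0x80000000 probes bitpos = 256 first,
  // i.e. bit 0 (bitpos & 7) of byte 32 (bitpos >> 3) in the cache line.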

  static inline void PrepareHash(uint32_t h1, uint32_t len_bytes,
                                 const char *data,
                                 uint32_t /*out*/ *byte_offset) {
    uint32_t bytes_to_cache_line = fastrange32(len_bytes >> 6, h1) << 6;
    PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */);
    PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */);
    *byte_offset = bytes_to_cache_line;
  }
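  // Typical prepared-query pattern (illustrative): call PrepareHash as soon
  // as h1 is available so the cache line is prefetched while other work
  // proceeds, then call HashMayMatchPrepared(h2, num_probes,
  // data + byte_offset).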

  static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes,
                                  int num_probes, const char *data) {
    uint32_t bytes_to_cache_line = fastrange32(len_bytes >> 6, h1) << 6;
    return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line);
  }

  static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes,
                                          const char *data_at_cache_line) {
    uint32_t h = h2;
#ifdef HAVE_AVX2
    int rem_probes = num_probes;

    // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18,
    // etc.} one can insert specialized code for rem_probes <= 2, bypassing
    // the SIMD code in those cases. There is a detectable but minor overhead
    // applied to other values of num_probes (when not statically determined),
    // but smoother performance curve vs. num_probes. But for now, when
    // in doubt, don't add unnecessary code.

    // Powers of 32-bit golden ratio, mod 2**32.
    const __m256i multipliers =
        _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9,
                          0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749);

    for (;;) {
      // Eight copies of hash
      __m256i hash_vector = _mm256_set1_epi32(h);

      // Same effect as repeated multiplication by 0x9e3779b9 thanks to
      // associativity of multiplication.
      hash_vector = _mm256_mullo_epi32(hash_vector, multipliers);

      // Now the top 9 bits of each of the eight 32-bit values in
      // hash_vector are bit addresses for probes within the cache line.
      // While the platform-independent code uses byte addressing (6 bits
      // to pick a byte + 3 bits to pick a bit within a byte), here we work
      // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit
      // within a word) because that works well with AVX2 and is equivalent
      // under little-endian.

      // Shift each right by 28 bits to get 4-bit word addresses.
      const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28);

      // Gather 32-bit values spread over 512 bits by 4-bit address. In
      // essence, we are dereferencing eight pointers within the cache
      // line.
      //
      // Option 1: AVX2 gather (seems to be a little slow - understandable)
      // const __m256i value_vector =
      //     _mm256_i32gather_epi32(
      //         static_cast<const int *>(data_at_cache_line),
      //         word_addresses, /*bytes / i32*/ 4);
      // END Option 1
      // Potentially unaligned as we're not *always* cache-aligned -> loadu
      const __m256i *mm_data =
          reinterpret_cast<const __m256i *>(data_at_cache_line);
      __m256i lower = _mm256_loadu_si256(mm_data);
      __m256i upper = _mm256_loadu_si256(mm_data + 1);
      // Option 2: AVX512VL permute hack
      // Only negligibly faster than Option 3, so not yet worth supporting
      // const __m256i value_vector =
      //    _mm256_permutex2var_epi32(lower, word_addresses, upper);
      // END Option 2
      // Option 3: AVX2 permute+blend hack
      // Use lowest three bits to order probing values, as if all from same
      // 256 bit piece.
      lower = _mm256_permutevar8x32_epi32(lower, word_addresses);
      upper = _mm256_permutevar8x32_epi32(upper, word_addresses);
      // Just top 1 bit of address, to select between lower and upper.
      const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31);
      // Finally: the next 8 probed 32-bit values, in probing sequence order.
      const __m256i value_vector =
          _mm256_blendv_epi8(lower, upper, upper_lower_selector);
      // END Option 3

      // We might not need to probe all 8, so build a mask for selecting only
      // what we need. (The k_selector(s) could be pre-computed but that
      // doesn't seem to make a noticeable performance difference.)
      const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      // Subtract rem_probes from each of those constants
      __m256i k_selector =
          _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes));
      // Negative after subtract -> use/select
      // Keep only high bit (logical shift right each by 31).
      k_selector = _mm256_srli_epi32(k_selector, 31);

      // Strip off the 4 bit word address (shift left)
      __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4);
      // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses.
      bit_addresses = _mm256_srli_epi32(bit_addresses, 27);
      // Build a bit mask
      const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses);

      // Like ((~value_vector) & bit_mask) == 0
      bool match = _mm256_testc_si256(value_vector, bit_mask) != 0;

      // This check first so that it's easy for branch predictor to optimize
      // num_probes <= 8 case, making it free of unpredictable branches.
      if (rem_probes <= 8) {
        return match;
      } else if (!match) {
        return false;
      }
      // otherwise
      // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power
      h *= 0xab25f4c1;
      rem_probes -= 8;
    }
#else
    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
      // 9-bit address within 512 bit cache line
      int bitpos = h >> (32 - 9);
      if ((data_at_cache_line[bitpos >> 3] & (char(1) << (bitpos & 7))) == 0) {
        return false;
      }
    }
    return true;
#endif
  }
};
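
// Usage sketch (illustrative): given a filter of len_bytes (a multiple of
// 64) and a 64-bit hash h of a key, split the hash into two uint32s as
// described above, then add and query with matching parameters:
//
//   uint32_t h1 = static_cast<uint32_t>(h);
//   uint32_t h2 = static_cast<uint32_t>(h >> 32);
//   FastLocalBloomImpl::AddHash(h1, h2, len_bytes, /*num_probes=*/6, data);
//   ...
//   bool may_match =
//       FastLocalBloomImpl::HashMayMatch(h1, h2, len_bytes, 6, data);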

// A legacy Bloom filter implementation with no locality of probes (slow).
// It uses double hashing to generate a sequence of hash values.
// Asymptotic analysis is in [Kirsch,Mitzenmacher 2006], but known to have
// subtle accuracy flaws for practical sizes [Dillinger,Manolios 2004].
//
// DO NOT REUSE
//
class LegacyNoLocalityBloomImpl {
 public:
  static inline int ChooseNumProbes(int bits_per_key) {
    // We intentionally round down to reduce probing cost a little bit
    int num_probes = static_cast<int>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
    if (num_probes < 1) num_probes = 1;
    if (num_probes > 30) num_probes = 30;
    return num_probes;
  }
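  // Example (illustrative): bits_per_key = 10 gives 10 * 0.69 = 6.9,
  // truncated to num_probes = 6.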

  static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes,
                             char *data) {
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (int i = 0; i < num_probes; i++) {
      const uint32_t bitpos = h % total_bits;
      data[bitpos / 8] |= (1 << (bitpos % 8));
      h += delta;
    }
  }

  static inline bool HashMayMatch(uint32_t h, uint32_t total_bits,
                                  int num_probes, const char *data) {
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (int i = 0; i < num_probes; i++) {
      const uint32_t bitpos = h % total_bits;
      if ((data[bitpos / 8] & (1 << (bitpos % 8))) == 0) {
        return false;
      }
      h += delta;
    }
    return true;
  }
};

// A legacy Bloom filter implementation with probes local to a single
// cache line (fast). Because SST files might be transported between
// platforms, the cache line size is a parameter rather than hard coded.
// (But if specified as a constant parameter, an optimizing compiler
// should take advantage of that.)
//
// When ExtraRotates is false, this implementation is notably deficient in
// accuracy. Specifically, it uses double hashing with a 1/512 chance of the
// increment being zero (when cache line size is 512 bits). Thus, there's a
// 1/512 chance of probing only one index, which we'd expect to incur about
// a 1/2 * 1/512 or absolute 0.1% FP rate penalty. More detail at
// https://github.com/facebook/rocksdb/issues/4120
//
// DO NOT REUSE
//
template <bool ExtraRotates>
class LegacyLocalityBloomImpl {
 private:
  static inline uint32_t GetLine(uint32_t h, uint32_t num_lines) {
    uint32_t offset_h = ExtraRotates ? (h >> 11) | (h << 21) : h;
    return offset_h % num_lines;
  }

 public:
  // NOTE: this has only been validated to enough accuracy for producing
  // reasonable warnings / user feedback, not for making functional decisions.
  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) {
    double bits_per_key = 8.0 * bytes / keys;
    double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes,
                                                     /*cache line bits*/ 512);
    if (!ExtraRotates) {
      // Good estimate of impact of flaw in index computation.
      // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key.
      // The + 22 shifts it nicely to fit for lower bits/key.
      filter_rate += 0.1 / (bits_per_key * 0.75 + 22);
    } else {
      // Not yet validated
      assert(false);
    }
    // Always uses 32-bit hash
    double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32);
    return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate);
  }

  static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes,
                             char *data, int log2_cache_line_bytes) {
    const int log2_cache_line_bits = log2_cache_line_bytes + 3;

    char *data_at_offset =
        data + (GetLine(h, num_lines) << log2_cache_line_bytes);
    const uint32_t delta = (h >> 17) | (h << 15);
    for (int i = 0; i < num_probes; ++i) {
      // Mask to bit-within-cache-line address
      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
      data_at_offset[bitpos / 8] |= (1 << (bitpos % 8));
      if (ExtraRotates) {
        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
      }
      h += delta;
    }
  }

  static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines,
                                         const char *data,
                                         uint32_t /*out*/ *byte_offset,
                                         int log2_cache_line_bytes) {
    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
    PREFETCH(data + b, 0 /* rw */, 1 /* locality */);
    PREFETCH(data + b + ((1 << log2_cache_line_bytes) - 1), 0 /* rw */,
             1 /* locality */);
    *byte_offset = b;
  }

  static inline bool HashMayMatch(uint32_t h, uint32_t num_lines,
                                  int num_probes, const char *data,
                                  int log2_cache_line_bytes) {
    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
    return HashMayMatchPrepared(h, num_probes, data + b,
                                log2_cache_line_bytes);
  }

  static inline bool HashMayMatchPrepared(uint32_t h, int num_probes,
                                          const char *data_at_offset,
                                          int log2_cache_line_bytes) {
    const int log2_cache_line_bits = log2_cache_line_bytes + 3;

    const uint32_t delta = (h >> 17) | (h << 15);
    for (int i = 0; i < num_probes; ++i) {
      // Mask to bit-within-cache-line address
      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
      if (((data_at_offset[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
        return false;
      }
      if (ExtraRotates) {
        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
      }
      h += delta;
    }
    return true;
  }
};

}  // namespace rocksdb