//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "rocksdb/filter_policy.h"

#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
#include <cstring>
#include <deque>
#include <limits>
#include <memory>
#include <vector>

#include "logging/logging.h"
#include "rocksdb/slice.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/filter_policy_internal.h"
#include "table/block_based/full_filter_block.h"
#include "third-party/folly/folly/ConstexprMath.h"
#include "util/bloom_impl.h"
#include "util/coding.h"
#include "util/hash.h"
#include "util/ribbon_config.h"
#include "util/ribbon_impl.h"

namespace ROCKSDB_NAMESPACE {

namespace {

// Metadata trailer size for built-in filters. (This is separate from
// block-based table block trailer.)
//
// Originally this was 1 byte for num_probes and 4 bytes for number of
// cache lines in the Bloom filter, but now the first trailer byte is
// usually an implementation marker and remaining 4 bytes have various
// meanings.
static constexpr uint32_t kMetadataLen = 5;

Slice FinishAlwaysFalse(std::unique_ptr<const char[]>* /*buf*/) {
  // Missing metadata, treated as zero entries
  return Slice(nullptr, 0);
}

// Base class for filter builders using the XXH3 preview hash,
// also known as Hash64 or GetSliceHash64.
class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
 public:
  explicit XXPH3FilterBitsBuilder(
      std::atomic<int64_t>* aggregate_rounding_balance)
      : aggregate_rounding_balance_(aggregate_rounding_balance) {}

  ~XXPH3FilterBitsBuilder() override {}

  virtual void AddKey(const Slice& key) override {
    uint64_t hash = GetSliceHash64(key);
    // Especially with prefixes, it is common to have repetition,
    // though only adjacent repetition, which we want to immediately
    // recognize and collapse for estimating true filter space
    // requirements.
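    // For example (illustrative): in prefix-only filtering, consecutive
    // keys sharing the prefix "app." would add the same prefix hash
    // back-to-back, and it is stored only once here.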
    if (hash_entries_.empty() || hash != hash_entries_.back()) {
      hash_entries_.push_back(hash);
    }
  }

  virtual size_t EstimateEntriesAdded() override {
    return hash_entries_.size();
  }

 protected:
  static constexpr uint32_t kMetadataLen = 5;

  // For delegating between XXPH3FilterBitsBuilders
  void SwapEntriesWith(XXPH3FilterBitsBuilder* other) {
    std::swap(hash_entries_, other->hash_entries_);
  }

  virtual size_t RoundDownUsableSpace(size_t available_size) = 0;

  // To choose size using malloc_usable_size, we have to actually allocate.
  size_t AllocateMaybeRounding(size_t target_len_with_metadata,
                               size_t num_entries,
                               std::unique_ptr<char[]>* buf) {
    // Return value set to a default; overwritten in some cases
    size_t rv = target_len_with_metadata;
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    if (aggregate_rounding_balance_ != nullptr) {
      // Do optimize_filters_for_memory, using malloc_usable_size.
      // Approach: try to keep FP rate balance better than or on
      // target (negative aggregate_rounding_balance_). We can then select a
      // lower bound filter size (within reasonable limits) that gets us as
      // close to on target as possible. We request allocation for that filter
      // size and use malloc_usable_size to "round up" to the actual
      // allocation size.

      // Although it can be considered bad practice to use malloc_usable_size
      // to access an object beyond its original size, this approach should be
      // quite general: working for all allocators that properly support
      // malloc_usable_size.

      // Race condition on balance is OK because it can only cause temporary
      // skew in rounding up vs. rounding down, as long as updates are atomic
      // and relative.
      int64_t balance = aggregate_rounding_balance_->load();

      double target_fp_rate =
          EstimatedFpRate(num_entries, target_len_with_metadata);
      double rv_fp_rate = target_fp_rate;

      if (balance < 0) {
        // See formula for BloomFilterPolicy::aggregate_rounding_balance_
        double for_balance_fp_rate =
            -balance / double{0x100000000} + target_fp_rate;
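        // For example (illustrative): balance == -0.001 * 2^32 means prior
        // filters have over-performed the target by 0.001 in aggregate, so
        // a smaller size with FP rate up to 0.001 above target is acceptable.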

        // To simplify, we just try a few modified smaller sizes. This also
        // caps how much we vary filter size vs. target, to avoid outlier
        // behavior from excessive variance.
        size_t target_len = target_len_with_metadata - kMetadataLen;
        assert(target_len < target_len_with_metadata);  // check underflow
        for (uint64_t maybe_len_rough :
             {uint64_t{3} * target_len / 4, uint64_t{13} * target_len / 16,
              uint64_t{7} * target_len / 8, uint64_t{15} * target_len / 16}) {
          size_t maybe_len_with_metadata =
              RoundDownUsableSpace(maybe_len_rough + kMetadataLen);
          double maybe_fp_rate =
              EstimatedFpRate(num_entries, maybe_len_with_metadata);
          if (maybe_fp_rate <= for_balance_fp_rate) {
            rv = maybe_len_with_metadata;
            rv_fp_rate = maybe_fp_rate;
            break;
          }
        }
      }

      // Filter blocks are loaded into block cache with their block trailer.
      // We need to make sure that's accounted for in choosing a
      // fragmentation-friendly size.
      const size_t kExtraPadding = kBlockTrailerSize;
      size_t requested = rv + kExtraPadding;

      // Allocate and get usable size
      buf->reset(new char[requested]);
      size_t usable = malloc_usable_size(buf->get());
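      // Allocator-dependent (illustrative): with jemalloc size classes, a
      // request of ~5000 bytes may return a 5120-byte usable allocation;
      // the extra bytes can then be put to use by the filter.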

      if (usable - usable / 4 > requested) {
        // Ratio greater than 4/3 is too much for utilizing, if it's
        // not a buggy or mislinked malloc_usable_size implementation.
        // Non-linearity of FP rates with bits/key means rapidly
        // diminishing returns in overall accuracy for additional
        // storage on disk.
        // Nothing to do, except assert that the result is accurate about
        // the usable size. (Assignment never used.)
        assert(((*buf)[usable - 1] = 'x'));
      } else if (usable > requested) {
        rv = RoundDownUsableSpace(usable - kExtraPadding);
        assert(rv <= usable - kExtraPadding);
        rv_fp_rate = EstimatedFpRate(num_entries, rv);
      } else {
        // Too small means bad malloc_usable_size
        assert(usable == requested);
      }
      memset(buf->get(), 0, rv);

      // Update balance
      int64_t diff = static_cast<int64_t>((rv_fp_rate - target_fp_rate) *
                                          double{0x100000000});
      *aggregate_rounding_balance_ += diff;
    } else {
      buf->reset(new char[rv]());
    }
#else
    (void)num_entries;
    buf->reset(new char[rv]());
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return rv;
  }

  // A deque avoids unnecessary copying of already-saved values
  // and has near-minimal peak memory use.
  std::deque<uint64_t> hash_entries_;

  // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr,
  // always "round up" like historic behavior.
  std::atomic<int64_t>* aggregate_rounding_balance_;
};

// #################### FastLocalBloom implementation ################## //
// ############## also known as format_version=5 Bloom filter ########## //

// See description in FastLocalBloomImpl
class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder {
 public:
  // Non-null aggregate_rounding_balance implies optimize_filters_for_memory
  explicit FastLocalBloomBitsBuilder(
      const int millibits_per_key,
      std::atomic<int64_t>* aggregate_rounding_balance)
      : XXPH3FilterBitsBuilder(aggregate_rounding_balance),
        millibits_per_key_(millibits_per_key) {
    assert(millibits_per_key >= 1000);
  }

  // No Copy allowed
  FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete;
  void operator=(const FastLocalBloomBitsBuilder&) = delete;

  ~FastLocalBloomBitsBuilder() override {}

  virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
    size_t num_entries = hash_entries_.size();
    size_t len_with_metadata = CalculateSpace(num_entries);

    std::unique_ptr<char[]> mutable_buf;
    len_with_metadata =
        AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf);

    assert(mutable_buf);
    assert(len_with_metadata >= kMetadataLen);

    // Max size supported by implementation
    assert(len_with_metadata <= 0xffffffffU);

    // Compute num_probes after any rounding / adjustments
    int num_probes = GetNumProbes(num_entries, len_with_metadata);

    uint32_t len = static_cast<uint32_t>(len_with_metadata - kMetadataLen);
    if (len > 0) {
      AddAllEntries(mutable_buf.get(), len, num_probes);
    }

    assert(hash_entries_.empty());

    // See BloomFilterPolicy::GetBloomBitsReader re: metadata
    // -1 = Marker for newer Bloom implementations
    mutable_buf[len] = static_cast<char>(-1);
    // 0 = Marker for this sub-implementation
    mutable_buf[len + 1] = static_cast<char>(0);
    // num_probes (and 0 in upper bits for 64-byte block size)
    mutable_buf[len + 2] = static_cast<char>(num_probes);
    // rest of metadata stays zero

    Slice rv(mutable_buf.get(), len_with_metadata);
    *buf = std::move(mutable_buf);
    return rv;
  }

  size_t ApproximateNumEntries(size_t bytes) override {
    size_t bytes_no_meta =
        bytes >= kMetadataLen ? RoundDownUsableSpace(bytes) - kMetadataLen : 0;
    return static_cast<size_t>(uint64_t{8000} * bytes_no_meta /
                               millibits_per_key_);
  }

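  // Illustrative sizing (not normative): 10,000 entries at
  // millibits_per_key_ == 10000 (10 bits/key) gives raw_target_len ==
  // (10000 * 10000 + 7999) / 8000 == 12,500 bytes, rounded up to 12,544
  // (multiple of 64), plus 5 metadata bytes == 12,549.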
  size_t CalculateSpace(size_t num_entries) override {
    // If not for cache line blocks in the filter, what would the target
    // length in bytes be?
    size_t raw_target_len = static_cast<size_t>(
        (uint64_t{num_entries} * millibits_per_key_ + 7999) / 8000);

    if (raw_target_len >= size_t{0xffffffc0}) {
      // Max supported for this data structure implementation
      raw_target_len = size_t{0xffffffc0};
    }

    // Round up to nearest multiple of 64 (block size). This adjustment is
    // used for target FP rate only so that we don't receive complaints about
    // lower FP rate vs. historic Bloom filter behavior.
    return ((raw_target_len + 63) & ~size_t{63}) + kMetadataLen;
  }

  double EstimatedFpRate(size_t keys, size_t len_with_metadata) override {
    int num_probes = GetNumProbes(keys, len_with_metadata);
    return FastLocalBloomImpl::EstimatedFpRate(
        keys, len_with_metadata - kMetadataLen, num_probes, /*hash bits*/ 64);
  }

 protected:
  size_t RoundDownUsableSpace(size_t available_size) override {
    size_t rv = available_size - kMetadataLen;

    if (rv >= size_t{0xffffffc0}) {
      // Max supported for this data structure implementation
      rv = size_t{0xffffffc0};
    }

    // round down to multiple of 64 (block size)
    rv &= ~size_t{63};

    return rv + kMetadataLen;
  }

 private:
  // Compute num_probes after any rounding / adjustments
  int GetNumProbes(size_t keys, size_t len_with_metadata) {
    uint64_t millibits = uint64_t{len_with_metadata - kMetadataLen} * 8000;
    int actual_millibits_per_key =
        static_cast<int>(millibits / std::max(keys, size_t{1}));
    // BEGIN XXX/TODO(peterd): preserving old/default behavior for now to
    // minimize unit test churn. Remove this some time.
    if (!aggregate_rounding_balance_) {
      actual_millibits_per_key = millibits_per_key_;
    }
    // END XXX/TODO
    return FastLocalBloomImpl::ChooseNumProbes(actual_millibits_per_key);
  }

  void AddAllEntries(char* data, uint32_t len, int num_probes) {
    // Simple version without prefetching:
    //
    // for (auto h : hash_entries_) {
    //   FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len,
    //                               num_probes, data);
    // }

    const size_t num_entries = hash_entries_.size();
    constexpr size_t kBufferMask = 7;
    static_assert(((kBufferMask + 1) & kBufferMask) == 0,
                  "Must be power of 2 minus 1");

    std::array<uint32_t, kBufferMask + 1> hashes;
    std::array<uint32_t, kBufferMask + 1> byte_offsets;

    // Prime the buffer
    size_t i = 0;
    for (; i <= kBufferMask && i < num_entries; ++i) {
      uint64_t h = hash_entries_.front();
      hash_entries_.pop_front();
      FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data,
                                      /*out*/ &byte_offsets[i]);
      hashes[i] = Upper32of64(h);
    }

    // Process and buffer
    for (; i < num_entries; ++i) {
      uint32_t& hash_ref = hashes[i & kBufferMask];
      uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask];
      // Process (add)
      FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes,
                                          data + byte_offset_ref);
      // And buffer
      uint64_t h = hash_entries_.front();
      hash_entries_.pop_front();
      FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data,
                                      /*out*/ &byte_offset_ref);
      hash_ref = Upper32of64(h);
    }

    // Finish processing
    for (i = 0; i <= kBufferMask && i < num_entries; ++i) {
      FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes,
                                          data + byte_offsets[i]);
    }
  }

  // Target allocation per added key, in thousandths of a bit.
  int millibits_per_key_;
};

// See description in FastLocalBloomImpl
class FastLocalBloomBitsReader : public FilterBitsReader {
 public:
  FastLocalBloomBitsReader(const char* data, int num_probes,
                           uint32_t len_bytes)
      : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {}

  // No Copy allowed
  FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete;
  void operator=(const FastLocalBloomBitsReader&) = delete;

  ~FastLocalBloomBitsReader() override {}

  bool MayMatch(const Slice& key) override {
    uint64_t h = GetSliceHash64(key);
    uint32_t byte_offset;
    FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_,
                                    /*out*/ &byte_offset);
    return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_,
                                                    data_ + byte_offset);
  }

  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
    for (int i = 0; i < num_keys; ++i) {
      uint64_t h = GetSliceHash64(*keys[i]);
      FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_,
                                      /*out*/ &byte_offsets[i]);
      hashes[i] = Upper32of64(h);
    }
    for (int i = 0; i < num_keys; ++i) {
      may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared(
          hashes[i], num_probes_, data_ + byte_offsets[i]);
    }
  }

 private:
  const char* data_;
  const int num_probes_;
  const uint32_t len_bytes_;
};

// ##################### Ribbon filter implementation ################### //

// Implements concept RehasherTypesAndSettings in ribbon_impl.h
struct Standard128RibbonRehasherTypesAndSettings {
  // These are schema-critical. Any change almost certainly changes
  // underlying data.
  static constexpr bool kIsFilter = true;
  static constexpr bool kHomogeneous = false;
  static constexpr bool kFirstCoeffAlwaysOne = true;
  static constexpr bool kUseSmash = false;
  using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
  using Hash = uint64_t;
  using Seed = uint32_t;
  // Changing these doesn't necessarily change underlying data,
  // but might affect supported scalability of those dimensions.
  using Index = uint32_t;
  using ResultRow = uint32_t;
  // Save a conditional in Ribbon queries
  static constexpr bool kAllowZeroStarts = false;
};

using Standard128RibbonTypesAndSettings =
    ribbon::StandardRehasherAdapter<Standard128RibbonRehasherTypesAndSettings>;

class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder {
 public:
  explicit Standard128RibbonBitsBuilder(
      double desired_one_in_fp_rate, int bloom_millibits_per_key,
      std::atomic<int64_t>* aggregate_rounding_balance, Logger* info_log)
      : XXPH3FilterBitsBuilder(aggregate_rounding_balance),
        desired_one_in_fp_rate_(desired_one_in_fp_rate),
        info_log_(info_log),
        bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance) {
    assert(desired_one_in_fp_rate >= 1.0);
  }

  // No Copy allowed
  Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete;
  void operator=(const Standard128RibbonBitsBuilder&) = delete;

  ~Standard128RibbonBitsBuilder() override {}

  virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
    if (hash_entries_.size() > kMaxRibbonEntries) {
      ROCKS_LOG_WARN(info_log_, "Too many keys for Ribbon filter: %llu",
                     static_cast<unsigned long long>(hash_entries_.size()));
      SwapEntriesWith(&bloom_fallback_);
      assert(hash_entries_.empty());
      return bloom_fallback_.Finish(buf);
    }
    if (hash_entries_.size() == 0) {
      // Save a conditional in Ribbon queries by using alternate reader
      // for zero entries added.
      return FinishAlwaysFalse(buf);
    }
    uint32_t num_entries = static_cast<uint32_t>(hash_entries_.size());
    uint32_t num_slots;
    size_t len_with_metadata;

    CalculateSpaceAndSlots(num_entries, &len_with_metadata, &num_slots);

    // Bloom fall-back indicator
    if (num_slots == 0) {
      SwapEntriesWith(&bloom_fallback_);
      assert(hash_entries_.empty());
      return bloom_fallback_.Finish(buf);
    }

    uint32_t entropy = 0;
    if (!hash_entries_.empty()) {
      entropy = Lower32of64(hash_entries_.front());
    }

    BandingType banding;
    bool success = banding.ResetAndFindSeedToSolve(
        num_slots, hash_entries_.begin(), hash_entries_.end(),
        /*starting seed*/ entropy & 255, /*seed mask*/ 255);
    if (!success) {
      ROCKS_LOG_WARN(info_log_,
                     "Too many re-seeds (256) for Ribbon filter, %llu / %llu",
                     static_cast<unsigned long long>(hash_entries_.size()),
                     static_cast<unsigned long long>(num_slots));
      SwapEntriesWith(&bloom_fallback_);
      assert(hash_entries_.empty());
      return bloom_fallback_.Finish(buf);
    }
    hash_entries_.clear();

    uint32_t seed = banding.GetOrdinalSeed();
    assert(seed < 256);

    std::unique_ptr<char[]> mutable_buf;
    len_with_metadata =
        AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf);

    SolnType soln(mutable_buf.get(), len_with_metadata);
    soln.BackSubstFrom(banding);
    uint32_t num_blocks = soln.GetNumBlocks();
    // This should be guaranteed:
    // num_entries < 2^30
    // => (overhead_factor < 2.0)
    // num_entries * overhead_factor == num_slots < 2^31
    // => (num_blocks = num_slots / 128)
    // num_blocks < 2^24
    assert(num_blocks < 0x1000000U);

    // See BloomFilterPolicy::GetBloomBitsReader re: metadata
    // -2 = Marker for Standard128 Ribbon
    mutable_buf[len_with_metadata - 5] = static_cast<char>(-2);
    // Hash seed
    mutable_buf[len_with_metadata - 4] = static_cast<char>(seed);
    // Number of blocks, in 24 bits
    // (Along with bytes, we can derive other settings)
    mutable_buf[len_with_metadata - 3] = static_cast<char>(num_blocks & 255);
    mutable_buf[len_with_metadata - 2] =
        static_cast<char>((num_blocks >> 8) & 255);
    mutable_buf[len_with_metadata - 1] =
        static_cast<char>((num_blocks >> 16) & 255);

    Slice rv(mutable_buf.get(), len_with_metadata);
    *buf = std::move(mutable_buf);
    return rv;
  }

  // Setting num_slots to 0 means "fall back on Bloom filter."
  // And note this implementation does not support num_entries or num_slots
  // beyond uint32_t; see kMaxRibbonEntries.
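  // Sizing intuition (illustrative): num_slots is num_entries times a small
  // overhead factor chosen via ribbon::BandingConfigHelper (kOneIn20) so
  // that banding with a single seed fails only ~1 in 20 times; Finish()
  // retries with up to 256 seeds before falling back on Bloom.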
  void CalculateSpaceAndSlots(size_t num_entries,
                              size_t* target_len_with_metadata,
                              uint32_t* num_slots) {
    if (num_entries > kMaxRibbonEntries) {
      // More entries than supported by this Ribbon
      *num_slots = 0;  // use Bloom
      *target_len_with_metadata = bloom_fallback_.CalculateSpace(num_entries);
      return;
    }
    uint32_t entropy = 0;
    if (!hash_entries_.empty()) {
      entropy = Upper32of64(hash_entries_.front());
    }

    *num_slots = NumEntriesToNumSlots(static_cast<uint32_t>(num_entries));
    *target_len_with_metadata =
        SolnType::GetBytesForOneInFpRate(*num_slots, desired_one_in_fp_rate_,
                                         /*rounding*/ entropy) +
        kMetadataLen;

    // Consider possible Bloom fallback for small filters
    if (*num_slots < 1024) {
      size_t bloom = bloom_fallback_.CalculateSpace(num_entries);
      if (bloom < *target_len_with_metadata) {
        *num_slots = 0;  // use Bloom
        *target_len_with_metadata = bloom;
        return;
      }
    }
  }

  size_t CalculateSpace(size_t num_entries) override {
    if (num_entries == 0) {
      // See FinishAlwaysFalse
      return 0;
    }
    size_t target_len_with_metadata;
    uint32_t num_slots;
    CalculateSpaceAndSlots(num_entries, &target_len_with_metadata, &num_slots);
    (void)num_slots;
    return target_len_with_metadata;
  }

  // This is a somewhat ugly but reasonably fast and reasonably accurate
  // reversal of CalculateSpace.
  size_t ApproximateNumEntries(size_t bytes) override {
    size_t len_no_metadata =
        RoundDownUsableSpace(std::max(bytes, size_t{kMetadataLen})) -
        kMetadataLen;

    if (!(desired_one_in_fp_rate_ > 1.0)) {
      // Effectively asking for 100% FP rate, or NaN etc.
      // Note that NaN is neither < 1.0 nor > 1.0
      return kMaxRibbonEntries;
    }

    // Find a slight under-estimate for actual average bits per slot
    double min_real_bits_per_slot;
    if (desired_one_in_fp_rate_ >= 1.0 + std::numeric_limits<uint32_t>::max()) {
      // Max of 32 solution columns (result bits)
      min_real_bits_per_slot = 32.0;
    } else {
      // Account for mix of b and b+1 solution columns being slightly
      // suboptimal vs. ideal log2(1/fp_rate) bits.
      uint32_t rounded = static_cast<uint32_t>(desired_one_in_fp_rate_);
      int upper_bits_per_key = 1 + FloorLog2(rounded);
      double fp_rate_for_upper = std::pow(2.0, -upper_bits_per_key);
      double portion_lower =
          (1.0 / desired_one_in_fp_rate_ - fp_rate_for_upper) /
          fp_rate_for_upper;
      min_real_bits_per_slot = upper_bits_per_key - portion_lower;
      assert(min_real_bits_per_slot > 0.0);
      assert(min_real_bits_per_slot <= 32.0);
    }

    // An overestimate, but this should only be O(1) slots away from truth.
    double max_slots = len_no_metadata * 8.0 / min_real_bits_per_slot;

    // Let's not bother accounting for overflow to Bloom filter
    // (Includes NaN case)
    if (!(max_slots < ConfigHelper::GetNumSlots(kMaxRibbonEntries))) {
      return kMaxRibbonEntries;
    }

    // Set up for short iteration
    uint32_t slots = static_cast<uint32_t>(max_slots);
    slots = SolnType::RoundUpNumSlots(slots);

    // Assert that we have a valid upper bound on slots
    assert(SolnType::GetBytesForOneInFpRate(
               SolnType::RoundUpNumSlots(slots + 1), desired_one_in_fp_rate_,
               /*rounding*/ 0) > len_no_metadata);

    // Iterate up to a few times to rather precisely account for small effects
    for (int i = 0; slots > 0; ++i) {
      size_t reqd_bytes =
          SolnType::GetBytesForOneInFpRate(slots, desired_one_in_fp_rate_,
                                           /*rounding*/ 0);
      if (reqd_bytes <= len_no_metadata) {
        break;  // done
      }
      if (i >= 2) {
        // should have been enough iterations
        assert(false);
        break;
      }
      slots = SolnType::RoundDownNumSlots(slots - 1);
    }

    uint32_t num_entries = ConfigHelper::GetNumToAdd(slots);

    // Consider possible Bloom fallback for small filters
    if (slots < 1024) {
      size_t bloom = bloom_fallback_.ApproximateNumEntries(bytes);
      if (bloom > num_entries) {
        return bloom;
      } else {
        return num_entries;
      }
    } else {
      return std::min(num_entries, kMaxRibbonEntries);
    }
  }

  double EstimatedFpRate(size_t num_entries,
                         size_t len_with_metadata) override {
    if (num_entries > kMaxRibbonEntries) {
      // More entries than supported by this Ribbon
      return bloom_fallback_.EstimatedFpRate(num_entries, len_with_metadata);
    }
    uint32_t num_slots =
        NumEntriesToNumSlots(static_cast<uint32_t>(num_entries));
    SolnType fake_soln(nullptr, len_with_metadata);
    fake_soln.ConfigureForNumSlots(num_slots);
    return fake_soln.ExpectedFpRate();
  }

 protected:
  size_t RoundDownUsableSpace(size_t available_size) override {
    size_t rv = available_size - kMetadataLen;

    // round down to multiple of 16 (segment size)
    rv &= ~size_t{15};

    return rv + kMetadataLen;
  }

 private:
  using TS = Standard128RibbonTypesAndSettings;
  using SolnType = ribbon::SerializableInterleavedSolution<TS>;
  using BandingType = ribbon::StandardBanding<TS>;
  using ConfigHelper = ribbon::BandingConfigHelper1TS<ribbon::kOneIn20, TS>;

  static uint32_t NumEntriesToNumSlots(uint32_t num_entries) {
    uint32_t num_slots1 = ConfigHelper::GetNumSlots(num_entries);
    return SolnType::RoundUpNumSlots(num_slots1);
  }

  // Approximate num_entries to ensure number of bytes fits in 32 bits, which
  // is not an inherent limitation but does ensure somewhat graceful Bloom
  // fallback for crazy high number of entries, since the Bloom implementation
  // does not support number of bytes bigger than fits in 32 bits. This is
  // within an order of magnitude of implementation limit on num_slots
  // fitting in 32 bits, and even closer for num_blocks fitting in 24 bits
  // (for filter metadata).
  static constexpr uint32_t kMaxRibbonEntries = 950000000;  // ~ 1 billion

  // A desired value for 1/fp_rate. For example, 100 -> 1% fp rate.
  double desired_one_in_fp_rate_;

  // For warnings, or can be nullptr
  Logger* info_log_;

  // For falling back on Bloom filter in some exceptional cases and
  // very small filter cases
  FastLocalBloomBitsBuilder bloom_fallback_;
};

// for the linker, at least with DEBUG_LEVEL=2
constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries;

class Standard128RibbonBitsReader : public FilterBitsReader {
 public:
  Standard128RibbonBitsReader(const char* data, size_t len_bytes,
                              uint32_t num_blocks, uint32_t seed)
      : soln_(const_cast<char*>(data), len_bytes) {
    soln_.ConfigureForNumBlocks(num_blocks);
    hasher_.SetOrdinalSeed(seed);
  }

  // No Copy allowed
  Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete;
  void operator=(const Standard128RibbonBitsReader&) = delete;

  ~Standard128RibbonBitsReader() override {}

  bool MayMatch(const Slice& key) override {
    uint64_t h = GetSliceHash64(key);
    return soln_.FilterQuery(h, hasher_);
  }

  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
    struct SavedData {
      uint64_t seeded_hash;
      uint32_t segment_num;
      uint32_t num_columns;
      uint32_t start_bits;
    };
    std::array<SavedData, MultiGetContext::MAX_BATCH_SIZE> saved;
    for (int i = 0; i < num_keys; ++i) {
      ribbon::InterleavedPrepareQuery(
          GetSliceHash64(*keys[i]), hasher_, soln_, &saved[i].seeded_hash,
          &saved[i].segment_num, &saved[i].num_columns, &saved[i].start_bits);
    }
    for (int i = 0; i < num_keys; ++i) {
      may_match[i] = ribbon::InterleavedFilterQuery(
          saved[i].seeded_hash, saved[i].segment_num, saved[i].num_columns,
          saved[i].start_bits, hasher_, soln_);
    }
  }

 private:
  using TS = Standard128RibbonTypesAndSettings;
  ribbon::SerializableInterleavedSolution<TS> soln_;
  ribbon::StandardHasher<TS> hasher_;
};

// ##################### Legacy Bloom implementation ################### //

using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>;

class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
 public:
  explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log);

  // No Copy allowed
  LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete;
  void operator=(const LegacyBloomBitsBuilder&) = delete;

  ~LegacyBloomBitsBuilder() override;

  void AddKey(const Slice& key) override;

  virtual size_t EstimateEntriesAdded() override {
    return hash_entries_.size();
  }

  Slice Finish(std::unique_ptr<const char[]>* buf) override;

  size_t CalculateSpace(size_t num_entries) override {
    uint32_t dont_care1;
    uint32_t dont_care2;
    return CalculateSpace(num_entries, &dont_care1, &dont_care2);
  }

  double EstimatedFpRate(size_t keys, size_t bytes) override {
    return LegacyBloomImpl::EstimatedFpRate(keys, bytes - kMetadataLen,
                                            num_probes_);
  }

  size_t ApproximateNumEntries(size_t bytes) override;

 private:
  int bits_per_key_;
  int num_probes_;
  std::vector<uint32_t> hash_entries_;
  Logger* info_log_;

  // Get total bits, optimized for CPU cache line locality
  uint32_t GetTotalBitsForLocality(uint32_t total_bits);

  // Reserve space for new filter
  char* ReserveSpace(size_t num_entries, uint32_t* total_bits,
                     uint32_t* num_lines);

  // Implementation-specific variant of public CalculateSpace
  uint32_t CalculateSpace(size_t num_entries, uint32_t* total_bits,
                          uint32_t* num_lines);

  // Assuming single threaded access to this function.
  void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits);
};

LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key,
                                               Logger* info_log)
    : bits_per_key_(bits_per_key),
      num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)),
      info_log_(info_log) {
  assert(bits_per_key_);
}

LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {}

void LegacyBloomBitsBuilder::AddKey(const Slice& key) {
  uint32_t hash = BloomHash(key);
  if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
    hash_entries_.push_back(hash);
  }
}

Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
  uint32_t total_bits, num_lines;
  size_t num_entries = hash_entries_.size();
  char* data =
      ReserveSpace(static_cast<int>(num_entries), &total_bits, &num_lines);
  assert(data);

  if (total_bits != 0 && num_lines != 0) {
    for (auto h : hash_entries_) {
      AddHash(h, data, num_lines, total_bits);
    }

    // Check for excessive entries for 32-bit hash function
    if (num_entries >= /* minimum of 3 million */ 3000000U) {
      // More specifically, we can detect that the 32-bit hash function
      // is causing significant increase in FP rate by comparing current
      // estimated FP rate to what we would get with a normal number of
      // keys at same memory ratio.
      double est_fp_rate = LegacyBloomImpl::EstimatedFpRate(
          num_entries, total_bits / 8, num_probes_);
      double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate(
          1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_);

      if (est_fp_rate >= 1.50 * vs_fp_rate) {
        // For more details, see
        // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
        ROCKS_LOG_WARN(
            info_log_,
            "Using legacy SST/BBT Bloom filter with excessive key count "
            "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. "
            "Consider using new Bloom with format_version>=5, smaller SST "
            "file size, or partitioned filters.",
            num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate);
      }
    }
  }
  // See BloomFilterPolicy::GetFilterBitsReader for metadata
  data[total_bits / 8] = static_cast<char>(num_probes_);
  EncodeFixed32(data + total_bits / 8 + 1, static_cast<uint32_t>(num_lines));

  const char* const_data = data;
  buf->reset(const_data);
  hash_entries_.clear();

  return Slice(data, total_bits / 8 + kMetadataLen);
}

size_t LegacyBloomBitsBuilder::ApproximateNumEntries(size_t bytes) {
  assert(bits_per_key_);
  assert(bytes > 0);

  uint64_t total_bits_tmp = bytes * 8;
  // total bits, including temporary computations, cannot exceed 2^32
  // for compatibility
  total_bits_tmp = std::min(total_bits_tmp, uint64_t{0xffff0000});

  uint32_t high = static_cast<uint32_t>(total_bits_tmp) /
                      static_cast<uint32_t>(bits_per_key_) +
                  1;
  uint32_t low = 1;
  uint32_t n = high;
  for (; n >= low; n--) {
    if (CalculateSpace(n) <= bytes) {
      break;
    }
  }
  return n;
}

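// Worked example (illustrative): with 64-byte cache lines (512 bits) and
// total_bits == 10000, num_lines == (10000 + 511) / 512 == 20, bumped to
// the odd value 21, so the result is 21 * 512 == 10752 bits.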
uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
  uint32_t num_lines =
      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);

  // Make num_lines an odd number to make sure more bits are involved
  // when determining which block.
  if (num_lines % 2 == 0) {
    num_lines++;
  }
  return num_lines * (CACHE_LINE_SIZE * 8);
}

uint32_t LegacyBloomBitsBuilder::CalculateSpace(size_t num_entries,
                                                uint32_t* total_bits,
                                                uint32_t* num_lines) {
  assert(bits_per_key_);
  if (num_entries != 0) {
    size_t total_bits_tmp = num_entries * bits_per_key_;
    // total bits, including temporary computations, cannot exceed 2^32
    // for compatibility
    total_bits_tmp = std::min(total_bits_tmp, size_t{0xffff0000});

    *total_bits =
        GetTotalBitsForLocality(static_cast<uint32_t>(total_bits_tmp));
    *num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
    assert(*total_bits > 0 && *total_bits % 8 == 0);
  } else {
    // filter is empty, just leave space for metadata
    *total_bits = 0;
    *num_lines = 0;
  }

  // Reserve space for Filter
  uint32_t sz = *total_bits / 8;
  sz += kMetadataLen;  // 4 bytes for num_lines, 1 byte for num_probes
  return sz;
}

char* LegacyBloomBitsBuilder::ReserveSpace(size_t num_entries,
                                           uint32_t* total_bits,
                                           uint32_t* num_lines) {
  uint32_t sz = CalculateSpace(num_entries, total_bits, num_lines);
  char* data = new char[sz];
  memset(data, 0, sz);
  return data;
}

inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data,
                                            uint32_t num_lines,
                                            uint32_t total_bits) {
#ifdef NDEBUG
  static_cast<void>(total_bits);
#endif
  assert(num_lines > 0 && total_bits > 0);

  LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data,
                           folly::constexpr_log2(CACHE_LINE_SIZE));
}

class LegacyBloomBitsReader : public FilterBitsReader {
 public:
  LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines,
                        uint32_t log2_cache_line_size)
      : data_(data),
        num_probes_(num_probes),
        num_lines_(num_lines),
        log2_cache_line_size_(log2_cache_line_size) {}

  // No Copy allowed
  LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete;
  void operator=(const LegacyBloomBitsReader&) = delete;

  ~LegacyBloomBitsReader() override {}

  // "contents" contains the data built by a preceding call to
  // FilterBitsBuilder::Finish. MayMatch must return true if the key was
  // passed to FilterBitsBuilder::AddKey. This method may return true or false
  // if the key was not on the list, but it should aim to return false with a
  // high probability.
  bool MayMatch(const Slice& key) override {
    uint32_t hash = BloomHash(key);
    uint32_t byte_offset;
    LegacyBloomImpl::PrepareHashMayMatch(
        hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_);
    return LegacyBloomImpl::HashMayMatchPrepared(
        hash, num_probes_, data_ + byte_offset, log2_cache_line_size_);
  }

  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
    for (int i = 0; i < num_keys; ++i) {
      hashes[i] = BloomHash(*keys[i]);
      LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_,
                                           /*out*/ &byte_offsets[i],
                                           log2_cache_line_size_);
    }
    for (int i = 0; i < num_keys; ++i) {
      may_match[i] = LegacyBloomImpl::HashMayMatchPrepared(
          hashes[i], num_probes_, data_ + byte_offsets[i],
          log2_cache_line_size_);
    }
  }

 private:
  const char* data_;
  const int num_probes_;
  const uint32_t num_lines_;
  const uint32_t log2_cache_line_size_;
};

class AlwaysTrueFilter : public FilterBitsReader {
 public:
  bool MayMatch(const Slice&) override { return true; }
  using FilterBitsReader::MayMatch;  // inherit overload
};

class AlwaysFalseFilter : public FilterBitsReader {
 public:
  bool MayMatch(const Slice&) override { return false; }
  using FilterBitsReader::MayMatch;  // inherit overload
};

}  // namespace

const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllFixedImpls = {
    kLegacyBloom,
    kDeprecatedBlock,
    kFastLocalBloom,
    kStandard128Ribbon,
};

const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllUserModes = {
    kDeprecatedBlock,
    kAutoBloom,
    kStandard128Ribbon,
};

BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode)
    : mode_(mode), warned_(false), aggregate_rounding_balance_(0) {
  // Sanitize bits_per_key
  if (bits_per_key < 1.0) {
    bits_per_key = 1.0;
  } else if (!(bits_per_key < 100.0)) {  // including NaN
    bits_per_key = 100.0;
  }

  // Includes a nudge toward rounding up, to ensure on all platforms
  // that doubles specified with three decimal digits after the decimal
  // point are interpreted accurately.
  millibits_per_key_ = static_cast<int>(bits_per_key * 1000.0 + 0.500001);
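  // For example (illustrative): 9.9 bits/key may be represented in floating
  // point as slightly above or below 9.9; 9.9 * 1000.0 lands near 9900 and
  // the +0.500001 nudge makes the truncation yield exactly 9900 millibits.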

  // For now configure Ribbon filter to match Bloom FP rate and save
  // memory. (Ribbon bits per key will be ~30% less than Bloom bits per key
  // for same FP rate.)
  desired_one_in_fp_rate_ =
      1.0 / BloomMath::CacheLocalFpRate(
                bits_per_key,
                FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_),
                /*cache_line_bits*/ 512);

  // For better or worse, this is a rounding up of a nudged rounding up,
  // e.g. 7.4999999999999 will round up to 8, but that provides more
  // predictability against small arithmetic errors in floating point.
  whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000;
}

BloomFilterPolicy::~BloomFilterPolicy() {}

const char* BuiltinFilterPolicy::Name() const {
  return "rocksdb.BuiltinBloomFilter";
}

void BloomFilterPolicy::CreateFilter(const Slice* keys, int n,
                                     std::string* dst) const {
  // We should ideally only be using this deprecated interface for
  // appropriately constructed BloomFilterPolicy
  assert(mode_ == kDeprecatedBlock);

  // Compute bloom filter size (in both bits and bytes)
  uint32_t bits = static_cast<uint32_t>(n * whole_bits_per_key_);

  // For small n, we can see a very high false positive rate.  Fix it
  // by enforcing a minimum bloom filter length.
  if (bits < 64) bits = 64;

  uint32_t bytes = (bits + 7) / 8;
  bits = bytes * 8;
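  // For example (illustrative): n == 3 keys at 10 whole bits per key gives
  // 30 bits, clamped up to the 64-bit minimum, i.e. 8 bytes.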

  int num_probes =
      LegacyNoLocalityBloomImpl::ChooseNumProbes(whole_bits_per_key_);

  const size_t init_size = dst->size();
  dst->resize(init_size + bytes, 0);
  dst->push_back(static_cast<char>(num_probes));  // Remember # of probes
  char* array = &(*dst)[init_size];
  for (int i = 0; i < n; i++) {
    LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes,
                                       array);
  }
}

bool BuiltinFilterPolicy::KeyMayMatch(const Slice& key,
                                      const Slice& bloom_filter) const {
  const size_t len = bloom_filter.size();
  if (len < 2 || len > 0xffffffffU) {
    return false;
  }

  const char* array = bloom_filter.data();
  const uint32_t bits = static_cast<uint32_t>(len - 1) * 8;

  // Use the encoded k so that we can read filters generated by
  // bloom filters created using different parameters.
  const int k = static_cast<uint8_t>(array[len - 1]);
  if (k > 30) {
    // Reserved for potentially new encodings for short bloom filters.
    // Consider it a match.
    return true;
  }
  // NB: using stored k not num_probes for whole_bits_per_key_
  return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k,
                                                 array);
}

FilterBitsBuilder* BuiltinFilterPolicy::GetFilterBitsBuilder() const {
  // This code path should no longer be used, for the built-in
  // BloomFilterPolicy. Internal to RocksDB and outside
  // BloomFilterPolicy, only get a FilterBitsBuilder with
  // BloomFilterPolicy::GetBuilderFromContext(), which will call
  // BloomFilterPolicy::GetBuilderWithContext(). RocksDB users have
  // been warned (HISTORY.md) that they can no longer call this on
  // the built-in BloomFilterPolicy (unlikely).
  assert(false);
  return GetBuilderWithContext(FilterBuildingContext(BlockBasedTableOptions()));
}

FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext(
    const FilterBuildingContext& context) const {
  Mode cur = mode_;
  bool offm = context.table_options.optimize_filters_for_memory;
  // Unusual code construction so that we can have just
  // one exhaustive switch without (risky) recursion
  for (int i = 0; i < 2; ++i) {
    switch (cur) {
      case kAutoBloom:
        if (context.table_options.format_version < 5) {
          cur = kLegacyBloom;
        } else {
          cur = kFastLocalBloom;
        }
        break;
      case kDeprecatedBlock:
        return nullptr;
      case kFastLocalBloom:
        return new FastLocalBloomBitsBuilder(
            millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr);
      case kLegacyBloom:
        if (whole_bits_per_key_ >= 14 && context.info_log &&
            !warned_.load(std::memory_order_relaxed)) {
          warned_ = true;
          const char* adjective;
          if (whole_bits_per_key_ >= 20) {
            adjective = "Dramatic";
          } else {
            adjective = "Significant";
          }
          // For more details, see
          // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
          ROCKS_LOG_WARN(
              context.info_log,
              "Using legacy Bloom filter with high (%d) bits/key. "
              "%s filter space and/or accuracy improvement is available "
              "with format_version>=5.",
              whole_bits_per_key_, adjective);
        }
        return new LegacyBloomBitsBuilder(whole_bits_per_key_,
                                          context.info_log);
      case kStandard128Ribbon:
        return new Standard128RibbonBitsBuilder(
            desired_one_in_fp_rate_, millibits_per_key_,
            offm ? &aggregate_rounding_balance_ : nullptr, context.info_log);
    }
  }
  assert(false);
  return nullptr;  // something legal
}

FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext(
    const FilterBuildingContext& context) {
  if (context.table_options.filter_policy) {
    return context.table_options.filter_policy->GetBuilderWithContext(context);
  } else {
    return nullptr;
  }
}

// Read metadata to determine what kind of FilterBitsReader is needed
// and return a new one.
FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader(
    const Slice& contents) const {
  uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
  if (len_with_meta <= kMetadataLen) {
    // filter is empty or broken. Treat like zero keys added.
    return new AlwaysFalseFilter();
  }

  // Legacy Bloom filter data:
  //             0 +-----------------------------------+
  //               | Raw Bloom filter data             |
  //               | ...                               |
  //           len +-----------------------------------+
  //               | byte for num_probes or            |
  //               |   marker for new implementations  |
  //         len+1 +-----------------------------------+
  //               | four bytes for number of cache    |
  //               |   lines                           |
  // len_with_meta +-----------------------------------+

  int8_t raw_num_probes =
      static_cast<int8_t>(contents.data()[len_with_meta - kMetadataLen]);
  // NB: *num_probes > 30 and < 128 probably have not been used, because of
  // BloomFilterPolicy::initialize, unless directly calling
  // LegacyBloomBitsBuilder as an API, but we are leaving those cases in
  // limbo with LegacyBloomBitsReader for now.

  if (raw_num_probes < 1) {
    // Note: < 0 (or unsigned > 127) indicate special new implementations
    // (or reserved for future use)
    switch (raw_num_probes) {
      case 0:
        // Treat as zero probes (always FP)
        return new AlwaysTrueFilter();
      case -1:
        // Marker for newer Bloom implementations
        return GetBloomBitsReader(contents);
      case -2:
        // Marker for Ribbon implementations
        return GetRibbonBitsReader(contents);
      default:
        // Reserved (treat as zero probes, always FP, for now)
        return new AlwaysTrueFilter();
    }
  }
  // else attempt decode for LegacyBloomBitsReader

  int num_probes = raw_num_probes;
  assert(num_probes >= 1);
  assert(num_probes <= 127);

  uint32_t len = len_with_meta - kMetadataLen;
  assert(len > 0);

  uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4);
  uint32_t log2_cache_line_size;

  if (num_lines * CACHE_LINE_SIZE == len) {
    // Common case
    log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE);
  } else if (num_lines == 0 || len % num_lines != 0) {
    // Invalid (no solution to num_lines * x == len)
    // Treat as zero probes (always FP) for now.
    return new AlwaysTrueFilter();
  } else {
    // Determine the non-native cache line size (from another system)
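    // For example (illustrative): a filter built where CACHE_LINE_SIZE was
    // 128 but read where it is 64 has num_lines == len / 128, so the loop
    // below stops at log2_cache_line_size == 7.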
1262     log2_cache_line_size = 0;
1263     while ((num_lines << log2_cache_line_size) < len) {
1264       ++log2_cache_line_size;
1265     }
1266     if ((num_lines << log2_cache_line_size) != len) {
1267       // Invalid (block size not a power of two)
1268       // Treat as zero probes (always FP) for now.
1269       return new AlwaysTrueFilter();
1270     }
1271   }
1272   // if not early return
1273   return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines,
1274                                    log2_cache_line_size);
1275 }
1276 
GetRibbonBitsReader(const Slice & contents) const1277 FilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader(
1278     const Slice& contents) const {
1279   uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
1280   uint32_t len = len_with_meta - kMetadataLen;
1281 
1282   assert(len > 0);  // precondition
1283 
1284   uint32_t seed = static_cast<uint8_t>(contents.data()[len + 1]);
1285   uint32_t num_blocks = static_cast<uint8_t>(contents.data()[len + 2]);
1286   num_blocks |= static_cast<uint8_t>(contents.data()[len + 3]) << 8;
1287   num_blocks |= static_cast<uint8_t>(contents.data()[len + 4]) << 16;
1288   if (num_blocks < 2) {
1289     // Not supported
1290     // num_blocks == 1 is not used because num_starts == 1 is problematic
1291     // for the hashing scheme. num_blocks == 0 is unused because there's
1292     // already a concise encoding of an "always false" filter.
1293     // Return something safe:
1294     return new AlwaysTrueFilter();
1295   }
1296   return new Standard128RibbonBitsReader(contents.data(), len, num_blocks,
1297                                          seed);
1298 }

// For newer Bloom filter implementations
FilterBitsReader* BuiltinFilterPolicy::GetBloomBitsReader(
    const Slice& contents) const {
  uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
  uint32_t len = len_with_meta - kMetadataLen;

  assert(len > 0);  // precondition

  // New Bloom filter data:
  //             0 +-----------------------------------+
  //               | Raw Bloom filter data             |
  //               | ...                               |
  //           len +-----------------------------------+
  //               | char{-1} byte -> new Bloom filter |
  //         len+1 +-----------------------------------+
  //               | byte for subimplementation        |
  //               |   0: FastLocalBloom               |
  //               |   other: reserved                 |
  //         len+2 +-----------------------------------+
  //               | byte for block_and_probes         |
  //               |   0 in top 3 bits -> 6 -> 64-byte |
  //               |   reserved:                       |
  //               |   1 in top 3 bits -> 7 -> 128-byte|
  //               |   2 in top 3 bits -> 8 -> 256-byte|
  //               |   ...                             |
  //               |   num_probes in bottom 5 bits,    |
  //               |     except 0 and 31 reserved      |
  //         len+3 +-----------------------------------+
  //               | two bytes reserved                |
  //               |   possibly for hash seed          |
  // len_with_meta +-----------------------------------+

  // Read more metadata (see above)
  char sub_impl_val = contents.data()[len_with_meta - 4];
  char block_and_probes = contents.data()[len_with_meta - 3];
  int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6;

  int num_probes = (block_and_probes & 31);
  if (num_probes < 1 || num_probes > 30) {
    // Reserved / future safe
    return new AlwaysTrueFilter();
  }

  uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2);
  if (rest != 0) {
    // Reserved, possibly for hash seed
    // Future safe
    return new AlwaysTrueFilter();
  }

  if (sub_impl_val == 0) {        // FastLocalBloom
    if (log2_block_bytes == 6) {  // Only block size supported for now
      return new FastLocalBloomBitsReader(contents.data(), num_probes, len);
    }
  }
  // otherwise
  // Reserved / future safe
  return new AlwaysTrueFilter();
}
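
// Worked example (illustrative, not part of the library): with
// block_and_probes == 0x06, the top 3 bits are 0, so
// log2_block_bytes == 0 + 6 == 6 (64-byte blocks), and the bottom 5 bits
// give num_probes == 6. A trailer of
//   { char{-1}, 0x00, 0x06, 0x00, 0x00 }
// therefore decodes as FastLocalBloom with 64-byte blocks and 6 probes.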

const FilterPolicy* NewBloomFilterPolicy(double bits_per_key,
                                         bool use_block_based_builder) {
  BloomFilterPolicy::Mode m;
  if (use_block_based_builder) {
    m = BloomFilterPolicy::kDeprecatedBlock;
  } else {
    m = BloomFilterPolicy::kAutoBloom;
  }
  assert(std::find(BloomFilterPolicy::kAllUserModes.begin(),
                   BloomFilterPolicy::kAllUserModes.end(),
                   m) != BloomFilterPolicy::kAllUserModes.end());
  return new BloomFilterPolicy(bits_per_key, m);
}
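
// Typical usage (a sketch, not from this file): install the policy in
// BlockBasedTableOptions so newly written SST files carry Bloom filters.
//
//   BlockBasedTableOptions table_options;
//   table_options.filter_policy.reset(
//       NewBloomFilterPolicy(10.0 /* bits_per_key */,
//                            false /* use_block_based_builder */));
//   Options options;
//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));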

// Chooses between two filter policies based on LSM level, but
// only for Level and Universal compaction styles. Flush is treated
// as level -1. Policy b is considered fallback / primary policy.
LevelThresholdFilterPolicy::LevelThresholdFilterPolicy(
    std::unique_ptr<const FilterPolicy>&& a,
    std::unique_ptr<const FilterPolicy>&& b, int starting_level_for_b)
    : policy_a_(std::move(a)),
      policy_b_(std::move(b)),
      starting_level_for_b_(starting_level_for_b) {
  // Don't use this wrapper class if you were going to set
  // starting_level_for_b to -1
  assert(starting_level_for_b_ >= 0);
}

// Deprecated block-based filter only
void LevelThresholdFilterPolicy::CreateFilter(const Slice* keys, int n,
                                              std::string* dst) const {
  policy_b_->CreateFilter(keys, n, dst);
}

FilterBitsBuilder* LevelThresholdFilterPolicy::GetBuilderWithContext(
    const FilterBuildingContext& context) const {
  switch (context.compaction_style) {
    case kCompactionStyleLevel:
    case kCompactionStyleUniversal: {
      int levelish;
      if (context.reason == TableFileCreationReason::kFlush) {
        // Treat flush as level -1
        assert(context.level_at_creation == 0);
        levelish = -1;
      } else if (context.level_at_creation == -1) {
        // Unknown level
        // Policy b considered fallback / primary
        return policy_b_->GetBuilderWithContext(context);
      } else {
        levelish = context.level_at_creation;
      }
      if (levelish >= starting_level_for_b_) {
        return policy_b_->GetBuilderWithContext(context);
      } else {
        return policy_a_->GetBuilderWithContext(context);
      }
    }
    case kCompactionStyleFIFO:
    case kCompactionStyleNone:
      break;
  }
  // Policy b considered fallback / primary
  return policy_b_->GetBuilderWithContext(context);
}
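
// Illustrative mapping (not part of the library): with
// starting_level_for_b == 2 under level or universal compaction,
//
//   flush (levelish -1)   -> policy a
//   level 0, level 1      -> policy a
//   level 2 and deeper    -> policy b
//   unknown level, FIFO, or no compaction style -> policy b (fallback)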

const FilterPolicy* NewRibbonFilterPolicy(double bloom_equivalent_bits_per_key,
                                          int bloom_before_level) {
  std::unique_ptr<const FilterPolicy> ribbon_only{new BloomFilterPolicy(
      bloom_equivalent_bits_per_key, BloomFilterPolicy::kStandard128Ribbon)};
  if (bloom_before_level > -1) {
    // Could also use Bloom policy
    std::unique_ptr<const FilterPolicy> bloom_only{new BloomFilterPolicy(
        bloom_equivalent_bits_per_key, BloomFilterPolicy::kFastLocalBloom)};
    return new LevelThresholdFilterPolicy(
        std::move(bloom_only), std::move(ribbon_only), bloom_before_level);
  } else {
    return ribbon_only.release();
  }
}
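
// Usage sketch (illustrative, not from this file): Ribbon filters trade
// slower construction for smaller size, so it is common to keep the
// faster Bloom filters on the write-heavy early levels.
//
//   // Bloom for flush and L0, Ribbon from L1 down:
//   table_options.filter_policy.reset(
//       NewRibbonFilterPolicy(12.7, /*bloom_before_level=*/1));
//   // Ribbon on every level, including flush:
//   table_options.filter_policy.reset(
//       NewRibbonFilterPolicy(12.7, /*bloom_before_level=*/-1));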

FilterBuildingContext::FilterBuildingContext(
    const BlockBasedTableOptions& _table_options)
    : table_options(_table_options) {}

FilterPolicy::~FilterPolicy() {}

Status FilterPolicy::CreateFromString(
    const ConfigOptions& /*options*/, const std::string& value,
    std::shared_ptr<const FilterPolicy>* policy) {
  const std::string kBloomName = "bloomfilter:";
  const std::string kExpRibbonName = "experimental_ribbon:";
  const std::string kRibbonName = "ribbonfilter:";
  if (value == kNullptrString || value == "rocksdb.BuiltinBloomFilter") {
    policy->reset();
#ifndef ROCKSDB_LITE
  } else if (value.compare(0, kBloomName.size(), kBloomName) == 0) {
    size_t pos = value.find(':', kBloomName.size());
    if (pos == std::string::npos) {
      return Status::InvalidArgument(
          "Invalid filter policy config, missing bits_per_key");
    } else {
      double bits_per_key = ParseDouble(
          trim(value.substr(kBloomName.size(), pos - kBloomName.size())));
      bool use_block_based_builder =
          ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1)));
      policy->reset(
          NewBloomFilterPolicy(bits_per_key, use_block_based_builder));
    }
  } else if (value.compare(0, kExpRibbonName.size(), kExpRibbonName) == 0) {
    double bloom_equivalent_bits_per_key =
        ParseDouble(trim(value.substr(kExpRibbonName.size())));
    policy->reset(
        NewExperimentalRibbonFilterPolicy(bloom_equivalent_bits_per_key));
  } else if (value.compare(0, kRibbonName.size(), kRibbonName) == 0) {
    size_t pos = value.find(':', kRibbonName.size());
    int bloom_before_level;
    if (pos == std::string::npos) {
      pos = value.size();
      bloom_before_level = 0;
    } else {
      bloom_before_level = ParseInt(trim(value.substr(pos + 1)));
    }
    double bloom_equivalent_bits_per_key = ParseDouble(
        trim(value.substr(kRibbonName.size(), pos - kRibbonName.size())));
    policy->reset(NewRibbonFilterPolicy(bloom_equivalent_bits_per_key,
                                        bloom_before_level));
  } else {
    return Status::NotFound("Invalid filter policy name ", value);
#else
  } else {
    return Status::NotSupported("Cannot load filter policy in LITE mode ",
                                value);
#endif  // ROCKSDB_LITE
  }
  return Status::OK();
}
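
// Example config strings accepted above (illustrative):
//   "bloomfilter:10:false"     -> NewBloomFilterPolicy(10, false)
//   "ribbonfilter:12.7"        -> NewRibbonFilterPolicy(12.7, 0)
//   "ribbonfilter:12.7:2"      -> NewRibbonFilterPolicy(12.7, 2)
//   "experimental_ribbon:12.7" -> NewExperimentalRibbonFilterPolicy(12.7)
//   "nullptr"                  -> no filter policy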
}  // namespace ROCKSDB_NAMESPACE