// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "rocksdb/filter_policy.h"

#include <array>
#include <deque>
#include <limits>

#include "logging/logging.h"
#include "rocksdb/slice.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/filter_policy_internal.h"
#include "table/block_based/full_filter_block.h"
#include "third-party/folly/folly/ConstexprMath.h"
#include "util/bloom_impl.h"
#include "util/coding.h"
#include "util/hash.h"
#include "util/ribbon_config.h"
#include "util/ribbon_impl.h"

namespace ROCKSDB_NAMESPACE {

namespace {

// Metadata trailer size for built-in filters. (This is separate from
// block-based table block trailer.)
//
// Originally this was 1 byte for num_probes and 4 bytes for number of
// cache lines in the Bloom filter, but now the first trailer byte is
// usually an implementation marker and remaining 4 bytes have various
// meanings.
static constexpr uint32_t kMetadataLen = 5;
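
// For reference, an illustrative summary of the 5 trailer bytes as written
// by the implementations below: legacy Bloom stores [num_probes byte]
// [num_lines as 4-byte fixed32]; format_version>=5 Bloom stores [-1 marker]
// [sub-implementation byte][block size / num_probes byte][2 reserved bytes];
// Standard128 Ribbon stores [-2 marker][seed byte][num_blocks in 3 bytes,
// little-endian].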

Slice FinishAlwaysFalse(std::unique_ptr<const char[]>* /*buf*/) {
  // Missing metadata, treated as zero entries
  return Slice(nullptr, 0);
}

// Base class for filter builders using the XXH3 preview hash,
// also known as Hash64 or GetSliceHash64.
class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
 public:
  explicit XXPH3FilterBitsBuilder(
      std::atomic<int64_t>* aggregate_rounding_balance)
      : aggregate_rounding_balance_(aggregate_rounding_balance) {}

  ~XXPH3FilterBitsBuilder() override {}

  virtual void AddKey(const Slice& key) override {
    uint64_t hash = GetSliceHash64(key);
    // Especially with prefixes, it is common to have repetition,
    // though only adjacent repetition, which we want to immediately
    // recognize and collapse for estimating true filter space
    // requirements.
    if (hash_entries_.empty() || hash != hash_entries_.back()) {
      hash_entries_.push_back(hash);
    }
  }

  virtual size_t EstimateEntriesAdded() override {
    return hash_entries_.size();
  }

 protected:
  static constexpr uint32_t kMetadataLen = 5;

  // For delegating between XXPH3FilterBitsBuilders
  void SwapEntriesWith(XXPH3FilterBitsBuilder* other) {
    std::swap(hash_entries_, other->hash_entries_);
  }

  virtual size_t RoundDownUsableSpace(size_t available_size) = 0;

  // To choose size using malloc_usable_size, we have to actually allocate.
  size_t AllocateMaybeRounding(size_t target_len_with_metadata,
                               size_t num_entries,
                               std::unique_ptr<char[]>* buf) {
    // Return value set to a default; overwritten in some cases
    size_t rv = target_len_with_metadata;
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    if (aggregate_rounding_balance_ != nullptr) {
      // Do optimize_filters_for_memory, using malloc_usable_size.
      // Approach: try to keep FP rate balance better than or on
      // target (negative aggregate_rounding_balance_). We can then select a
      // lower bound filter size (within reasonable limits) that gets us as
      // close to on target as possible. We request allocation for that filter
      // size and use malloc_usable_size to "round up" to the actual
      // allocation size.

      // Although it can be considered bad practice to use malloc_usable_size
      // to access an object beyond its original size, this approach should be
      // quite general: working for all allocators that properly support
      // malloc_usable_size.

      // Race condition on balance is OK because it can only cause temporary
      // skew in rounding up vs. rounding down, as long as updates are atomic
      // and relative.
      int64_t balance = aggregate_rounding_balance_->load();

      double target_fp_rate =
          EstimatedFpRate(num_entries, target_len_with_metadata);
      double rv_fp_rate = target_fp_rate;

      if (balance < 0) {
        // See formula for BloomFilterPolicy::aggregate_rounding_balance_
        double for_balance_fp_rate =
            -balance / double{0x100000000} + target_fp_rate;
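
        // Worked example (illustrative): the balance is kept in units of
        // 2^-32 FP rate (see the diff computation below), so a balance of
        // -4294967, about -0.001 * 2^32, means the aggregate is roughly
        // 0.1 percentage points better than target, and a smaller candidate
        // size is acceptable if its FP rate is within ~0.001 above target.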

        // To simplify, we just try a few modified smaller sizes. This also
        // caps how much we vary filter size vs. target, to avoid outlier
        // behavior from excessive variance.
        size_t target_len = target_len_with_metadata - kMetadataLen;
        assert(target_len < target_len_with_metadata);  // check underflow
        for (uint64_t maybe_len_rough :
             {uint64_t{3} * target_len / 4, uint64_t{13} * target_len / 16,
              uint64_t{7} * target_len / 8, uint64_t{15} * target_len / 16}) {
          size_t maybe_len_with_metadata =
              RoundDownUsableSpace(maybe_len_rough + kMetadataLen);
          double maybe_fp_rate =
              EstimatedFpRate(num_entries, maybe_len_with_metadata);
          if (maybe_fp_rate <= for_balance_fp_rate) {
            rv = maybe_len_with_metadata;
            rv_fp_rate = maybe_fp_rate;
            break;
          }
        }
      }

      // Filter blocks are loaded into block cache with their block trailer.
      // We need to make sure that's accounted for in choosing a
      // fragmentation-friendly size.
      const size_t kExtraPadding = kBlockTrailerSize;
      size_t requested = rv + kExtraPadding;

      // Allocate and get usable size
      buf->reset(new char[requested]);
      size_t usable = malloc_usable_size(buf->get());

      if (usable - usable / 4 > requested) {
        // Ratio greater than 4/3 is too much for utilizing, if it's
        // not a buggy or mislinked malloc_usable_size implementation.
        // Non-linearity of FP rates with bits/key means rapidly
        // diminishing returns in overall accuracy for additional
        // storage on disk.
        // Nothing to do, except assert that the result is accurate about
        // the usable size. (Assignment never used.)
        assert(((*buf)[usable - 1] = 'x'));
      } else if (usable > requested) {
        rv = RoundDownUsableSpace(usable - kExtraPadding);
        assert(rv <= usable - kExtraPadding);
        rv_fp_rate = EstimatedFpRate(num_entries, rv);
      } else {
        // Too small means bad malloc_usable_size
        assert(usable == requested);
      }
      memset(buf->get(), 0, rv);

      // Update balance
      int64_t diff = static_cast<int64_t>((rv_fp_rate - target_fp_rate) *
                                          double{0x100000000});
      *aggregate_rounding_balance_ += diff;
    } else {
      buf->reset(new char[rv]());
    }
#else
    (void)num_entries;
    buf->reset(new char[rv]());
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return rv;
  }
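
  // Note: subclasses (e.g. FastLocalBloomBitsBuilder::Finish below) pass
  // their CalculateSpace() result here and then derive final parameters,
  // such as num_probes, from the possibly-rounded size this returns.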

  // A deque avoids unnecessary copying of already-saved values
  // and has near-minimal peak memory use.
  std::deque<uint64_t> hash_entries_;

  // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr,
  // always "round up" like historic behavior.
  std::atomic<int64_t>* aggregate_rounding_balance_;
};

// #################### FastLocalBloom implementation ################## //
// ############## also known as format_version=5 Bloom filter ########## //

// See description in FastLocalBloomImpl
class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder {
 public:
  // Non-null aggregate_rounding_balance implies optimize_filters_for_memory
  explicit FastLocalBloomBitsBuilder(
      const int millibits_per_key,
      std::atomic<int64_t>* aggregate_rounding_balance)
      : XXPH3FilterBitsBuilder(aggregate_rounding_balance),
        millibits_per_key_(millibits_per_key) {
    assert(millibits_per_key >= 1000);
  }

  // No Copy allowed
  FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete;
  void operator=(const FastLocalBloomBitsBuilder&) = delete;

  ~FastLocalBloomBitsBuilder() override {}

  virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
    size_t num_entries = hash_entries_.size();
    size_t len_with_metadata = CalculateSpace(num_entries);

    std::unique_ptr<char[]> mutable_buf;
    len_with_metadata =
        AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf);

    assert(mutable_buf);
    assert(len_with_metadata >= kMetadataLen);

    // Max size supported by implementation
    assert(len_with_metadata <= 0xffffffffU);

    // Compute num_probes after any rounding / adjustments
    int num_probes = GetNumProbes(num_entries, len_with_metadata);

    uint32_t len = static_cast<uint32_t>(len_with_metadata - kMetadataLen);
    if (len > 0) {
      AddAllEntries(mutable_buf.get(), len, num_probes);
    }

    assert(hash_entries_.empty());

    // See BloomFilterPolicy::GetBloomBitsReader re: metadata
    // -1 = Marker for newer Bloom implementations
    mutable_buf[len] = static_cast<char>(-1);
    // 0 = Marker for this sub-implementation
    mutable_buf[len + 1] = static_cast<char>(0);
    // num_probes (and 0 in upper bits for 64-byte block size)
    mutable_buf[len + 2] = static_cast<char>(num_probes);
    // rest of metadata stays zero

    Slice rv(mutable_buf.get(), len_with_metadata);
    *buf = std::move(mutable_buf);
    return rv;
  }

  size_t ApproximateNumEntries(size_t bytes) override {
    size_t bytes_no_meta =
        bytes >= kMetadataLen ? RoundDownUsableSpace(bytes) - kMetadataLen : 0;
    return static_cast<size_t>(uint64_t{8000} * bytes_no_meta /
                               millibits_per_key_);
  }
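
  // Worked example (illustrative): with millibits_per_key_ = 10000
  // (10 bits/key) and bytes = 1285, RoundDownUsableSpace keeps 1280 data
  // bytes (a multiple of 64) plus 5 metadata bytes, so bytes_no_meta = 1280
  // and the estimate is 8000 * 1280 / 10000 = 1024 entries.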

  size_t CalculateSpace(size_t num_entries) override {
    // If not for cache line blocks in the filter, what would the target
    // length in bytes be?
    size_t raw_target_len = static_cast<size_t>(
        (uint64_t{num_entries} * millibits_per_key_ + 7999) / 8000);

    if (raw_target_len >= size_t{0xffffffc0}) {
      // Max supported for this data structure implementation
      raw_target_len = size_t{0xffffffc0};
    }

    // Round up to nearest multiple of 64 (block size). This adjustment is
    // used for target FP rate only so that we don't receive complaints about
    // lower FP rate vs. historic Bloom filter behavior.
    return ((raw_target_len + 63) & ~size_t{63}) + kMetadataLen;
  }
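
  // Worked example (illustrative): 1000 entries at millibits_per_key_ =
  // 10000 gives raw_target_len = (1000 * 10000 + 7999) / 8000 = 1250 bytes,
  // rounded up to 1280 (the next multiple of 64), plus 5 metadata bytes =
  // 1285 bytes total. This is the inverse of the ApproximateNumEntries
  // example above.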

  double EstimatedFpRate(size_t keys, size_t len_with_metadata) override {
    int num_probes = GetNumProbes(keys, len_with_metadata);
    return FastLocalBloomImpl::EstimatedFpRate(
        keys, len_with_metadata - kMetadataLen, num_probes, /*hash bits*/ 64);
  }

 protected:
  size_t RoundDownUsableSpace(size_t available_size) override {
    size_t rv = available_size - kMetadataLen;

    if (rv >= size_t{0xffffffc0}) {
      // Max supported for this data structure implementation
      rv = size_t{0xffffffc0};
    }

    // round down to multiple of 64 (block size)
    rv &= ~size_t{63};

    return rv + kMetadataLen;
  }

 private:
  // Compute num_probes after any rounding / adjustments
  int GetNumProbes(size_t keys, size_t len_with_metadata) {
    uint64_t millibits = uint64_t{len_with_metadata - kMetadataLen} * 8000;
    int actual_millibits_per_key =
        static_cast<int>(millibits / std::max(keys, size_t{1}));
    // BEGIN XXX/TODO(peterd): preserving old/default behavior for now to
    // minimize unit test churn. Remove this some time.
    if (!aggregate_rounding_balance_) {
      actual_millibits_per_key = millibits_per_key_;
    }
    // END XXX/TODO
    return FastLocalBloomImpl::ChooseNumProbes(actual_millibits_per_key);
  }

  void AddAllEntries(char* data, uint32_t len, int num_probes) {
    // Simple version without prefetching:
    //
    // for (auto h : hash_entries_) {
    //   FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len,
    //                               num_probes, data);
    // }
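    //
    // The pipelined version below instead runs PrepareHash (which selects
    // the target cache line and can prefetch it) up to 8 entries ahead of
    // the AddHashPrepared that writes to that line, using a small ring
    // buffer to help hide memory latency.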

    const size_t num_entries = hash_entries_.size();
    constexpr size_t kBufferMask = 7;
    static_assert(((kBufferMask + 1) & kBufferMask) == 0,
                  "Must be power of 2 minus 1");

    std::array<uint32_t, kBufferMask + 1> hashes;
    std::array<uint32_t, kBufferMask + 1> byte_offsets;

    // Prime the buffer
    size_t i = 0;
    for (; i <= kBufferMask && i < num_entries; ++i) {
      uint64_t h = hash_entries_.front();
      hash_entries_.pop_front();
      FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data,
                                      /*out*/ &byte_offsets[i]);
      hashes[i] = Upper32of64(h);
    }

    // Process and buffer
    for (; i < num_entries; ++i) {
      uint32_t& hash_ref = hashes[i & kBufferMask];
      uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask];
      // Process (add)
      FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes,
                                          data + byte_offset_ref);
      // And buffer
      uint64_t h = hash_entries_.front();
      hash_entries_.pop_front();
      FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data,
                                      /*out*/ &byte_offset_ref);
      hash_ref = Upper32of64(h);
    }

    // Finish processing
    for (i = 0; i <= kBufferMask && i < num_entries; ++i) {
      FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes,
                                          data + byte_offsets[i]);
    }
  }

  // Target allocation per added key, in thousandths of a bit.
  int millibits_per_key_;
};

// See description in FastLocalBloomImpl
class FastLocalBloomBitsReader : public FilterBitsReader {
 public:
  FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes)
      : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {}

  // No Copy allowed
  FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete;
  void operator=(const FastLocalBloomBitsReader&) = delete;

  ~FastLocalBloomBitsReader() override {}

  bool MayMatch(const Slice& key) override {
    uint64_t h = GetSliceHash64(key);
    uint32_t byte_offset;
    FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_,
                                    /*out*/ &byte_offset);
    return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_,
                                                    data_ + byte_offset);
  }

  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
    for (int i = 0; i < num_keys; ++i) {
      uint64_t h = GetSliceHash64(*keys[i]);
      FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_,
                                      /*out*/ &byte_offsets[i]);
      hashes[i] = Upper32of64(h);
    }
    for (int i = 0; i < num_keys; ++i) {
      may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared(
          hashes[i], num_probes_, data_ + byte_offsets[i]);
    }
  }

 private:
  const char* data_;
  const int num_probes_;
  const uint32_t len_bytes_;
};

// ##################### Ribbon filter implementation ################### //

// Implements concept RehasherTypesAndSettings in ribbon_impl.h
struct Standard128RibbonRehasherTypesAndSettings {
  // These are schema-critical. Any change almost certainly changes
  // underlying data.
  static constexpr bool kIsFilter = true;
  static constexpr bool kHomogeneous = false;
  static constexpr bool kFirstCoeffAlwaysOne = true;
  static constexpr bool kUseSmash = false;
  using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
  using Hash = uint64_t;
  using Seed = uint32_t;
  // Changing these doesn't necessarily change underlying data,
  // but might affect supported scalability of those dimensions.
  using Index = uint32_t;
  using ResultRow = uint32_t;
  // Save a conditional in Ribbon queries
  static constexpr bool kAllowZeroStarts = false;
};

using Standard128RibbonTypesAndSettings =
    ribbon::StandardRehasherAdapter<Standard128RibbonRehasherTypesAndSettings>;

class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder {
 public:
  explicit Standard128RibbonBitsBuilder(
      double desired_one_in_fp_rate, int bloom_millibits_per_key,
      std::atomic<int64_t>* aggregate_rounding_balance, Logger* info_log)
      : XXPH3FilterBitsBuilder(aggregate_rounding_balance),
        desired_one_in_fp_rate_(desired_one_in_fp_rate),
        info_log_(info_log),
        bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance) {
    assert(desired_one_in_fp_rate >= 1.0);
  }

  // No Copy allowed
  Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete;
  void operator=(const Standard128RibbonBitsBuilder&) = delete;

  ~Standard128RibbonBitsBuilder() override {}

  virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
    if (hash_entries_.size() > kMaxRibbonEntries) {
      ROCKS_LOG_WARN(info_log_, "Too many keys for Ribbon filter: %llu",
                     static_cast<unsigned long long>(hash_entries_.size()));
      SwapEntriesWith(&bloom_fallback_);
      assert(hash_entries_.empty());
      return bloom_fallback_.Finish(buf);
    }
    if (hash_entries_.size() == 0) {
      // Save a conditional in Ribbon queries by using alternate reader
      // for zero entries added.
      return FinishAlwaysFalse(buf);
    }
    uint32_t num_entries = static_cast<uint32_t>(hash_entries_.size());
    uint32_t num_slots;
    size_t len_with_metadata;

    CalculateSpaceAndSlots(num_entries, &len_with_metadata, &num_slots);

    // Bloom fall-back indicator
    if (num_slots == 0) {
      SwapEntriesWith(&bloom_fallback_);
      assert(hash_entries_.empty());
      return bloom_fallback_.Finish(buf);
    }

    uint32_t entropy = 0;
    if (!hash_entries_.empty()) {
      entropy = Lower32of64(hash_entries_.front());
    }

    BandingType banding;
    bool success = banding.ResetAndFindSeedToSolve(
        num_slots, hash_entries_.begin(), hash_entries_.end(),
        /*starting seed*/ entropy & 255, /*seed mask*/ 255);
    if (!success) {
      ROCKS_LOG_WARN(info_log_,
                     "Too many re-seeds (256) for Ribbon filter, %llu / %llu",
                     static_cast<unsigned long long>(hash_entries_.size()),
                     static_cast<unsigned long long>(num_slots));
      SwapEntriesWith(&bloom_fallback_);
      assert(hash_entries_.empty());
      return bloom_fallback_.Finish(buf);
    }
    hash_entries_.clear();

    uint32_t seed = banding.GetOrdinalSeed();
    assert(seed < 256);

    std::unique_ptr<char[]> mutable_buf;
    len_with_metadata =
        AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf);

    SolnType soln(mutable_buf.get(), len_with_metadata);
    soln.BackSubstFrom(banding);
    uint32_t num_blocks = soln.GetNumBlocks();
    // This should be guaranteed:
    // num_entries < 2^30
    // => (overhead_factor < 2.0)
    // num_entries * overhead_factor == num_slots < 2^31
    // => (num_blocks = num_slots / 128)
    // num_blocks < 2^24
    assert(num_blocks < 0x1000000U);

    // See BloomFilterPolicy::GetBloomBitsReader re: metadata
    // -2 = Marker for Standard128 Ribbon
    mutable_buf[len_with_metadata - 5] = static_cast<char>(-2);
    // Hash seed
    mutable_buf[len_with_metadata - 4] = static_cast<char>(seed);
    // Number of blocks, in 24 bits
    // (Along with bytes, we can derive other settings)
    mutable_buf[len_with_metadata - 3] = static_cast<char>(num_blocks & 255);
    mutable_buf[len_with_metadata - 2] =
        static_cast<char>((num_blocks >> 8) & 255);
    mutable_buf[len_with_metadata - 1] =
        static_cast<char>((num_blocks >> 16) & 255);
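
    // Metadata encoding example (illustrative): with seed = 37 and
    // num_blocks = 0x000123, the 5 trailer bytes are, in order:
    // 0xFE (-2 marker), 0x25 (seed), then 0x23, 0x01, 0x00
    // (num_blocks, little-endian in 24 bits).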

    Slice rv(mutable_buf.get(), len_with_metadata);
    *buf = std::move(mutable_buf);
    return rv;
  }

  // Setting num_slots to 0 means "fall back on Bloom filter."
  // And note this implementation does not support num_entries or num_slots
  // beyond uint32_t; see kMaxRibbonEntries.
  void CalculateSpaceAndSlots(size_t num_entries,
                              size_t* target_len_with_metadata,
                              uint32_t* num_slots) {
    if (num_entries > kMaxRibbonEntries) {
      // More entries than supported by this Ribbon
      *num_slots = 0;  // use Bloom
      *target_len_with_metadata = bloom_fallback_.CalculateSpace(num_entries);
      return;
    }
    uint32_t entropy = 0;
    if (!hash_entries_.empty()) {
      entropy = Upper32of64(hash_entries_.front());
    }

    *num_slots = NumEntriesToNumSlots(static_cast<uint32_t>(num_entries));
    *target_len_with_metadata =
        SolnType::GetBytesForOneInFpRate(*num_slots, desired_one_in_fp_rate_,
                                         /*rounding*/ entropy) +
        kMetadataLen;

    // Consider possible Bloom fallback for small filters
    if (*num_slots < 1024) {
      size_t bloom = bloom_fallback_.CalculateSpace(num_entries);
      if (bloom < *target_len_with_metadata) {
        *num_slots = 0;  // use Bloom
        *target_len_with_metadata = bloom;
        return;
      }
    }
  }

  size_t CalculateSpace(size_t num_entries) override {
    if (num_entries == 0) {
      // See FinishAlwaysFalse
      return 0;
    }
    size_t target_len_with_metadata;
    uint32_t num_slots;
    CalculateSpaceAndSlots(num_entries, &target_len_with_metadata, &num_slots);
    (void)num_slots;
    return target_len_with_metadata;
  }

  // This is a somewhat ugly but reasonably fast and reasonably accurate
  // reversal of CalculateSpace.
  size_t ApproximateNumEntries(size_t bytes) override {
    size_t len_no_metadata =
        RoundDownUsableSpace(std::max(bytes, size_t{kMetadataLen})) -
        kMetadataLen;

    if (!(desired_one_in_fp_rate_ > 1.0)) {
      // Effectively asking for 100% FP rate, or NaN etc.
      // Note that NaN is neither < 1.0 nor > 1.0
      return kMaxRibbonEntries;
    }

    // Find a slight under-estimate for actual average bits per slot
    double min_real_bits_per_slot;
    if (desired_one_in_fp_rate_ >= 1.0 + std::numeric_limits<uint32_t>::max()) {
      // Max of 32 solution columns (result bits)
      min_real_bits_per_slot = 32.0;
    } else {
      // Account for mix of b and b+1 solution columns being slightly
      // suboptimal vs. ideal log2(1/fp_rate) bits.
      uint32_t rounded = static_cast<uint32_t>(desired_one_in_fp_rate_);
      int upper_bits_per_key = 1 + FloorLog2(rounded);
      double fp_rate_for_upper = std::pow(2.0, -upper_bits_per_key);
      double portion_lower =
          (1.0 / desired_one_in_fp_rate_ - fp_rate_for_upper) /
          fp_rate_for_upper;
      min_real_bits_per_slot = upper_bits_per_key - portion_lower;
      assert(min_real_bits_per_slot > 0.0);
      assert(min_real_bits_per_slot <= 32.0);
    }
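
    // Worked example (illustrative): for desired_one_in_fp_rate_ = 100
    // (1% FP), upper_bits_per_key = 1 + FloorLog2(100) = 7, so
    // fp_rate_for_upper = 2^-7 = 0.0078125, portion_lower =
    // (0.01 - 0.0078125) / 0.0078125 = 0.28, and min_real_bits_per_slot =
    // 7 - 0.28 = 6.72, a slight under-estimate of the true average, which
    // sits above the information-theoretic log2(100) ~= 6.64 bits.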

    // An overestimate, but this should only be O(1) slots away from truth.
    double max_slots = len_no_metadata * 8.0 / min_real_bits_per_slot;

    // Let's not bother accounting for overflow to Bloom filter
    // (Includes NaN case)
    if (!(max_slots < ConfigHelper::GetNumSlots(kMaxRibbonEntries))) {
      return kMaxRibbonEntries;
    }

    // Set up for short iteration
    uint32_t slots = static_cast<uint32_t>(max_slots);
    slots = SolnType::RoundUpNumSlots(slots);

    // Assert that we have a valid upper bound on slots
    assert(SolnType::GetBytesForOneInFpRate(
               SolnType::RoundUpNumSlots(slots + 1), desired_one_in_fp_rate_,
               /*rounding*/ 0) > len_no_metadata);

    // Iterate up to a few times to rather precisely account for small effects
    for (int i = 0; slots > 0; ++i) {
      size_t reqd_bytes =
          SolnType::GetBytesForOneInFpRate(slots, desired_one_in_fp_rate_,
                                           /*rounding*/ 0);
      if (reqd_bytes <= len_no_metadata) {
        break;  // done
      }
      if (i >= 2) {
        // should have been enough iterations
        assert(false);
        break;
      }
      slots = SolnType::RoundDownNumSlots(slots - 1);
    }

    uint32_t num_entries = ConfigHelper::GetNumToAdd(slots);

    // Consider possible Bloom fallback for small filters
    if (slots < 1024) {
      size_t bloom = bloom_fallback_.ApproximateNumEntries(bytes);
      if (bloom > num_entries) {
        return bloom;
      } else {
        return num_entries;
      }
    } else {
      return std::min(num_entries, kMaxRibbonEntries);
    }
  }

  double EstimatedFpRate(size_t num_entries,
                         size_t len_with_metadata) override {
    if (num_entries > kMaxRibbonEntries) {
      // More entries than supported by this Ribbon
      return bloom_fallback_.EstimatedFpRate(num_entries, len_with_metadata);
    }
    uint32_t num_slots =
        NumEntriesToNumSlots(static_cast<uint32_t>(num_entries));
    SolnType fake_soln(nullptr, len_with_metadata);
    fake_soln.ConfigureForNumSlots(num_slots);
    return fake_soln.ExpectedFpRate();
  }

 protected:
  size_t RoundDownUsableSpace(size_t available_size) override {
    size_t rv = available_size - kMetadataLen;

    // round down to multiple of 16 (segment size)
    rv &= ~size_t{15};

    return rv + kMetadataLen;
  }

 private:
  using TS = Standard128RibbonTypesAndSettings;
  using SolnType = ribbon::SerializableInterleavedSolution<TS>;
  using BandingType = ribbon::StandardBanding<TS>;
  using ConfigHelper = ribbon::BandingConfigHelper1TS<ribbon::kOneIn20, TS>;

  static uint32_t NumEntriesToNumSlots(uint32_t num_entries) {
    uint32_t num_slots1 = ConfigHelper::GetNumSlots(num_entries);
    return SolnType::RoundUpNumSlots(num_slots1);
  }

  // Approximate num_entries to ensure number of bytes fits in 32 bits, which
  // is not an inherent limitation but does ensure somewhat graceful Bloom
  // fallback for crazy high number of entries, since the Bloom implementation
  // does not support number of bytes bigger than fits in 32 bits. This is
  // within an order of magnitude of implementation limit on num_slots
  // fitting in 32 bits, and even closer for num_blocks fitting in 24 bits
  // (for filter metadata).
  static constexpr uint32_t kMaxRibbonEntries = 950000000;  // ~ 1 billion

  // A desired value for 1/fp_rate. For example, 100 -> 1% fp rate.
  double desired_one_in_fp_rate_;

  // For warnings, or can be nullptr
  Logger* info_log_;

  // For falling back on Bloom filter in some exceptional cases and
  // very small filter cases
  FastLocalBloomBitsBuilder bloom_fallback_;
};

// for the linker, at least with DEBUG_LEVEL=2
constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries;

class Standard128RibbonBitsReader : public FilterBitsReader {
 public:
  Standard128RibbonBitsReader(const char* data, size_t len_bytes,
                              uint32_t num_blocks, uint32_t seed)
      : soln_(const_cast<char*>(data), len_bytes) {
    soln_.ConfigureForNumBlocks(num_blocks);
    hasher_.SetOrdinalSeed(seed);
  }

  // No Copy allowed
  Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete;
  void operator=(const Standard128RibbonBitsReader&) = delete;

  ~Standard128RibbonBitsReader() override {}

  bool MayMatch(const Slice& key) override {
    uint64_t h = GetSliceHash64(key);
    return soln_.FilterQuery(h, hasher_);
  }

  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
    struct SavedData {
      uint64_t seeded_hash;
      uint32_t segment_num;
      uint32_t num_columns;
      uint32_t start_bits;
    };
    std::array<SavedData, MultiGetContext::MAX_BATCH_SIZE> saved;
    for (int i = 0; i < num_keys; ++i) {
      ribbon::InterleavedPrepareQuery(
          GetSliceHash64(*keys[i]), hasher_, soln_, &saved[i].seeded_hash,
          &saved[i].segment_num, &saved[i].num_columns, &saved[i].start_bits);
    }
    for (int i = 0; i < num_keys; ++i) {
      may_match[i] = ribbon::InterleavedFilterQuery(
          saved[i].seeded_hash, saved[i].segment_num, saved[i].num_columns,
          saved[i].start_bits, hasher_, soln_);
    }
  }

 private:
  using TS = Standard128RibbonTypesAndSettings;
  ribbon::SerializableInterleavedSolution<TS> soln_;
  ribbon::StandardHasher<TS> hasher_;
};

// ##################### Legacy Bloom implementation ################### //

using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>;

class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
 public:
  explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log);

  // No Copy allowed
  LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete;
  void operator=(const LegacyBloomBitsBuilder&) = delete;

  ~LegacyBloomBitsBuilder() override;

  void AddKey(const Slice& key) override;

  virtual size_t EstimateEntriesAdded() override {
    return hash_entries_.size();
  }

  Slice Finish(std::unique_ptr<const char[]>* buf) override;

  size_t CalculateSpace(size_t num_entries) override {
    uint32_t dont_care1;
    uint32_t dont_care2;
    return CalculateSpace(num_entries, &dont_care1, &dont_care2);
  }

  double EstimatedFpRate(size_t keys, size_t bytes) override {
    return LegacyBloomImpl::EstimatedFpRate(keys, bytes - kMetadataLen,
                                            num_probes_);
  }

  size_t ApproximateNumEntries(size_t bytes) override;

 private:
  int bits_per_key_;
  int num_probes_;
  std::vector<uint32_t> hash_entries_;
  Logger* info_log_;

  // Get total bits, optimized for CPU cache lines
  uint32_t GetTotalBitsForLocality(uint32_t total_bits);

  // Reserve space for new filter
  char* ReserveSpace(size_t num_entries, uint32_t* total_bits,
                     uint32_t* num_lines);

  // Implementation-specific variant of public CalculateSpace
  uint32_t CalculateSpace(size_t num_entries, uint32_t* total_bits,
                          uint32_t* num_lines);

  // Assuming single threaded access to this function.
  void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits);
};

LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key,
                                               Logger* info_log)
    : bits_per_key_(bits_per_key),
      num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)),
      info_log_(info_log) {
  assert(bits_per_key_);
}

LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {}

void LegacyBloomBitsBuilder::AddKey(const Slice& key) {
  uint32_t hash = BloomHash(key);
  if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
    hash_entries_.push_back(hash);
  }
}

Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
  uint32_t total_bits, num_lines;
  size_t num_entries = hash_entries_.size();
  char* data =
      ReserveSpace(static_cast<int>(num_entries), &total_bits, &num_lines);
  assert(data);

  if (total_bits != 0 && num_lines != 0) {
    for (auto h : hash_entries_) {
      AddHash(h, data, num_lines, total_bits);
    }

    // Check for excessive entries for 32-bit hash function
    if (num_entries >= /* minimum of 3 million */ 3000000U) {
      // More specifically, we can detect that the 32-bit hash function
      // is causing significant increase in FP rate by comparing current
      // estimated FP rate to what we would get with a normal number of
      // keys at same memory ratio.
      double est_fp_rate = LegacyBloomImpl::EstimatedFpRate(
          num_entries, total_bits / 8, num_probes_);
      double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate(
          1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_);

      if (est_fp_rate >= 1.50 * vs_fp_rate) {
        // For more details, see
        // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
        ROCKS_LOG_WARN(
            info_log_,
            "Using legacy SST/BBT Bloom filter with excessive key count "
            "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. "
            "Consider using new Bloom with format_version>=5, smaller SST "
            "file size, or partitioned filters.",
            num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate);
      }
    }
  }
  // See BloomFilterPolicy::GetFilterBitsReader for metadata
  data[total_bits / 8] = static_cast<char>(num_probes_);
  EncodeFixed32(data + total_bits / 8 + 1, static_cast<uint32_t>(num_lines));

  const char* const_data = data;
  buf->reset(const_data);
  hash_entries_.clear();

  return Slice(data, total_bits / 8 + kMetadataLen);
}

size_t LegacyBloomBitsBuilder::ApproximateNumEntries(size_t bytes) {
  assert(bits_per_key_);
  assert(bytes > 0);

  uint64_t total_bits_tmp = bytes * 8;
  // total bits, including temporary computations, cannot exceed 2^32
  // for compatibility
  total_bits_tmp = std::min(total_bits_tmp, uint64_t{0xffff0000});

  uint32_t high = static_cast<uint32_t>(total_bits_tmp) /
                      static_cast<uint32_t>(bits_per_key_) +
                  1;
  uint32_t low = 1;
  uint32_t n = high;
  for (; n >= low; n--) {
    if (CalculateSpace(n) <= bytes) {
      break;
    }
  }
  return n;
}

uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
  uint32_t num_lines =
      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);

  // Make num_lines an odd number so that more hash bits are involved
  // in determining which cache line a key maps to.
  if (num_lines % 2 == 0) {
    num_lines++;
  }
  return num_lines * (CACHE_LINE_SIZE * 8);
}
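
// Worked example (illustrative, assuming CACHE_LINE_SIZE = 64, i.e. 512
// bits per line): total_bits = 10000 gives num_lines = (10000 + 511) / 512
// = 20, bumped to 21 because it is even, so the result is 21 * 512 = 10752
// bits.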

uint32_t LegacyBloomBitsBuilder::CalculateSpace(size_t num_entries,
                                                uint32_t* total_bits,
                                                uint32_t* num_lines) {
  assert(bits_per_key_);
  if (num_entries != 0) {
    size_t total_bits_tmp = num_entries * bits_per_key_;
    // total bits, including temporary computations, cannot exceed 2^32
    // for compatibility
    total_bits_tmp = std::min(total_bits_tmp, size_t{0xffff0000});

    *total_bits =
        GetTotalBitsForLocality(static_cast<uint32_t>(total_bits_tmp));
    *num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
    assert(*total_bits > 0 && *total_bits % 8 == 0);
  } else {
    // filter is empty, just leave space for metadata
    *total_bits = 0;
    *num_lines = 0;
  }

  // Reserve space for Filter
  uint32_t sz = *total_bits / 8;
  sz += kMetadataLen;  // 4 bytes for num_lines, 1 byte for num_probes
  return sz;
}

char* LegacyBloomBitsBuilder::ReserveSpace(size_t num_entries,
                                           uint32_t* total_bits,
                                           uint32_t* num_lines) {
  uint32_t sz = CalculateSpace(num_entries, total_bits, num_lines);
  char* data = new char[sz];
  memset(data, 0, sz);
  return data;
}

inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data,
                                            uint32_t num_lines,
                                            uint32_t total_bits) {
#ifdef NDEBUG
  static_cast<void>(total_bits);
#endif
  assert(num_lines > 0 && total_bits > 0);

  LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data,
                           folly::constexpr_log2(CACHE_LINE_SIZE));
}

class LegacyBloomBitsReader : public FilterBitsReader {
 public:
  LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines,
                        uint32_t log2_cache_line_size)
      : data_(data),
        num_probes_(num_probes),
        num_lines_(num_lines),
        log2_cache_line_size_(log2_cache_line_size) {}

  // No Copy allowed
  LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete;
  void operator=(const LegacyBloomBitsReader&) = delete;

  ~LegacyBloomBitsReader() override {}

  // "contents" contains the data built by a preceding call to
  // FilterBitsBuilder::Finish. MayMatch must return true if the key was
  // passed to FilterBitsBuilder::AddKey. This method may return true or false
  // if the key was not on the list, but it should aim to return false with a
  // high probability.
  bool MayMatch(const Slice& key) override {
    uint32_t hash = BloomHash(key);
    uint32_t byte_offset;
    LegacyBloomImpl::PrepareHashMayMatch(
        hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_);
    return LegacyBloomImpl::HashMayMatchPrepared(
        hash, num_probes_, data_ + byte_offset, log2_cache_line_size_);
  }

  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
    for (int i = 0; i < num_keys; ++i) {
      hashes[i] = BloomHash(*keys[i]);
      LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_,
                                           /*out*/ &byte_offsets[i],
                                           log2_cache_line_size_);
    }
    for (int i = 0; i < num_keys; ++i) {
      may_match[i] = LegacyBloomImpl::HashMayMatchPrepared(
          hashes[i], num_probes_, data_ + byte_offsets[i],
          log2_cache_line_size_);
    }
  }

 private:
  const char* data_;
  const int num_probes_;
  const uint32_t num_lines_;
  const uint32_t log2_cache_line_size_;
};

class AlwaysTrueFilter : public FilterBitsReader {
 public:
  bool MayMatch(const Slice&) override { return true; }
  using FilterBitsReader::MayMatch;  // inherit overload
};

class AlwaysFalseFilter : public FilterBitsReader {
 public:
  bool MayMatch(const Slice&) override { return false; }
  using FilterBitsReader::MayMatch;  // inherit overload
};

}  // namespace

const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllFixedImpls = {
    kLegacyBloom,
    kDeprecatedBlock,
    kFastLocalBloom,
    kStandard128Ribbon,
};

const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllUserModes = {
    kDeprecatedBlock,
    kAutoBloom,
    kStandard128Ribbon,
};

BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode)
    : mode_(mode), warned_(false), aggregate_rounding_balance_(0) {
  // Sanitize bits_per_key
  if (bits_per_key < 1.0) {
    bits_per_key = 1.0;
  } else if (!(bits_per_key < 100.0)) {  // including NaN
    bits_per_key = 100.0;
  }

  // Includes a nudge toward rounding up, to ensure on all platforms
  // that doubles specified with three decimal digits after the decimal
  // point are interpreted accurately.
  millibits_per_key_ = static_cast<int>(bits_per_key * 1000.0 + 0.500001);

  // For now configure Ribbon filter to match Bloom FP rate and save
  // memory. (Ribbon bits per key will be ~30% less than Bloom bits per key
  // for same FP rate.)
  desired_one_in_fp_rate_ =
      1.0 / BloomMath::CacheLocalFpRate(
                bits_per_key,
                FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_),
                /*cache_line_bits*/ 512);

  // For better or worse, this is a rounding up of a nudged rounding up,
  // e.g. 7.4999999999999 will round up to 8, but that provides more
  // predictability against small arithmetic errors in floating point.
  whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000;
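
  // Worked example (illustrative): bits_per_key = 9.5 yields
  // millibits_per_key_ = static_cast<int>(9500.500001) = 9500 and
  // whole_bits_per_key_ = (9500 + 500) / 1000 = 10, so the legacy
  // (whole-bit) path rounds 9.5 up to 10 bits per key.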
}

BloomFilterPolicy::~BloomFilterPolicy() {}

const char* BuiltinFilterPolicy::Name() const {
  return "rocksdb.BuiltinBloomFilter";
}

void BloomFilterPolicy::CreateFilter(const Slice* keys, int n,
                                     std::string* dst) const {
  // We should ideally only be using this deprecated interface for
  // appropriately constructed BloomFilterPolicy
  assert(mode_ == kDeprecatedBlock);

  // Compute bloom filter size (in both bits and bytes)
  uint32_t bits = static_cast<uint32_t>(n * whole_bits_per_key_);

  // For small n, we can see a very high false positive rate. Fix it
  // by enforcing a minimum bloom filter length.
  if (bits < 64) bits = 64;

  uint32_t bytes = (bits + 7) / 8;
  bits = bytes * 8;

  int num_probes =
      LegacyNoLocalityBloomImpl::ChooseNumProbes(whole_bits_per_key_);

  const size_t init_size = dst->size();
  dst->resize(init_size + bytes, 0);
  dst->push_back(static_cast<char>(num_probes));  // Remember # of probes
  char* array = &(*dst)[init_size];
  for (int i = 0; i < n; i++) {
    LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes,
                                       array);
  }
}

bool BuiltinFilterPolicy::KeyMayMatch(const Slice& key,
                                      const Slice& bloom_filter) const {
  const size_t len = bloom_filter.size();
  if (len < 2 || len > 0xffffffffU) {
    return false;
  }

  const char* array = bloom_filter.data();
  const uint32_t bits = static_cast<uint32_t>(len - 1) * 8;

  // Use the encoded k so that we can read filters generated by
  // bloom filters created using different parameters.
  const int k = static_cast<uint8_t>(array[len - 1]);
  if (k > 30) {
    // Reserved for potentially new encodings for short bloom filters.
    // Consider it a match.
    return true;
  }
  // NB: using stored k not num_probes for whole_bits_per_key_
  return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k,
                                                 array);
}

FilterBitsBuilder* BuiltinFilterPolicy::GetFilterBitsBuilder() const {
  // This code path should no longer be used, for the built-in
  // BloomFilterPolicy. Internal to RocksDB and outside
  // BloomFilterPolicy, only get a FilterBitsBuilder with
  // BloomFilterPolicy::GetBuilderFromContext(), which will call
  // BloomFilterPolicy::GetBuilderWithContext(). RocksDB users have
  // been warned (HISTORY.md) that they can no longer call this on
  // the built-in BloomFilterPolicy (unlikely).
  assert(false);
  return GetBuilderWithContext(FilterBuildingContext(BlockBasedTableOptions()));
}

FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext(
    const FilterBuildingContext& context) const {
  Mode cur = mode_;
  bool offm = context.table_options.optimize_filters_for_memory;
  // Unusual code construction so that we can have just
  // one exhaustive switch without (risky) recursion
  for (int i = 0; i < 2; ++i) {
    switch (cur) {
      case kAutoBloom:
        if (context.table_options.format_version < 5) {
          cur = kLegacyBloom;
        } else {
          cur = kFastLocalBloom;
        }
        break;
      case kDeprecatedBlock:
        return nullptr;
      case kFastLocalBloom:
        return new FastLocalBloomBitsBuilder(
            millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr);
      case kLegacyBloom:
        if (whole_bits_per_key_ >= 14 && context.info_log &&
            !warned_.load(std::memory_order_relaxed)) {
          warned_ = true;
          const char* adjective;
          if (whole_bits_per_key_ >= 20) {
            adjective = "Dramatic";
          } else {
            adjective = "Significant";
          }
          // For more details, see
          // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
          ROCKS_LOG_WARN(
              context.info_log,
              "Using legacy Bloom filter with high (%d) bits/key. "
              "%s filter space and/or accuracy improvement is available "
              "with format_version>=5.",
              whole_bits_per_key_, adjective);
        }
        return new LegacyBloomBitsBuilder(whole_bits_per_key_,
                                          context.info_log);
      case kStandard128Ribbon:
        return new Standard128RibbonBitsBuilder(
            desired_one_in_fp_rate_, millibits_per_key_,
            offm ? &aggregate_rounding_balance_ : nullptr, context.info_log);
    }
  }
  assert(false);
  return nullptr;  // something legal
}

FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext(
    const FilterBuildingContext& context) {
  if (context.table_options.filter_policy) {
    return context.table_options.filter_policy->GetBuilderWithContext(context);
  } else {
    return nullptr;
  }
}

// Read metadata to determine what kind of FilterBitsReader is needed
// and return a new one.
FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader(
    const Slice& contents) const {
  uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
  if (len_with_meta <= kMetadataLen) {
    // filter is empty or broken. Treat like zero keys added.
    return new AlwaysFalseFilter();
  }

  // Legacy Bloom filter data:
  //             0 +-----------------------------------+
  //               | Raw Bloom filter data             |
  //               | ...                               |
  //           len +-----------------------------------+
  //               | byte for num_probes or            |
  //               |   marker for new implementations  |
  //         len+1 +-----------------------------------+
  //               | four bytes for number of cache    |
  //               |   lines                           |
  // len_with_meta +-----------------------------------+

  int8_t raw_num_probes =
      static_cast<int8_t>(contents.data()[len_with_meta - kMetadataLen]);
  // NB: *num_probes > 30 and < 128 probably have not been used, because of
  // BloomFilterPolicy::initialize, unless directly calling
  // LegacyBloomBitsBuilder as an API, but we are leaving those cases in
  // limbo with LegacyBloomBitsReader for now.

  if (raw_num_probes < 1) {
    // Note: < 0 (or unsigned > 127) indicate special new implementations
    // (or reserved for future use)
    switch (raw_num_probes) {
      case 0:
        // Treat as zero probes (always FP)
        return new AlwaysTrueFilter();
      case -1:
        // Marker for newer Bloom implementations
        return GetBloomBitsReader(contents);
      case -2:
        // Marker for Ribbon implementations
        return GetRibbonBitsReader(contents);
      default:
        // Reserved (treat as zero probes, always FP, for now)
        return new AlwaysTrueFilter();
    }
  }
  // else attempt decode for LegacyBloomBitsReader

  int num_probes = raw_num_probes;
  assert(num_probes >= 1);
  assert(num_probes <= 127);

  uint32_t len = len_with_meta - kMetadataLen;
  assert(len > 0);

  uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4);
  uint32_t log2_cache_line_size;

  if (num_lines * CACHE_LINE_SIZE == len) {
    // Common case
    log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE);
  } else if (num_lines == 0 || len % num_lines != 0) {
    // Invalid (no solution to num_lines * x == len)
    // Treat as zero probes (always FP) for now.
    return new AlwaysTrueFilter();
  } else {
    // Determine the non-native cache line size (from another system)
    log2_cache_line_size = 0;
    while ((num_lines << log2_cache_line_size) < len) {
      ++log2_cache_line_size;
    }
    if ((num_lines << log2_cache_line_size) != len) {
      // Invalid (block size not a power of two)
      // Treat as zero probes (always FP) for now.
      return new AlwaysTrueFilter();
    }
  }
  // if not early return
  return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines,
                                   log2_cache_line_size);
}

FilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader(
    const Slice& contents) const {
  uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
  uint32_t len = len_with_meta - kMetadataLen;

  assert(len > 0);  // precondition

  uint32_t seed = static_cast<uint8_t>(contents.data()[len + 1]);
  uint32_t num_blocks = static_cast<uint8_t>(contents.data()[len + 2]);
  num_blocks |= static_cast<uint8_t>(contents.data()[len + 3]) << 8;
  num_blocks |= static_cast<uint8_t>(contents.data()[len + 4]) << 16;
  if (num_blocks < 2) {
    // Not supported
    // num_blocks == 1 is not used because num_starts == 1 is problematic
    // for the hashing scheme. num_blocks == 0 is unused because there's
    // already a concise encoding of an "always false" filter.
    // Return something safe:
    return new AlwaysTrueFilter();
  }
  return new Standard128RibbonBitsReader(contents.data(), len, num_blocks,
                                         seed);
}

// For newer Bloom filter implementations
FilterBitsReader* BuiltinFilterPolicy::GetBloomBitsReader(
    const Slice& contents) const {
  uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
  uint32_t len = len_with_meta - kMetadataLen;

  assert(len > 0);  // precondition

  // New Bloom filter data:
  //             0 +-----------------------------------+
  //               | Raw Bloom filter data             |
  //               | ...                               |
  //           len +-----------------------------------+
  //               | char{-1} byte -> new Bloom filter |
  //         len+1 +-----------------------------------+
  //               | byte for subimplementation        |
  //               |   0: FastLocalBloom               |
  //               |   other: reserved                 |
  //         len+2 +-----------------------------------+
  //               | byte for block_and_probes         |
  //               |   0 in top 3 bits -> 6 -> 64-byte |
  //               |   reserved:                       |
  //               |   1 in top 3 bits -> 7 -> 128-byte|
  //               |   2 in top 3 bits -> 8 -> 256-byte|
  //               |   ...                             |
  //               |   num_probes in bottom 5 bits,    |
  //               |     except 0 and 31 reserved      |
  //         len+3 +-----------------------------------+
  //               | two bytes reserved                |
  //               |   possibly for hash seed          |
  // len_with_meta +-----------------------------------+

  // Read more metadata (see above)
  char sub_impl_val = contents.data()[len_with_meta - 4];
  char block_and_probes = contents.data()[len_with_meta - 3];
  int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6;

  int num_probes = (block_and_probes & 31);
  if (num_probes < 1 || num_probes > 30) {
    // Reserved / future safe
    return new AlwaysTrueFilter();
  }

  uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2);
  if (rest != 0) {
    // Reserved, possibly for hash seed
    // Future safe
    return new AlwaysTrueFilter();
  }

  if (sub_impl_val == 0) {        // FastLocalBloom
    if (log2_block_bytes == 6) {  // Only block size supported for now
      return new FastLocalBloomBitsReader(contents.data(), num_probes, len);
    }
  }
  // otherwise
  // Reserved / future safe
  return new AlwaysTrueFilter();
}

const FilterPolicy* NewBloomFilterPolicy(double bits_per_key,
                                         bool use_block_based_builder) {
  BloomFilterPolicy::Mode m;
  if (use_block_based_builder) {
    m = BloomFilterPolicy::kDeprecatedBlock;
  } else {
    m = BloomFilterPolicy::kAutoBloom;
  }
  assert(std::find(BloomFilterPolicy::kAllUserModes.begin(),
                   BloomFilterPolicy::kAllUserModes.end(),
                   m) != BloomFilterPolicy::kAllUserModes.end());
  return new BloomFilterPolicy(bits_per_key, m);
}

// Chooses between two filter policies based on LSM level, but
// only for Level and Universal compaction styles. Flush is treated
// as level -1. Policy b is considered fallback / primary policy.
LevelThresholdFilterPolicy::LevelThresholdFilterPolicy(
    std::unique_ptr<const FilterPolicy>&& a,
    std::unique_ptr<const FilterPolicy>&& b, int starting_level_for_b)
    : policy_a_(std::move(a)),
      policy_b_(std::move(b)),
      starting_level_for_b_(starting_level_for_b) {
  // Don't use this wrapper class if you were going to set to -1
  assert(starting_level_for_b_ >= 0);
}

// Deprecated block-based filter only
void LevelThresholdFilterPolicy::CreateFilter(const Slice* keys, int n,
                                              std::string* dst) const {
  policy_b_->CreateFilter(keys, n, dst);
}

FilterBitsBuilder* LevelThresholdFilterPolicy::GetBuilderWithContext(
    const FilterBuildingContext& context) const {
  switch (context.compaction_style) {
    case kCompactionStyleLevel:
    case kCompactionStyleUniversal: {
      int levelish;
      if (context.reason == TableFileCreationReason::kFlush) {
        // Treat flush as level -1
        assert(context.level_at_creation == 0);
        levelish = -1;
      } else if (context.level_at_creation == -1) {
        // Unknown level
        // Policy b considered fallback / primary
        return policy_b_->GetBuilderWithContext(context);
      } else {
        levelish = context.level_at_creation;
      }
      if (levelish >= starting_level_for_b_) {
        return policy_b_->GetBuilderWithContext(context);
      } else {
        return policy_a_->GetBuilderWithContext(context);
      }
    }
    case kCompactionStyleFIFO:
    case kCompactionStyleNone:
      break;
  }
  // Policy b considered fallback / primary
  return policy_b_->GetBuilderWithContext(context);
}

const FilterPolicy* NewRibbonFilterPolicy(double bloom_equivalent_bits_per_key,
                                          int bloom_before_level) {
  std::unique_ptr<const FilterPolicy> ribbon_only{new BloomFilterPolicy(
      bloom_equivalent_bits_per_key, BloomFilterPolicy::kStandard128Ribbon)};
  if (bloom_before_level > -1) {
    // Could also use Bloom policy
    std::unique_ptr<const FilterPolicy> bloom_only{new BloomFilterPolicy(
        bloom_equivalent_bits_per_key, BloomFilterPolicy::kFastLocalBloom)};
    return new LevelThresholdFilterPolicy(
        std::move(bloom_only), std::move(ribbon_only), bloom_before_level);
  } else {
    return ribbon_only.release();
  }
}

FilterBuildingContext::FilterBuildingContext(
    const BlockBasedTableOptions& _table_options)
    : table_options(_table_options) {}

FilterPolicy::~FilterPolicy() {}

Status FilterPolicy::CreateFromString(
    const ConfigOptions& /*options*/, const std::string& value,
    std::shared_ptr<const FilterPolicy>* policy) {
  const std::string kBloomName = "bloomfilter:";
  const std::string kExpRibbonName = "experimental_ribbon:";
  const std::string kRibbonName = "ribbonfilter:";
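  // Recognized formats (illustrative examples): "bloomfilter:10:false" ->
  // NewBloomFilterPolicy(10, false); "experimental_ribbon:10" ->
  // NewExperimentalRibbonFilterPolicy(10); "ribbonfilter:10:2" ->
  // NewRibbonFilterPolicy(10, 2), where the second number
  // (bloom_before_level) is optional and defaults to 0.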
  if (value == kNullptrString || value == "rocksdb.BuiltinBloomFilter") {
    policy->reset();
#ifndef ROCKSDB_LITE
  } else if (value.compare(0, kBloomName.size(), kBloomName) == 0) {
    size_t pos = value.find(':', kBloomName.size());
    if (pos == std::string::npos) {
      return Status::InvalidArgument(
          "Invalid filter policy config, missing bits_per_key");
    } else {
      double bits_per_key = ParseDouble(
          trim(value.substr(kBloomName.size(), pos - kBloomName.size())));
      bool use_block_based_builder =
          ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1)));
      policy->reset(
          NewBloomFilterPolicy(bits_per_key, use_block_based_builder));
    }
  } else if (value.compare(0, kExpRibbonName.size(), kExpRibbonName) == 0) {
    double bloom_equivalent_bits_per_key =
        ParseDouble(trim(value.substr(kExpRibbonName.size())));
    policy->reset(
        NewExperimentalRibbonFilterPolicy(bloom_equivalent_bits_per_key));
  } else if (value.compare(0, kRibbonName.size(), kRibbonName) == 0) {
    size_t pos = value.find(':', kRibbonName.size());
    int bloom_before_level;
    if (pos == std::string::npos) {
      pos = value.size();
      bloom_before_level = 0;
    } else {
      bloom_before_level = ParseInt(trim(value.substr(pos + 1)));
    }
    double bloom_equivalent_bits_per_key =
        ParseDouble(trim(value.substr(kRibbonName.size(), pos)));
    policy->reset(NewRibbonFilterPolicy(bloom_equivalent_bits_per_key,
                                        bloom_before_level));
  } else {
    return Status::NotFound("Invalid filter policy name ", value);
#else
  } else {
    return Status::NotSupported("Cannot load filter policy in LITE mode ",
                                value);
#endif  // ROCKSDB_LITE
  }
  return Status::OK();
}
}  // namespace ROCKSDB_NAMESPACE