1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2012 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #include "table/block_based/block_based_filter_block.h"
11 #include <algorithm>
12 
13 #include "db/dbformat.h"
14 #include "monitoring/perf_context_imp.h"
15 #include "rocksdb/filter_policy.h"
16 #include "table/block_based/block_based_table_reader.h"
17 #include "util/coding.h"
18 #include "util/string_util.h"
19 
20 namespace ROCKSDB_NAMESPACE {
21 
22 namespace {
23 
// Appends one formatted "key: value" record to `*props`.
//
// The key is right-aligned in a 14-character column (keys that are already
// 14 characters or longer are emitted unpadded). The value is emitted in
// chunks of at most 64 characters; every chunk after the first starts on a
// new line indented by 16 spaces. The record always ends with a newline.
void AppendItem(std::string* props, const std::string& key,
                const std::string& value) {
  const size_t kChunkWidth = 64;   // value characters per output line
  const size_t kTabWidth = 2;      // room reserved for the ": " separator
  const size_t kIndentWidth = 16;  // indent of continuation lines
  const size_t kKeyColumn = kIndentWidth - kTabWidth;
  const char kPad = ' ';

  // Break the value into fixed-width chunks. The first chunk is always
  // emitted, even when the value is empty.
  std::string wrapped;
  wrapped.append(value, 0, kChunkWidth);
  for (size_t pos = kChunkWidth; pos < value.size(); pos += kChunkWidth) {
    wrapped.push_back('\n');
    wrapped.append(kIndentWidth, kPad);
    wrapped.append(value, pos, kChunkWidth);
  }

  // Assemble the complete record before touching `*props`.
  std::string line;
  if (key.size() < kKeyColumn) {
    line.assign(kKeyColumn - key.size(), kPad);
  }
  line += key;
  line += ": ";
  line += wrapped;
  line += "\n";

  props->append(line);
}
49 
50 template <class TKey>
AppendItem(std::string * props,const TKey & key,const std::string & value)51 void AppendItem(std::string* props, const TKey& key, const std::string& value) {
52   std::string key_str = ROCKSDB_NAMESPACE::ToString(key);
53   AppendItem(props, key_str, value);
54 }
55 }  // namespace
56 
57 // See doc/table_format.txt for an explanation of the filter block format.
58 
// A new filter is generated for every 2KB (1 << 11 bytes) of data.
// kFilterBaseLg is the base-2 logarithm of that granularity.
static const size_t kFilterBaseLg = 11;
static const size_t kFilterBase = static_cast<size_t>(1) << kFilterBaseLg;
62 
BlockBasedFilterBlockBuilder(const SliceTransform * prefix_extractor,const BlockBasedTableOptions & table_opt)63 BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder(
64     const SliceTransform* prefix_extractor,
65     const BlockBasedTableOptions& table_opt)
66     : policy_(table_opt.filter_policy.get()),
67       prefix_extractor_(prefix_extractor),
68       whole_key_filtering_(table_opt.whole_key_filtering),
69       prev_prefix_start_(0),
70       prev_prefix_size_(0),
71       total_added_in_built_(0) {
72   assert(policy_);
73 }
74 
StartBlock(uint64_t block_offset)75 void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) {
76   uint64_t filter_index = (block_offset / kFilterBase);
77   assert(filter_index >= filter_offsets_.size());
78   while (filter_index > filter_offsets_.size()) {
79     GenerateFilter();
80   }
81 }
82 
EstimateEntriesAdded()83 size_t BlockBasedFilterBlockBuilder::EstimateEntriesAdded() {
84   return total_added_in_built_ + start_.size();
85 }
86 
Add(const Slice & key_without_ts)87 void BlockBasedFilterBlockBuilder::Add(const Slice& key_without_ts) {
88   if (prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) {
89     AddPrefix(key_without_ts);
90   }
91 
92   if (whole_key_filtering_) {
93     AddKey(key_without_ts);
94   }
95 }
96 
97 // Add key to filter if needed
AddKey(const Slice & key)98 inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) {
99   start_.push_back(entries_.size());
100   entries_.append(key.data(), key.size());
101 }
102 
103 // Add prefix to filter if needed
AddPrefix(const Slice & key)104 inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) {
105   // get slice for most recently added entry
106   Slice prev;
107   if (prev_prefix_size_ > 0) {
108     prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_);
109   }
110 
111   Slice prefix = prefix_extractor_->Transform(key);
112   // insert prefix only when it's different from the previous prefix.
113   if (prev.size() == 0 || prefix != prev) {
114     prev_prefix_start_ = entries_.size();
115     prev_prefix_size_ = prefix.size();
116     AddKey(prefix);
117   }
118 }
119 
Finish(const BlockHandle &,Status * status)120 Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/,
121                                            Status* status) {
122   // In this impl we ignore BlockHandle
123   *status = Status::OK();
124 
125   if (!start_.empty()) {
126     GenerateFilter();
127   }
128 
129   // Append array of per-filter offsets
130   const uint32_t array_offset = static_cast<uint32_t>(result_.size());
131   for (size_t i = 0; i < filter_offsets_.size(); i++) {
132     PutFixed32(&result_, filter_offsets_[i]);
133   }
134 
135   PutFixed32(&result_, array_offset);
136   result_.push_back(kFilterBaseLg);  // Save encoding parameter in result
137   return Slice(result_);
138 }
139 
GenerateFilter()140 void BlockBasedFilterBlockBuilder::GenerateFilter() {
141   const size_t num_entries = start_.size();
142   if (num_entries == 0) {
143     // Fast path if there are no keys for this filter
144     filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
145     return;
146   }
147   total_added_in_built_ += num_entries;
148 
149   // Make list of keys from flattened key structure
150   start_.push_back(entries_.size());  // Simplify length computation
151   tmp_entries_.resize(num_entries);
152   for (size_t i = 0; i < num_entries; i++) {
153     const char* base = entries_.data() + start_[i];
154     size_t length = start_[i + 1] - start_[i];
155     tmp_entries_[i] = Slice(base, length);
156   }
157 
158   // Generate filter for current set of keys and append to result_.
159   filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
160   policy_->CreateFilter(&tmp_entries_[0], static_cast<int>(num_entries),
161                         &result_);
162 
163   tmp_entries_.clear();
164   entries_.clear();
165   start_.clear();
166   prev_prefix_start_ = 0;
167   prev_prefix_size_ = 0;
168 }
169 
BlockBasedFilterBlockReader(const BlockBasedTable * t,CachableEntry<BlockContents> && filter_block)170 BlockBasedFilterBlockReader::BlockBasedFilterBlockReader(
171     const BlockBasedTable* t, CachableEntry<BlockContents>&& filter_block)
172     : FilterBlockReaderCommon(t, std::move(filter_block)) {
173   assert(table());
174   assert(table()->get_rep());
175   assert(table()->get_rep()->filter_policy);
176 }
177 
Create(const BlockBasedTable * table,const ReadOptions & ro,FilePrefetchBuffer * prefetch_buffer,bool use_cache,bool prefetch,bool pin,BlockCacheLookupContext * lookup_context)178 std::unique_ptr<FilterBlockReader> BlockBasedFilterBlockReader::Create(
179     const BlockBasedTable* table, const ReadOptions& ro,
180     FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
181     bool pin, BlockCacheLookupContext* lookup_context) {
182   assert(table);
183   assert(table->get_rep());
184   assert(!pin || prefetch);
185 
186   CachableEntry<BlockContents> filter_block;
187   if (prefetch || !use_cache) {
188     const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache,
189                                      nullptr /* get_context */, lookup_context,
190                                      &filter_block);
191     if (!s.ok()) {
192       IGNORE_STATUS_IF_ERROR(s);
193       return std::unique_ptr<FilterBlockReader>();
194     }
195 
196     if (use_cache && !pin) {
197       filter_block.Reset();
198     }
199   }
200 
201   return std::unique_ptr<FilterBlockReader>(
202       new BlockBasedFilterBlockReader(table, std::move(filter_block)));
203 }
204 
KeyMayMatch(const Slice & key,const SliceTransform *,uint64_t block_offset,const bool no_io,const Slice * const,GetContext * get_context,BlockCacheLookupContext * lookup_context)205 bool BlockBasedFilterBlockReader::KeyMayMatch(
206     const Slice& key, const SliceTransform* /* prefix_extractor */,
207     uint64_t block_offset, const bool no_io,
208     const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
209     BlockCacheLookupContext* lookup_context) {
210   assert(block_offset != kNotValid);
211   if (!whole_key_filtering()) {
212     return true;
213   }
214   return MayMatch(key, block_offset, no_io, get_context, lookup_context);
215 }
216 
PrefixMayMatch(const Slice & prefix,const SliceTransform *,uint64_t block_offset,const bool no_io,const Slice * const,GetContext * get_context,BlockCacheLookupContext * lookup_context)217 bool BlockBasedFilterBlockReader::PrefixMayMatch(
218     const Slice& prefix, const SliceTransform* /* prefix_extractor */,
219     uint64_t block_offset, const bool no_io,
220     const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
221     BlockCacheLookupContext* lookup_context) {
222   assert(block_offset != kNotValid);
223   return MayMatch(prefix, block_offset, no_io, get_context, lookup_context);
224 }
225 
ParseFieldsFromBlock(const BlockContents & contents,const char ** data,const char ** offset,size_t * num,size_t * base_lg)226 bool BlockBasedFilterBlockReader::ParseFieldsFromBlock(
227     const BlockContents& contents, const char** data, const char** offset,
228     size_t* num, size_t* base_lg) {
229   assert(data);
230   assert(offset);
231   assert(num);
232   assert(base_lg);
233 
234   const size_t n = contents.data.size();
235   if (n < 5) {  // 1 byte for base_lg and 4 for start of offset array
236     return false;
237   }
238 
239   const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5);
240   if (last_word > n - 5) {
241     return false;
242   }
243 
244   *data = contents.data.data();
245   *offset = (*data) + last_word;
246   *num = (n - 5 - last_word) / 4;
247   *base_lg = contents.data[n - 1];
248 
249   return true;
250 }
251 
MayMatch(const Slice & entry,uint64_t block_offset,bool no_io,GetContext * get_context,BlockCacheLookupContext * lookup_context) const252 bool BlockBasedFilterBlockReader::MayMatch(
253     const Slice& entry, uint64_t block_offset, bool no_io,
254     GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
255   CachableEntry<BlockContents> filter_block;
256 
257   const Status s =
258       GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block);
259   if (!s.ok()) {
260     return true;
261   }
262 
263   assert(filter_block.GetValue());
264 
265   const char* data = nullptr;
266   const char* offset = nullptr;
267   size_t num = 0;
268   size_t base_lg = 0;
269   if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
270                             &base_lg)) {
271     return true;  // Errors are treated as potential matches
272   }
273 
274   const uint64_t index = block_offset >> base_lg;
275   if (index < num) {
276     const uint32_t start = DecodeFixed32(offset + index * 4);
277     const uint32_t limit = DecodeFixed32(offset + index * 4 + 4);
278     if (start <= limit && limit <= (uint32_t)(offset - data)) {
279       const Slice filter = Slice(data + start, limit - start);
280 
281       assert(table());
282       assert(table()->get_rep());
283       const FilterPolicy* const policy = table()->get_rep()->filter_policy;
284 
285       const bool may_match = policy->KeyMayMatch(entry, filter);
286       if (may_match) {
287         PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
288         return true;
289       } else {
290         PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
291         return false;
292       }
293     } else if (start == limit) {
294       // Empty filters do not match any entries
295       return false;
296     }
297   }
298   return true;  // Errors are treated as potential matches
299 }
300 
ApproximateMemoryUsage() const301 size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const {
302   size_t usage = ApproximateFilterBlockMemoryUsage();
303 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
304   usage += malloc_usable_size(const_cast<BlockBasedFilterBlockReader*>(this));
305 #else
306   usage += sizeof(*this);
307 #endif  // ROCKSDB_MALLOC_USABLE_SIZE
308   return usage;
309 }
310 
ToString() const311 std::string BlockBasedFilterBlockReader::ToString() const {
312   CachableEntry<BlockContents> filter_block;
313 
314   const Status s =
315       GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */,
316                            nullptr /* lookup_context */, &filter_block);
317   if (!s.ok()) {
318     return std::string("Unable to retrieve filter block");
319   }
320 
321   assert(filter_block.GetValue());
322 
323   const char* data = nullptr;
324   const char* offset = nullptr;
325   size_t num = 0;
326   size_t base_lg = 0;
327   if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
328                             &base_lg)) {
329     return std::string("Error parsing filter block");
330   }
331 
332   std::string result;
333   result.reserve(1024);
334 
335   std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks");
336   AppendItem(&result, s_fb, ROCKSDB_NAMESPACE::ToString(num));
337   AppendItem(&result, s_bo, s_hd);
338 
339   for (size_t index = 0; index < num; index++) {
340     uint32_t start = DecodeFixed32(offset + index * 4);
341     uint32_t limit = DecodeFixed32(offset + index * 4 + 4);
342 
343     if (start != limit) {
344       result.append(" filter block # " +
345                     ROCKSDB_NAMESPACE::ToString(index + 1) + "\n");
346       Slice filter = Slice(data + start, limit - start);
347       AppendItem(&result, start, filter.ToString(true));
348     }
349   }
350   return result;
351 }
352 
353 }  // namespace ROCKSDB_NAMESPACE
354