//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/block_based_table_reader.h"
#include <algorithm>
#include <array>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"

#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"

#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"

#include "monitoring/perf_context_imp.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/util.h"
#include "util/xxhash.h"

namespace ROCKSDB_NAMESPACE {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

typedef BlockBasedTable::IndexReader IndexReader;

// Based on experiments, a 256 KB readahead size provides the best performance
// for auto readahead. Experiment data is in PR #3282.
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;

BlockBasedTable::~BlockBasedTable() {
  delete rep_;
}

std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);

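// BlocklikeTraits provides a uniform way to construct each cacheable block
// type (BlockContents, ParsedFullFilterBlock, Block, UncompressionDict) from
// raw BlockContents, and to report its number of restart points where that
// concept applies (zero otherwise).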
template <typename TBlocklike>
class BlocklikeTraits;

template <>
class BlocklikeTraits<BlockContents> {
 public:
  static BlockContents* Create(BlockContents&& contents,
                               SequenceNumber /* global_seqno */,
                               size_t /* read_amp_bytes_per_bit */,
                               Statistics* /* statistics */,
                               bool /* using_zstd */,
                               const FilterPolicy* /* filter_policy */) {
    return new BlockContents(std::move(contents));
  }

  static uint32_t GetNumRestarts(const BlockContents& /* contents */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<ParsedFullFilterBlock> {
 public:
  static ParsedFullFilterBlock* Create(BlockContents&& contents,
                                       SequenceNumber /* global_seqno */,
                                       size_t /* read_amp_bytes_per_bit */,
                                       Statistics* /* statistics */,
                                       bool /* using_zstd */,
                                       const FilterPolicy* filter_policy) {
    return new ParsedFullFilterBlock(filter_policy, std::move(contents));
  }

  static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<Block> {
 public:
  static Block* Create(BlockContents&& contents, SequenceNumber global_seqno,
                       size_t read_amp_bytes_per_bit, Statistics* statistics,
                       bool /* using_zstd */,
                       const FilterPolicy* /* filter_policy */) {
    return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
                     statistics);
  }

  static uint32_t GetNumRestarts(const Block& block) {
    return block.NumRestarts();
  }
};

template <>
class BlocklikeTraits<UncompressionDict> {
 public:
  static UncompressionDict* Create(BlockContents&& contents,
                                   SequenceNumber /* global_seqno */,
                                   size_t /* read_amp_bytes_per_bit */,
                                   Statistics* /* statistics */,
                                   bool using_zstd,
                                   const FilterPolicy* /* filter_policy */) {
    return new UncompressionDict(contents.data, std::move(contents.allocation),
                                 using_zstd);
  }

  static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) {
    return 0;
  }
};

namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
template <typename TBlocklike>
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed, BlockType block_type,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
    bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) {
  assert(result);

  BlockContents contents;
  BlockFetcher block_fetcher(
      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
      do_uncompress, maybe_compressed, block_type, uncompression_dict,
      cache_options, memory_allocator, nullptr, for_compaction);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    result->reset(BlocklikeTraits<TBlocklike>::Create(
        std::move(contents), global_seqno, read_amp_bytes_per_bit,
        ioptions.statistics, using_zstd, filter_policy));
  }

  return s;
}

inline MemoryAllocator* GetMemoryAllocator(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache.get()
             ? table_options.block_cache->memory_allocator()
             : nullptr;
}

inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache_compressed.get()
             ? table_options.block_cache_compressed->memory_allocator()
             : nullptr;
}

// Delete the entry residing in the cache.
template <class Entry>
void DeleteCachedEntry(const Slice& /*key*/, void* value) {
  auto entry = reinterpret_cast<Entry*>(value);
  delete entry;
}

// Release the cached entry and decrement its ref count.
void ForceReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, true /* force_erase */);
}

// Release the cached entry and decrement its ref count.
// Do not force erase.
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, false /* force_erase */);
}

// For a hash-based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator.
bool PrefixExtractorChanged(const TableProperties* table_properties,
                            const SliceTransform* prefix_extractor) {
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off hash index if prefix_extractor is not set; if prefix_extractor
  // is set but prefix_extractor_block is not set, also disable hash index.
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
    return true;
  }

  // prefix_extractor and prefix_extractor_block are both non-empty
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
    return true;
  } else {
    return false;
  }
}

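// Copy `buf` into a freshly allocated buffer obtained via AllocateBlock(),
// using `allocator` when one is provided.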
CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
  CacheAllocationPtr heap_buf;
  heap_buf = AllocateBlock(buf.size(), allocator);
  memcpy(heap_buf.get(), buf.data(), buf.size());
  return heap_buf;
}

}  // namespace

// Encapsulates common functionality for the various index reader
// implementations. Provides access to the index block regardless of whether
// it is owned by the reader or stored in the cache, or whether it is pinned
// in the cache or not.
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
 public:
  IndexReaderCommon(const BlockBasedTable* t,
                    CachableEntry<Block>&& index_block)
      : table_(t), index_block_(std::move(index_block)) {
    assert(table_ != nullptr);
  }

 protected:
  static Status ReadIndexBlock(const BlockBasedTable* table,
                               FilePrefetchBuffer* prefetch_buffer,
                               const ReadOptions& read_options, bool use_cache,
                               GetContext* get_context,
                               BlockCacheLookupContext* lookup_context,
                               CachableEntry<Block>* index_block);

  const BlockBasedTable* table() const { return table_; }

  const InternalKeyComparator* internal_comparator() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    return &table_->get_rep()->internal_comparator;
  }

  bool index_has_first_key() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_has_first_key;
  }

  bool index_key_includes_seq() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_key_includes_seq;
  }

  bool index_value_is_full() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_value_is_full;
  }

  bool cache_index_blocks() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->table_options.cache_index_and_filter_blocks;
  }

  Status GetOrReadIndexBlock(bool no_io, GetContext* get_context,
                             BlockCacheLookupContext* lookup_context,
                             CachableEntry<Block>* index_block) const;

  size_t ApproximateIndexBlockMemoryUsage() const {
    assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
    return index_block_.GetOwnValue()
               ? index_block_.GetValue()->ApproximateMemoryUsage()
               : 0;
  }

 private:
  const BlockBasedTable* table_;
  CachableEntry<Block> index_block_;
};

Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) {
  PERF_TIMER_GUARD(read_index_block_nanos);

  assert(table != nullptr);
  assert(index_block != nullptr);
  assert(index_block->IsEmpty());

  const Rep* const rep = table->get_rep();
  assert(rep != nullptr);

  const Status s = table->RetrieveBlock(
      prefetch_buffer, read_options, rep->footer.index_handle(),
      UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
      get_context, lookup_context, /* for_compaction */ false, use_cache);

  return s;
}

Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) const {
  assert(index_block != nullptr);

  if (!index_block_.IsEmpty()) {
    index_block->SetUnownedValue(index_block_.GetValue());
    return Status::OK();
  }

  ReadOptions read_options;
  if (no_io) {
    read_options.read_tier = kBlockCacheTier;
  }

  return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
                        cache_index_blocks(), get_context, lookup_context,
                        index_block);
}

// Index that allows binary search lookup in a two-level index structure.
class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read the partition index from the file and create an instance for
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin,
                       BlockCacheLookupContext* lookup_context,
                       std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    index_reader->reset(
        new PartitionIndexReader(table, std::move(index_block)));

    return Status::OK();
  }

  // Return a two-level iterator: the first level is on the partition index.
  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    InternalIteratorBase<IndexValue>* it = nullptr;

    Statistics* kNullStats = nullptr;
    // Filters are already checked before seeking the index
    if (!partition_map_.empty()) {
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = NewTwoLevelIterator(
          new BlockBasedTable::PartitionedIndexIteratorState(table(),
                                                             &partition_map_),
          index_block.GetValue()->NewIndexIterator(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_has_first_key(),
              index_key_includes_seq(), index_value_is_full()));
    } else {
      ReadOptions ro;
      ro.fill_cache = read_options.fill_cache;
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = new BlockBasedTableIterator<IndexBlockIter, IndexValue>(
          table(), ro, *internal_comparator(),
          index_block.GetValue()->NewIndexIterator(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_has_first_key(),
              index_key_includes_seq(), index_value_is_full()),
          false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
          lookup_context ? lookup_context->caller
                         : TableReaderCaller::kUncategorized);
    }

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;

    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
    // on-stack BlockIter while the state is on heap. Currently it assumes
    // the first level iter is always on heap and will attempt to delete it
    // in its destructor.
  }

  void CacheDependencies(bool pin) override {
    // Before reading the partitions, prefetch them to avoid lots of IOs
    BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
    const BlockBasedTable::Rep* rep = table()->rep_;
    IndexBlockIter biter;
    BlockHandle handle;
    Statistics* kNullStats = nullptr;

    CachableEntry<Block> index_block;
    Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
                                   &lookup_context, &index_block);
    if (!s.ok()) {
      ROCKS_LOG_WARN(rep->ioptions.info_log,
                     "Error retrieving top-level index block while trying to "
                     "cache index partitions: %s",
                     s.ToString().c_str());
      return;
    }

    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), &biter,
        kNullStats, true, index_has_first_key(), index_key_includes_seq(),
        index_value_is_full());
    // Index partitions are assumed to be consecutive. Prefetch them all.
    // Read the first block offset
    biter.SeekToFirst();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value().handle;
    uint64_t prefetch_off = handle.offset();

    // Read the last block's offset
    biter.SeekToLast();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value().handle;
    uint64_t last_off = handle.offset() + block_size(handle);
    uint64_t prefetch_len = last_off - prefetch_off;
    std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
    rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer);
    s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off,
                                  static_cast<size_t>(prefetch_len));

    // After prefetch, read the partitions one by one
    biter.SeekToFirst();
    auto ro = ReadOptions();
    for (; biter.Valid(); biter.Next()) {
      handle = biter.value().handle;
      CachableEntry<Block> block;
      // TODO: Support counter batch update for partitioned index and
      // filter blocks
      s = table()->MaybeReadBlockAndLoadToCache(
          prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
          &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context,
          /*contents=*/nullptr);

      assert(s.ok() || block.GetValue() == nullptr);
      if (s.ok() && block.GetValue() != nullptr) {
        if (block.IsCached()) {
          if (pin) {
            partition_map_[handle.offset()] = std::move(block);
          }
        }
      }
    }
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
    return usage;
  }

 private:
  PartitionIndexReader(const BlockBasedTable* t,
                       CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

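  // Maps the file offset of each index partition to its cached block.
  // Populated by CacheDependencies() when `pin` is true and the partition is
  // resident in the block cache.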
  std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};

// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read index from the file and create an instance for
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin,
                       BlockCacheLookupContext* lookup_context,
                       std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    index_reader->reset(
        new BinarySearchIndexReader(table, std::move(index_block)));

    return Status::OK();
  }

  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, true, index_has_first_key(), index_key_includes_seq(),
        index_value_is_full());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  BinarySearchIndexReader(const BlockBasedTable* t,
                          CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}
};

// Index that leverages an internal hash table to quicken the lookup for a given
// key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer,
                       InternalIterator* meta_index_iter, bool use_cache,
                       bool prefetch, bool pin,
                       BlockCacheLookupContext* lookup_context,
                       std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(index_reader != nullptr);
    assert(!pin || prefetch);

    const BlockBasedTable::Rep* rep = table->get_rep();
    assert(rep != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    // Note, failure to create prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.

    index_reader->reset(new HashIndexReader(table, std::move(index_block)));

    // Get prefixes block
    BlockHandle prefixes_handle;
    Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
                             &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
                      &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    RandomAccessFileReader* const file = rep->file.get();
    const Footer& footer = rep->footer;
    const ImmutableCFOptions& ioptions = rep->ioptions;
    const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
    MemoryAllocator* const memory_allocator =
        GetMemoryAllocator(rep->table_options);

    // Read contents for the blocks
    BlockContents prefixes_contents;
    BlockFetcher prefixes_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
        &prefixes_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexPrefixes,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      return s;
    }
    BlockContents prefixes_meta_contents;
    BlockFetcher prefixes_meta_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
        &prefixes_meta_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_meta_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    BlockPrefixIndex* prefix_index = nullptr;
    assert(rep->internal_prefix_transform.get() != nullptr);
    s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
                                 prefixes_contents.data,
                                 prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      HashIndexReader* const hash_index_reader =
          static_cast<HashIndexReader*>(index_reader->get());
      hash_index_reader->prefix_index_.reset(prefix_index);
    }

    return Status::OK();
  }

  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool disable_prefix_seek,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    Statistics* kNullStats = nullptr;
    const bool total_order_seek =
        read_options.total_order_seek || disable_prefix_seek;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, total_order_seek, index_has_first_key(),
        index_key_includes_seq(), index_value_is_full(),
        false /* block_contents_pinned */, prefix_index_.get());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<HashIndexReader*>(this));
#else
    if (prefix_index_) {
      usage += prefix_index_->ApproximateMemoryUsage();
    }
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

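  // Optional hash index over key prefixes. Remains null if the prefix meta
  // blocks are missing or cannot be parsed; in that case NewIterator() falls
  // back to plain binary search within the index block.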
  std::unique_ptr<BlockPrefixIndex> prefix_index_;
};

void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
                                            GetContext* get_context,
                                            size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  PERF_COUNTER_ADD(block_cache_hit_count, 1);
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_hit;
    get_context->get_context_stats_.num_cache_bytes_read += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_HIT);
    RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
      }
      break;

    case BlockType::kCompressionDictionary:
      // TODO: introduce perf counter for compression dictionary hit count
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
      }
      break;

    case BlockType::kIndex:
      PERF_COUNTER_ADD(block_cache_index_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
                                             GetContext* get_context) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce aggregate (not per-level) block cache miss count
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_miss;
  } else {
    RecordTick(statistics, BLOCK_CACHE_MISS);
  }

  // TODO: introduce perf counters for misses per block type
  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
                                                  GetContext* get_context,
                                                  size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce perf counters for block cache insertions
  if (get_context) {
    ++get_context->get_context_stats_.num_cache_add;
    get_context->get_context_stats_.num_cache_bytes_write += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_ADD);
    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_add;
        get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_add;
        get_context->get_context_stats_
            .num_cache_compression_dict_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
                   usage);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_add;
        get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_add;
        get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
      }
      break;
  }
}

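// Look up `key` in `block_cache` and update the block cache hit/miss metrics
// accordingly. Returns the cache handle, or nullptr on a miss.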
Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, BlockType block_type,
    GetContext* get_context) const {
  auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);

  if (cache_handle != nullptr) {
    UpdateCacheHitMetrics(block_type, get_context,
                          block_cache->GetUsage(cache_handle));
  } else {
    UpdateCacheMissMetrics(block_type, get_context);
  }

  return cache_handle;
}

// Helper function to set up the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
  }
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                        &rep->compressed_cache_key_prefix_size);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

namespace {
// Return true if table_properties has `user_prop_name` set to a `true` value,
// or if it doesn't contain this property (for backward compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older versions don't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
                     user_prop_name.c_str(), pos->second.c_str());
    }
  }
  return true;
}

// Caller has to ensure seqno is not nullptr.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
  const auto& props = table_properties.user_collected_properties;
  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);

  *seqno = kDisableGlobalSequenceNumber;
  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
      std::array<char, 200> msg_buf;
      // This is not an external sst file, global_seqno is not supported.
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "A non-external sst file has a global seqno property with value %s",
          seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
      std::array<char, 200> msg_buf;
      // This is a v1 external sst file, global_seqno is not supported.
      snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u has a global seqno "
               "property with value %s",
               version, seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  // Since we have a plan to deprecate global_seqno, we do not return failure
  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  // SST is external.
  SequenceNumber global_seqno(0);
  if (seqno_pos != props.end()) {
    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  }
  // SstTableReader opens the table reader with kMaxSequenceNumber as
  // largest_seqno to denote that it is unknown.
  if (largest_seqno < kMaxSequenceNumber) {
    if (global_seqno == 0) {
      global_seqno = largest_seqno;
    }
    if (global_seqno != largest_seqno) {
      std::array<char, 200> msg_buf;
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u has a global seqno property "
          "with value %s, while the largest seqno in the file is %llu",
          version, seqno_pos->second.c_str(),
          static_cast<unsigned long long>(largest_seqno));
      return Status::Corruption(msg_buf.data());
    }
  }
  *seqno = global_seqno;

  if (global_seqno > kMaxSequenceNumber) {
    std::array<char, 200> msg_buf;
    snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u has a global seqno property "
             "with value %llu, which is greater than kMaxSequenceNumber",
             version, static_cast<unsigned long long>(global_seqno));
    return Status::Corruption(msg_buf.data());
  }

  return Status::OK();
}
}  // namespace

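// Build a block cache key by appending the varint-encoded block offset to the
// table's cache key prefix.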
Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}

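// Open the table: read the footer and the metadata blocks, construct the Rep,
// and optionally prefetch/pin the index and filter blocks. On success, the
// new table reader is returned through `table_reader`.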
Status BlockBasedTable::Open(
    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
    const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    const SliceTransform* prefix_extractor,
    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
    const int level, const bool immortal_table,
    const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
    BlockCacheTracer* const block_cache_tracer) {
  table_reader->reset();

  Status s;
  Footer footer;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;

  // prefetch both index and filters, down to all partitions
  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  const bool preload_all = !table_options.cache_index_and_filter_blocks;

  if (!ioptions.allow_mmap_reads) {
    s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
                     preload_all, &prefetch_buffer);
  } else {
    // Should not prefetch for mmap mode.
    prefetch_buffer.reset(new FilePrefetchBuffer(
        nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
  }

  // Read in the following order:
  //    1. Footer
  //    2. [metaindex block]
  //    3. [meta block: properties]
  //    4. [meta block: range deletion tombstone]
  //    5. [meta block: compression dictionary]
  //    6. [meta block: index]
  //    7. [meta block: filter]
  s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
                         kBlockBasedTableMagicNumber);
  if (!s.ok()) {
    return s;
  }
  if (!BlockBasedTableSupportedVersion(footer.version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with newer "
        "version of RocksDB?");
  }

  // We've successfully read the footer. We are ready to serve requests.
  // Better not to mutate rep_ after creation, e.g. the internal_prefix_transform
  // raw pointer will be used to create HashIndexReader, whose reset may
  // access a dangling pointer.
  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters, level,
                                      immortal_table);
  rep->file = std::move(file);
  rep->footer = footer;
  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  // We need to wrap data with internal_prefix_transform to make sure it can
  // handle prefix correctly.
  if (prefix_extractor != nullptr) {
    rep->internal_prefix_transform.reset(
        new InternalKeySliceTransform(prefix_extractor));
  }
  SetupCacheKeyPrefix(rep);
  std::unique_ptr<BlockBasedTable> new_table(
      new BlockBasedTable(rep, block_cache_tracer));

  // page cache options
  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             std::string(rep->persistent_cache_key_prefix,
                                         rep->persistent_cache_key_prefix_size),
                             rep->ioptions.statistics);

  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  // handle to null, otherwise it may be seen as uninitialized during the below
  // meta-block reads.
  rep->compression_dict_handle = BlockHandle::NullBlockHandle();

  // Read metaindex
  std::unique_ptr<Block> metaindex;
  std::unique_ptr<InternalIterator> metaindex_iter;
  s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex,
                                    &metaindex_iter);
  if (!s.ok()) {
    return s;
  }

  // Populates table_properties and some fields that depend on it,
  // such as index_type.
  s = new_table->ReadPropertiesBlock(prefetch_buffer.get(),
                                     metaindex_iter.get(), largest_seqno);
  if (!s.ok()) {
    return s;
  }
  s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(),
                                   internal_comparator, &lookup_context);
  if (!s.ok()) {
    return s;
  }
  s = new_table->PrefetchIndexAndFilterBlocks(
      prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
      prefetch_all, table_options, level, &lookup_context);

  if (s.ok()) {
    // Update tail prefetch stats
    assert(prefetch_buffer.get() != nullptr);
    if (tail_prefetch_stats != nullptr) {
      assert(prefetch_buffer->min_offset_read() < file_size);
      tail_prefetch_stats->RecordEffectiveSize(
          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
    }

    *table_reader = std::move(new_table);
  }

  return s;
}

PrefetchTail(RandomAccessFileReader * file,uint64_t file_size,TailPrefetchStats * tail_prefetch_stats,const bool prefetch_all,const bool preload_all,std::unique_ptr<FilePrefetchBuffer> * prefetch_buffer)1252 Status BlockBasedTable::PrefetchTail(
1253     RandomAccessFileReader* file, uint64_t file_size,
1254     TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
1255     const bool preload_all,
1256     std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
1257   size_t tail_prefetch_size = 0;
1258   if (tail_prefetch_stats != nullptr) {
1259     // Multiple threads may get a 0 (no history) when running in parallel,
1260     // but it will get cleared after the first of them finishes.
1261     tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
1262   }
1263   if (tail_prefetch_size == 0) {
1264     // Before reading the footer, read ahead backwards to prefetch data. Do
1265     // more readahead if we're going to read the index/filter.
1266     // TODO: This may incorrectly select small readahead in case partitioned
1267     // index/filter is enabled and top-level partition pinning is enabled.
1268     // That's because we need to issue readahead before we read the properties,
1269     // at which point we don't yet know the index type.
1270     tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
1271   }
1272   size_t prefetch_off;
1273   size_t prefetch_len;
1274   if (file_size < tail_prefetch_size) {
1275     prefetch_off = 0;
1276     prefetch_len = static_cast<size_t>(file_size);
1277   } else {
1278     prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
1279     prefetch_len = tail_prefetch_size;
1280   }
1281   TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
1282                            &tail_prefetch_size);
1283   Status s;
1284   // TODO: this special-case logic should not be needed in the future.
1285   if (!file->use_direct_io()) {
1286     prefetch_buffer->reset(new FilePrefetchBuffer(
1287         nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
1288     s = file->Prefetch(prefetch_off, prefetch_len);
1289   } else {
1290     prefetch_buffer->reset(new FilePrefetchBuffer(
1291         nullptr, 0, 0, true /* enable */, true /* track_min_offset */));
1292     s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
1293   }
1294   return s;
1295 }
1296 
1297 Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
1298                       uint32_t expected) {
1299   Status s;
1300   uint32_t actual = 0;
1301   switch (type) {
1302     case kNoChecksum:
1303       break;
1304     case kCRC32c:
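      // Stored CRC32c values are masked to avoid problems when checksumming
      // data that itself contains checksums; unmask before comparing.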
1305       expected = crc32c::Unmask(expected);
1306       actual = crc32c::Value(buf, len);
1307       break;
1308     case kxxHash:
1309       actual = XXH32(buf, static_cast<int>(len), 0);
1310       break;
1311     case kxxHash64:
1312       actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
1313                                      uint64_t{0xffffffff});
1314       break;
1315     default:
1316       s = Status::Corruption("unknown checksum type");
1317   }
1318   if (s.ok() && actual != expected) {
1319     s = Status::Corruption("properties block checksum mismatched");
1320   }
1321   return s;
1322 }
1323 
1324 Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
1325     FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
1326     TableProperties** table_properties) {
1327   assert(table_properties != nullptr);
1328   // If this is an external SST file ingested with write_global_seqno set to
1329   // true, then we expect the checksum mismatch because checksum was written
1330   // by SstFileWriter, but its global seqno in the properties block may have
1331   // been changed during ingestion. In this case, we read the properties
1332   // block, copy it to a memory buffer, change the global seqno to its
1333   // original value, i.e. 0, and verify the checksum again.
1334   BlockHandle props_block_handle;
1335   CacheAllocationPtr tmp_buf;
1336   Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
1337                             rep_->footer, rep_->ioptions, table_properties,
1338                             false /* verify_checksum */, &props_block_handle,
1339                             &tmp_buf, false /* compression_type_missing */,
1340                             nullptr /* memory_allocator */);
1341   if (s.ok() && tmp_buf) {
1342     const auto seqno_pos_iter =
1343         (*table_properties)
1344             ->properties_offsets.find(
1345                 ExternalSstFilePropertyNames::kGlobalSeqno);
1346     size_t block_size = static_cast<size_t>(props_block_handle.size());
1347     if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
1348       uint64_t global_seqno_offset = seqno_pos_iter->second;
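      // Restore the original global seqno (0) in the in-memory copy before
      // re-verifying the checksum. global_seqno_offset is a file offset, so
      // subtract the properties block offset to index into tmp_buf.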
1349       EncodeFixed64(
1350           tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
1351     }
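    // The block trailer holds a 1-byte compression type followed by a 32-bit
    // checksum computed over the block contents plus that type byte, hence
    // the block_size + 1 offsets below.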
1352     uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
1353     s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(),
1354                                           tmp_buf.get(), block_size + 1, value);
1355   }
1356   return s;
1357 }
1358 
1359 Status BlockBasedTable::ReadPropertiesBlock(
1360     FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1361     const SequenceNumber largest_seqno) {
1362   bool found_properties_block = true;
1363   Status s;
1364   s = SeekToPropertiesBlock(meta_iter, &found_properties_block);
1365 
1366   if (!s.ok()) {
1367     ROCKS_LOG_WARN(rep_->ioptions.info_log,
1368                    "Error when seeking to properties block from file: %s",
1369                    s.ToString().c_str());
1370   } else if (found_properties_block) {
1371     s = meta_iter->status();
1372     TableProperties* table_properties = nullptr;
1373     if (s.ok()) {
1374       s = ReadProperties(
1375           meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
1376           rep_->ioptions, &table_properties, true /* verify_checksum */,
1377           nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
1378           false /* compression_type_missing */, nullptr /* memory_allocator */);
1379     }
1380 
1381     if (s.IsCorruption()) {
1382       s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
1383                                            &table_properties);
1384     }
1385     std::unique_ptr<TableProperties> props_guard;
1386     if (table_properties != nullptr) {
1387       props_guard.reset(table_properties);
1388     }
1389 
1390     if (!s.ok()) {
1391       ROCKS_LOG_WARN(rep_->ioptions.info_log,
1392                      "Encountered error while reading data from properties "
1393                      "block %s",
1394                      s.ToString().c_str());
1395     } else {
1396       assert(table_properties != nullptr);
1397       rep_->table_properties.reset(props_guard.release());
1398       rep_->blocks_maybe_compressed =
1399           rep_->table_properties->compression_name !=
1400           CompressionTypeToString(kNoCompression);
1401       rep_->blocks_definitely_zstd_compressed =
1402           (rep_->table_properties->compression_name ==
1403                CompressionTypeToString(kZSTD) ||
1404            rep_->table_properties->compression_name ==
1405                CompressionTypeToString(kZSTDNotFinalCompression));
1406     }
1407   } else {
1408     ROCKS_LOG_ERROR(rep_->ioptions.info_log,
1409                     "Cannot find Properties block from file.");
1410   }
1411 #ifndef ROCKSDB_LITE
1412   if (rep_->table_properties) {
1413     ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
1414                         &(rep_->table_prefix_extractor));
1415   }
1416 #endif  // ROCKSDB_LITE
1417 
1418   // Read settings from the table properties, if present.
1419   if (rep_->table_properties) {
1420     rep_->whole_key_filtering &=
1421         IsFeatureSupported(*(rep_->table_properties),
1422                            BlockBasedTablePropertyNames::kWholeKeyFiltering,
1423                            rep_->ioptions.info_log);
1424     rep_->prefix_filtering &=
1425         IsFeatureSupported(*(rep_->table_properties),
1426                            BlockBasedTablePropertyNames::kPrefixFiltering,
1427                            rep_->ioptions.info_log);
1428 
1429     rep_->index_key_includes_seq =
1430         rep_->table_properties->index_key_is_user_key == 0;
1431     rep_->index_value_is_full =
1432         rep_->table_properties->index_value_is_delta_encoded == 0;
1433 
1434     // Update index_type with the true type.
1435     // If table properties don't contain index type, we assume that the table
1436     // is in very old format and has kBinarySearch index type.
1437     auto& props = rep_->table_properties->user_collected_properties;
1438     auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
1439     if (pos != props.end()) {
1440       rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
1441           DecodeFixed32(pos->second.c_str()));
1442     }
1443 
1444     rep_->index_has_first_key =
1445         rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
1446 
1447     s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
1448                                 &(rep_->global_seqno));
1449     if (!s.ok()) {
1450       ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
1451     }
1452   }
1453   return s;
1454 }
1455 
1456 Status BlockBasedTable::ReadRangeDelBlock(
1457     FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1458     const InternalKeyComparator& internal_comparator,
1459     BlockCacheLookupContext* lookup_context) {
1460   Status s;
1461   bool found_range_del_block;
1462   BlockHandle range_del_handle;
1463   s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
1464   if (!s.ok()) {
1465     ROCKS_LOG_WARN(
1466         rep_->ioptions.info_log,
1467         "Error when seeking to range delete tombstones block from file: %s",
1468         s.ToString().c_str());
1469   } else if (found_range_del_block && !range_del_handle.IsNull()) {
1470     ReadOptions read_options;
1471     std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
1472         read_options, range_del_handle,
1473         /*input_iter=*/nullptr, BlockType::kRangeDeletion,
1474         /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
1475     assert(iter != nullptr);
1476     s = iter->status();
1477     if (!s.ok()) {
1478       ROCKS_LOG_WARN(
1479           rep_->ioptions.info_log,
1480           "Encountered error while reading data from range del block %s",
1481           s.ToString().c_str());
1482     } else {
1483       rep_->fragmented_range_dels =
1484           std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
1485                                                          internal_comparator);
1486     }
1487   }
1488   return s;
1489 }
1490 
1491 Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
1492     FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1493     BlockBasedTable* new_table, bool prefetch_all,
1494     const BlockBasedTableOptions& table_options, const int level,
1495     BlockCacheLookupContext* lookup_context) {
1496   Status s;
1497 
1498   // Find filter handle and filter type
1499   if (rep_->filter_policy) {
1500     for (auto filter_type :
1501          {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
1502           Rep::FilterType::kBlockFilter}) {
1503       std::string prefix;
1504       switch (filter_type) {
1505         case Rep::FilterType::kFullFilter:
1506           prefix = kFullFilterBlockPrefix;
1507           break;
1508         case Rep::FilterType::kPartitionedFilter:
1509           prefix = kPartitionedFilterBlockPrefix;
1510           break;
1511         case Rep::FilterType::kBlockFilter:
1512           prefix = kFilterBlockPrefix;
1513           break;
1514         default:
1515           assert(0);
1516       }
1517       std::string filter_block_key = prefix;
1518       filter_block_key.append(rep_->filter_policy->Name());
1519       if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
1520               .ok()) {
1521         rep_->filter_type = filter_type;
1522         break;
1523       }
1524     }
1525   }
1526 
1527   // Find compression dictionary handle
1528   bool found_compression_dict = false;
1529   s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
1530                                  &rep_->compression_dict_handle);
1531   if (!s.ok()) {
1532     return s;
1533   }
1534 
1535   BlockBasedTableOptions::IndexType index_type = rep_->index_type;
1536 
1537   const bool use_cache = table_options.cache_index_and_filter_blocks;
1538 
1539   // pin both index and filters, down to all partitions
1540   const bool pin_all =
1541       rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
1542 
1543   // prefetch the first level of index
1544   const bool prefetch_index =
1545       prefetch_all ||
1546       (table_options.pin_top_level_index_and_filter &&
1547        index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
1548   // pin the first level of index
1549   const bool pin_index =
1550       pin_all || (table_options.pin_top_level_index_and_filter &&
1551                   index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
1552 
1553   std::unique_ptr<IndexReader> index_reader;
1554   s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
1555                                    prefetch_index, pin_index, lookup_context,
1556                                    &index_reader);
1557   if (!s.ok()) {
1558     return s;
1559   }
1560 
1561   rep_->index_reader = std::move(index_reader);
1562 
1563   // The partitions of a partitioned index are always stored in the cache.
1564   // They therefore follow the pin and prefetch configuration regardless of
1565   // the value of cache_index_and_filter_blocks.
1566   if (prefetch_all) {
1567     rep_->index_reader->CacheDependencies(pin_all);
1568   }
1569 
1570   // prefetch the first level of filter
1571   const bool prefetch_filter =
1572       prefetch_all ||
1573       (table_options.pin_top_level_index_and_filter &&
1574        rep_->filter_type == Rep::FilterType::kPartitionedFilter);
1575   // Partitioned filters cannot be enabled without partitioned indexes.
1576   assert(!prefetch_filter || prefetch_index);
1577   // pin the first level of filter
1578   const bool pin_filter =
1579       pin_all || (table_options.pin_top_level_index_and_filter &&
1580                   rep_->filter_type == Rep::FilterType::kPartitionedFilter);
1581 
1582   if (rep_->filter_policy) {
1583     auto filter = new_table->CreateFilterBlockReader(
1584         prefetch_buffer, use_cache, prefetch_filter, pin_filter,
1585         lookup_context);
1586     if (filter) {
1587       // Refer to the comment above about partitioned indexes always being cached.
1588       if (prefetch_all) {
1589         filter->CacheDependencies(pin_all);
1590       }
1591 
1592       rep_->filter = std::move(filter);
1593     }
1594   }
1595 
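  // If the file has a compression dictionary block, set up a reader for it so
  // data blocks can later be uncompressed with the dictionary.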
1596   if (!rep_->compression_dict_handle.IsNull()) {
1597     std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
1598     s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
1599                                         prefetch_all, pin_all, lookup_context,
1600                                         &uncompression_dict_reader);
1601     if (!s.ok()) {
1602       return s;
1603     }
1604 
1605     rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
1606   }
1607 
1608   assert(s.ok());
1609   return s;
1610 }
1611 
1612 void BlockBasedTable::SetupForCompaction() {
1613   switch (rep_->ioptions.access_hint_on_compaction_start) {
1614     case Options::NONE:
1615       break;
1616     case Options::NORMAL:
1617       rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
1618       break;
1619     case Options::SEQUENTIAL:
1620       rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
1621       break;
1622     case Options::WILLNEED:
1623       rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
1624       break;
1625     default:
1626       assert(false);
1627   }
1628 }
1629 
1630 std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
1631     const {
1632   return rep_->table_properties;
1633 }
1634 
1635 size_t BlockBasedTable::ApproximateMemoryUsage() const {
1636   size_t usage = 0;
1637   if (rep_->filter) {
1638     usage += rep_->filter->ApproximateMemoryUsage();
1639   }
1640   if (rep_->index_reader) {
1641     usage += rep_->index_reader->ApproximateMemoryUsage();
1642   }
1643   if (rep_->uncompression_dict_reader) {
1644     usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
1645   }
1646   return usage;
1647 }
1648 
1649 // Load the meta-index block from the file. On success, return the loaded
1650 // metaindex block and its iterator.
1652 Status BlockBasedTable::ReadMetaIndexBlock(
1653     FilePrefetchBuffer* prefetch_buffer,
1654     std::unique_ptr<Block>* metaindex_block,
1655     std::unique_ptr<InternalIterator>* iter) {
1656   // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
1657   // it is an empty block.
1658   std::unique_ptr<Block> metaindex;
1659   Status s = ReadBlockFromFile(
1660       rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
1661       rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
1662       true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
1663       UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
1664       kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
1665       GetMemoryAllocator(rep_->table_options), false /* for_compaction */,
1666       rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */);
1667 
1668   if (!s.ok()) {
1669     ROCKS_LOG_ERROR(rep_->ioptions.info_log,
1670                     "Encountered error while reading data from the meta"
1671                     " index block %s",
1672                     s.ToString().c_str());
1673     return s;
1674   }
1675 
1676   *metaindex_block = std::move(metaindex);
1677   // Meta blocks use the bytewise comparator.
1678   iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(),
1679                                                       BytewiseComparator()));
1680   return Status::OK();
1681 }
1682 
1683 template <typename TBlocklike>
1684 Status BlockBasedTable::GetDataBlockFromCache(
1685     const Slice& block_cache_key, const Slice& compressed_block_cache_key,
1686     Cache* block_cache, Cache* block_cache_compressed,
1687     const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
1688     const UncompressionDict& uncompression_dict, BlockType block_type,
1689     GetContext* get_context) const {
1690   const size_t read_amp_bytes_per_bit =
1691       block_type == BlockType::kData
1692           ? rep_->table_options.read_amp_bytes_per_bit
1693           : 0;
1694   assert(block);
1695   assert(block->IsEmpty());
1696 
1697   Status s;
1698   BlockContents* compressed_block = nullptr;
1699   Cache::Handle* block_cache_compressed_handle = nullptr;
1700 
1701   // Look up the uncompressed block cache first.
1702   if (block_cache != nullptr) {
1703     auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
1704                                           block_type, get_context);
1705     if (cache_handle != nullptr) {
1706       block->SetCachedValue(
1707           reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
1708           block_cache, cache_handle);
1709       return s;
1710     }
1711   }
1712 
1713   // If not found there, search the compressed block cache.
1714   assert(block->IsEmpty());
1715 
1716   if (block_cache_compressed == nullptr) {
1717     return s;
1718   }
1719 
1720   assert(!compressed_block_cache_key.empty());
1721   block_cache_compressed_handle =
1722       block_cache_compressed->Lookup(compressed_block_cache_key);
1723 
1724   Statistics* statistics = rep_->ioptions.statistics;
1725 
1726   // If we find it in the compressed cache, uncompress it and insert it into
1727   // the uncompressed cache.
1728   if (block_cache_compressed_handle == nullptr) {
1729     RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
1730     return s;
1731   }
1732 
1733   // found compressed block
1734   RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
1735   compressed_block = reinterpret_cast<BlockContents*>(
1736       block_cache_compressed->Value(block_cache_compressed_handle));
1737   CompressionType compression_type = compressed_block->get_compression_type();
1738   assert(compression_type != kNoCompression);
1739 
1740   // Retrieve the uncompressed contents into a new buffer
1741   BlockContents contents;
1742   UncompressionContext context(compression_type);
1743   UncompressionInfo info(context, uncompression_dict, compression_type);
1744   s = UncompressBlockContents(
1745       info, compressed_block->data.data(), compressed_block->data.size(),
1746       &contents, rep_->table_options.format_version, rep_->ioptions,
1747       GetMemoryAllocator(rep_->table_options));
1748 
1749   // Insert uncompressed block into block cache
1750   if (s.ok()) {
1751     std::unique_ptr<TBlocklike> block_holder(
1752         BlocklikeTraits<TBlocklike>::Create(
1753             std::move(contents), rep_->get_global_seqno(block_type),
1754             read_amp_bytes_per_bit, statistics,
1755             rep_->blocks_definitely_zstd_compressed,
1756             rep_->table_options.filter_policy.get()));  // uncompressed block
1757 
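    // Only insert into the uncompressed block cache if the block owns its
    // memory and the read is allowed to fill the cache; otherwise hand the
    // block directly to the caller.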
1758     if (block_cache != nullptr && block_holder->own_bytes() &&
1759         read_options.fill_cache) {
1760       size_t charge = block_holder->ApproximateMemoryUsage();
1761       Cache::Handle* cache_handle = nullptr;
1762       s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1763                               &DeleteCachedEntry<TBlocklike>, &cache_handle);
1764       if (s.ok()) {
1765         assert(cache_handle != nullptr);
1766         block->SetCachedValue(block_holder.release(), block_cache,
1767                               cache_handle);
1768 
1769         UpdateCacheInsertionMetrics(block_type, get_context, charge);
1770       } else {
1771         RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1772       }
1773     } else {
1774       block->SetOwnedValue(block_holder.release());
1775     }
1776   }
1777 
1778   // Release hold on compressed cache entry
1779   block_cache_compressed->Release(block_cache_compressed_handle);
1780   return s;
1781 }
1782 
1783 template <typename TBlocklike>
1784 Status BlockBasedTable::PutDataBlockToCache(
1785     const Slice& block_cache_key, const Slice& compressed_block_cache_key,
1786     Cache* block_cache, Cache* block_cache_compressed,
1787     CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents,
1788     CompressionType raw_block_comp_type,
1789     const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
1790     MemoryAllocator* memory_allocator, BlockType block_type,
1791     GetContext* get_context) const {
1792   const ImmutableCFOptions& ioptions = rep_->ioptions;
1793   const uint32_t format_version = rep_->table_options.format_version;
1794   const size_t read_amp_bytes_per_bit =
1795       block_type == BlockType::kData
1796           ? rep_->table_options.read_amp_bytes_per_bit
1797           : 0;
1798   const Cache::Priority priority =
1799       rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
1800               (block_type == BlockType::kFilter ||
1801                block_type == BlockType::kCompressionDictionary ||
1802                block_type == BlockType::kIndex)
1803           ? Cache::Priority::HIGH
1804           : Cache::Priority::LOW;
1805   assert(cached_block);
1806   assert(cached_block->IsEmpty());
1807 
1808   Status s;
1809   Statistics* statistics = ioptions.statistics;
1810 
1811   std::unique_ptr<TBlocklike> block_holder;
1812   if (raw_block_comp_type != kNoCompression) {
1813     // Retrieve the uncompressed contents into a new buffer
1814     BlockContents uncompressed_block_contents;
1815     UncompressionContext context(raw_block_comp_type);
1816     UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
1817     s = UncompressBlockContents(info, raw_block_contents->data.data(),
1818                                 raw_block_contents->data.size(),
1819                                 &uncompressed_block_contents, format_version,
1820                                 ioptions, memory_allocator);
1821     if (!s.ok()) {
1822       return s;
1823     }
1824 
1825     block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1826         std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit,
1827         statistics, rep_->blocks_definitely_zstd_compressed,
1828         rep_->table_options.filter_policy.get()));
1829   } else {
1830     block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1831         std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit,
1832         statistics, rep_->blocks_definitely_zstd_compressed,
1833         rep_->table_options.filter_policy.get()));
1834   }
1835 
1836   // Insert compressed block into compressed block cache.
1837   // Release the hold on the compressed cache entry immediately.
1838   if (block_cache_compressed != nullptr &&
1839       raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
1840       raw_block_contents->own_bytes()) {
1841 #ifndef NDEBUG
1842     assert(raw_block_contents->is_raw_block);
1843 #endif  // NDEBUG
1844 
1845     // We cannot insert raw_block_contents directly because it may point to
1846     // an object on the stack.
1847     BlockContents* block_cont_for_comp_cache =
1848         new BlockContents(std::move(*raw_block_contents));
1849     s = block_cache_compressed->Insert(
1850         compressed_block_cache_key, block_cont_for_comp_cache,
1851         block_cont_for_comp_cache->ApproximateMemoryUsage(),
1852         &DeleteCachedEntry<BlockContents>);
1853     if (s.ok()) {
1854       // The compressed cache now owns the block; do not delete it below.
1855       RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
1856     } else {
1857       RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
1858       delete block_cont_for_comp_cache;
1859     }
1860   }
1861 
1862   // insert into uncompressed block cache
1863   if (block_cache != nullptr && block_holder->own_bytes()) {
1864     size_t charge = block_holder->ApproximateMemoryUsage();
1865     Cache::Handle* cache_handle = nullptr;
1866     s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1867                             &DeleteCachedEntry<TBlocklike>, &cache_handle,
1868                             priority);
1869     if (s.ok()) {
1870       assert(cache_handle != nullptr);
1871       cached_block->SetCachedValue(block_holder.release(), block_cache,
1872                                    cache_handle);
1873 
1874       UpdateCacheInsertionMetrics(block_type, get_context, charge);
1875     } else {
1876       RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1877     }
1878   } else {
1879     cached_block->SetOwnedValue(block_holder.release());
1880   }
1881 
1882   return s;
1883 }
1884 
1885 std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
1886     FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
1887     bool pin, BlockCacheLookupContext* lookup_context) {
1888   auto& rep = rep_;
1889   auto filter_type = rep->filter_type;
1890   if (filter_type == Rep::FilterType::kNoFilter) {
1891     return std::unique_ptr<FilterBlockReader>();
1892   }
1893 
1894   assert(rep->filter_policy);
1895 
1896   switch (filter_type) {
1897     case Rep::FilterType::kPartitionedFilter:
1898       return PartitionedFilterBlockReader::Create(
1899           this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1900 
1901     case Rep::FilterType::kBlockFilter:
1902       return BlockBasedFilterBlockReader::Create(
1903           this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1904 
1905     case Rep::FilterType::kFullFilter:
1906       return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
1907                                            prefetch, pin, lookup_context);
1908 
1909     default:
1910       // filter_type is either kNoFilter (exited the function at the first if),
1911       // or it must be covered in this switch block
1912       assert(false);
1913       return std::unique_ptr<FilterBlockReader>();
1914   }
1915 }
1916 
1917 // disable_prefix_seek should be set to true when prefix_extractor found in SST
1918 // differs from the one in mutable_cf_options and index type is HashBasedIndex
1919 InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
1920     const ReadOptions& read_options, bool disable_prefix_seek,
1921     IndexBlockIter* input_iter, GetContext* get_context,
1922     BlockCacheLookupContext* lookup_context) const {
1923   assert(rep_ != nullptr);
1924   assert(rep_->index_reader != nullptr);
1925 
1926   // We don't return pinned data from index blocks, so no need
1927   // to set `block_contents_pinned`.
1928   return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
1929                                          input_iter, get_context,
1930                                          lookup_context);
1931 }
1932 
1933 // Convert an index iterator value (i.e., an encoded BlockHandle)
1934 // into an iterator over the contents of the corresponding block.
1935 // If input_iter is null, allocate and return a new iterator.
1936 // If input_iter is not null, update it in place and return it.
1937 template <typename TBlockIter>
1938 TBlockIter* BlockBasedTable::NewDataBlockIterator(
1939     const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
1940     BlockType block_type, GetContext* get_context,
1941     BlockCacheLookupContext* lookup_context, Status s,
1942     FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
1943   PERF_TIMER_GUARD(new_table_block_iter_nanos);
1944 
1945   TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
1946   if (!s.ok()) {
1947     iter->Invalidate(s);
1948     return iter;
1949   }
1950 
1951   CachableEntry<UncompressionDict> uncompression_dict;
1952   if (rep_->uncompression_dict_reader) {
1953     const bool no_io = (ro.read_tier == kBlockCacheTier);
1954     s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
1955         prefetch_buffer, no_io, get_context, lookup_context,
1956         &uncompression_dict);
1957     if (!s.ok()) {
1958       iter->Invalidate(s);
1959       return iter;
1960     }
1961   }
1962 
1963   const UncompressionDict& dict = uncompression_dict.GetValue()
1964                                       ? *uncompression_dict.GetValue()
1965                                       : UncompressionDict::GetEmptyDict();
1966 
1967   CachableEntry<Block> block;
1968   s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type,
1969                     get_context, lookup_context, for_compaction,
1970                     /* use_cache */ true);
1971 
1972   if (!s.ok()) {
1973     assert(block.IsEmpty());
1974     iter->Invalidate(s);
1975     return iter;
1976   }
1977 
1978   assert(block.GetValue() != nullptr);
1979 
1980   // Block contents are pinned and it is still pinned after the iterator
1981   // is destroyed as long as cleanup functions are moved to another object,
1982   // when:
1983   // 1. block cache handle is set to be released in cleanup function, or
1984   // 2. it's pointing to immortal source. If own_bytes is true then we are
1985   //    not reading data from the original source, whether immortal or not.
1986   //    Otherwise, the block is pinned iff the source is immortal.
1987   const bool block_contents_pinned =
1988       block.IsCached() ||
1989       (!block.GetValue()->own_bytes() && rep_->immortal_table);
1990   iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
1991                                        block_contents_pinned);
1992 
1993   if (!block.IsCached()) {
1994     if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
1995       // insert a dummy record to block cache to track the memory usage
1996       Cache* const block_cache = rep_->table_options.block_cache.get();
1997       Cache::Handle* cache_handle = nullptr;
1998       // There are two other types of cache keys: 1) SST cache key added in
1999       // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
2000       // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
2001       // from SST cache key (31 bytes), and use non-zero prefix to
2002       // differentiate from `write_buffer_manager`
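      // kMaxVarint64Length is 10 bytes, so the prefix below is
      // 10 * 4 + 1 = 41 bytes.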
2003       const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
2004       char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
2005       // Prefix: use rep_->cache_key_prefix padded by 0s
2006       memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
2007       assert(rep_->cache_key_prefix_size != 0);
2008       assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
2009       memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
2010       char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
2011                                  next_cache_key_id_++);
2012       assert(end - cache_key <=
2013              static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
2014       const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
2015       s = block_cache->Insert(unique_key, nullptr,
2016                               block.GetValue()->ApproximateMemoryUsage(),
2017                               nullptr, &cache_handle);
2018 
2019       if (s.ok()) {
2020         assert(cache_handle != nullptr);
2021         iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
2022                               cache_handle);
2023       }
2024     }
2025   } else {
2026     iter->SetCacheHandle(block.GetCacheHandle());
2027   }
2028 
2029   block.TransferTo(iter);
2030 
2031   return iter;
2032 }
2033 
2034 template <>
2035 DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
2036     const Rep* rep, Block* block, DataBlockIter* input_iter,
2037     bool block_contents_pinned) {
2038   return block->NewDataIterator(
2039       &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2040       input_iter, rep->ioptions.statistics, block_contents_pinned);
2041 }
2042 
2043 template <>
2044 IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
2045     const Rep* rep, Block* block, IndexBlockIter* input_iter,
2046     bool block_contents_pinned) {
2047   return block->NewIndexIterator(
2048       &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2049       input_iter, rep->ioptions.statistics, /* total_order_seek */ true,
2050       rep->index_has_first_key, rep->index_key_includes_seq,
2051       rep->index_value_is_full, block_contents_pinned);
2052 }
2053 
2054 // Convert an uncompressed data block (i.e., a CachableEntry<Block>)
2055 // into an iterator over the contents of the corresponding block.
2056 // If input_iter is null, allocate and return a new iterator.
2057 // If input_iter is not null, update it in place and return it.
2058 template <typename TBlockIter>
2059 TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
2060                                                   CachableEntry<Block>& block,
2061                                                   TBlockIter* input_iter,
2062                                                   Status s) const {
2063   PERF_TIMER_GUARD(new_table_block_iter_nanos);
2064 
2065   TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
2066   if (!s.ok()) {
2067     iter->Invalidate(s);
2068     return iter;
2069   }
2070 
2071   assert(block.GetValue() != nullptr);
2072   // Block contents are pinned and it is still pinned after the iterator
2073   // is destroyed as long as cleanup functions are moved to another object,
2074   // when:
2075   // 1. block cache handle is set to be released in cleanup function, or
2076   // 2. it's pointing to immortal source. If own_bytes is true then we are
2077   //    not reading data from the original source, whether immortal or not.
2078   //    Otherwise, the block is pinned iff the source is immortal.
2079   const bool block_contents_pinned =
2080       block.IsCached() ||
2081       (!block.GetValue()->own_bytes() && rep_->immortal_table);
2082   iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
2083                                        block_contents_pinned);
2084 
2085   if (!block.IsCached()) {
2086     if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
2087       // insert a dummy record to block cache to track the memory usage
2088       Cache* const block_cache = rep_->table_options.block_cache.get();
2089       Cache::Handle* cache_handle = nullptr;
2090       // There are two other types of cache keys: 1) SST cache key added in
2091       // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
2092       // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
2093       // from SST cache key (31 bytes), and use non-zero prefix to
2094       // differentiate from `write_buffer_manager`
2095       const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
2096       char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
2097       // Prefix: use rep_->cache_key_prefix padded by 0s
2098       memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
2099       assert(rep_->cache_key_prefix_size != 0);
2100       assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
2101       memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
2102       char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
2103                                  next_cache_key_id_++);
2104       assert(end - cache_key <=
2105              static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
2106       const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
2107       s = block_cache->Insert(unique_key, nullptr,
2108                               block.GetValue()->ApproximateMemoryUsage(),
2109                               nullptr, &cache_handle);
2110       if (s.ok()) {
2111         assert(cache_handle != nullptr);
2112         iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
2113                               cache_handle);
2114       }
2115     }
2116   } else {
2117     iter->SetCacheHandle(block.GetCacheHandle());
2118   }
2119 
2120   block.TransferTo(iter);
2121   return iter;
2122 }
2123 
2124 // If contents is nullptr, this function looks up the block caches for the
2125 // data block referenced by handle, and reads the block from disk if necessary.
2126 // If contents is non-null, it skips the cache lookup and disk read, since
2127 // the caller has already read it. In both cases, if ro.fill_cache is true,
2128 // it inserts the block into the block cache.
2129 template <typename TBlocklike>
2130 Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
2131     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2132     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2133     CachableEntry<TBlocklike>* block_entry, BlockType block_type,
2134     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2135     BlockContents* contents) const {
2136   assert(block_entry != nullptr);
2137   const bool no_io = (ro.read_tier == kBlockCacheTier);
2138   Cache* block_cache = rep_->table_options.block_cache.get();
2139   // No point in caching compressed blocks if the table never goes away
2140   Cache* block_cache_compressed =
2141       rep_->immortal_table ? nullptr
2142                            : rep_->table_options.block_cache_compressed.get();
2143 
2144   // First, try to get the block from the cache
2145   //
2146   // If either block cache is enabled, we'll try to read from it.
2147   Status s;
2148   char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2149   char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2150   Slice key /* key to the block cache */;
2151   Slice ckey /* key to the compressed block cache */;
2152   bool is_cache_hit = false;
2153   if (block_cache != nullptr || block_cache_compressed != nullptr) {
2154     // create key for block cache
2155     if (block_cache != nullptr) {
2156       key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
2157                         handle, cache_key);
2158     }
2159 
2160     if (block_cache_compressed != nullptr) {
2161       ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
2162                          rep_->compressed_cache_key_prefix_size, handle,
2163                          compressed_cache_key);
2164     }
2165 
2166     if (!contents) {
2167       s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
2168                                 ro, block_entry, uncompression_dict, block_type,
2169                                 get_context);
2170       if (block_entry->GetValue()) {
2171         // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
2172         // compressed block cache.
2173         is_cache_hit = true;
2174       }
2175     }
2176 
2177     // Couldn't find the block in the cache. If I/O is allowed, read it from
2178     // the file.
2179     if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
2180       Statistics* statistics = rep_->ioptions.statistics;
2181       const bool maybe_compressed =
2182           block_type != BlockType::kFilter &&
2183           block_type != BlockType::kCompressionDictionary &&
2184           rep_->blocks_maybe_compressed;
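      // If a compressed block cache is configured, keep the raw block
      // compressed so it can be inserted there as-is; otherwise uncompress it
      // as part of the read.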
2185       const bool do_uncompress = maybe_compressed && !block_cache_compressed;
2186       CompressionType raw_block_comp_type;
2187       BlockContents raw_block_contents;
2188       if (!contents) {
2189         StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
2190         BlockFetcher block_fetcher(
2191             rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
2192             &raw_block_contents, rep_->ioptions, do_uncompress,
2193             maybe_compressed, block_type, uncompression_dict,
2194             rep_->persistent_cache_options,
2195             GetMemoryAllocator(rep_->table_options),
2196             GetMemoryAllocatorForCompressedBlock(rep_->table_options));
2197         s = block_fetcher.ReadBlockContents();
2198         raw_block_comp_type = block_fetcher.get_compression_type();
2199         contents = &raw_block_contents;
2200       } else {
2201         raw_block_comp_type = contents->get_compression_type();
2202       }
2203 
2204       if (s.ok()) {
2205         SequenceNumber seq_no = rep_->get_global_seqno(block_type);
2206         // If filling cache is allowed and a cache is configured, try to put the
2207         // block to the cache.
2208         s = PutDataBlockToCache(
2209             key, ckey, block_cache, block_cache_compressed, block_entry,
2210             contents, raw_block_comp_type, uncompression_dict, seq_no,
2211             GetMemoryAllocator(rep_->table_options), block_type, get_context);
2212       }
2213     }
2214   }
2215 
2216   // Fill lookup_context.
2217   if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
2218       lookup_context) {
2219     size_t usage = 0;
2220     uint64_t nkeys = 0;
2221     if (block_entry->GetValue()) {
2222       // Approximate the number of keys in the block using restarts.
2223       nkeys =
2224           rep_->table_options.block_restart_interval *
2225           BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue());
2226       usage = block_entry->GetValue()->ApproximateMemoryUsage();
2227     }
2228     TraceType trace_block_type = TraceType::kTraceMax;
2229     switch (block_type) {
2230       case BlockType::kData:
2231         trace_block_type = TraceType::kBlockTraceDataBlock;
2232         break;
2233       case BlockType::kFilter:
2234         trace_block_type = TraceType::kBlockTraceFilterBlock;
2235         break;
2236       case BlockType::kCompressionDictionary:
2237         trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
2238         break;
2239       case BlockType::kRangeDeletion:
2240         trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
2241         break;
2242       case BlockType::kIndex:
2243         trace_block_type = TraceType::kBlockTraceIndexBlock;
2244         break;
2245       default:
2246         // This cannot happen.
2247         assert(false);
2248         break;
2249     }
2250     bool no_insert = no_io || !ro.fill_cache;
2251     if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
2252             trace_block_type, lookup_context->caller)) {
2253       // Defer logging the access to Get() and MultiGet() to trace additional
2254       // information, e.g., referenced_key_exist_in_block.
2255 
2256       // Make a copy of the block key here since it will be logged later.
2257       lookup_context->FillLookupContext(
2258           is_cache_hit, no_insert, trace_block_type,
2259           /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
2260     } else {
2261       // Avoid making copy of block_key and cf_name when constructing the access
2262       // record.
2263       BlockCacheTraceRecord access_record(
2264           rep_->ioptions.env->NowMicros(),
2265           /*block_key=*/"", trace_block_type,
2266           /*block_size=*/usage, rep_->cf_id_for_tracing(),
2267           /*cf_name=*/"", rep_->level_for_tracing(),
2268           rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
2269           no_insert, lookup_context->get_id,
2270           lookup_context->get_from_user_specified_snapshot,
2271           /*referenced_key=*/"");
2272       block_cache_tracer_->WriteBlockAccess(access_record, key,
2273                                             rep_->cf_name_for_tracing(),
2274                                             lookup_context->referenced_key);
2275     }
2276   }
2277 
2278   assert(s.ok() || block_entry->GetValue() == nullptr);
2279   return s;
2280 }
2281 
2282 // This function reads multiple data blocks from disk using Env::MultiRead()
2283 // and optionally inserts them into the block cache. It uses the scratch
2284 // buffer provided by the caller, which is contiguous. If scratch is nullptr,
2285 // it allocates a separate buffer for each block. Typically, if the blocks
2286 // need to be uncompressed and there is no compressed block cache, callers
2287 // can allocate a temporary scratch buffer in order to minimize memory
2288 // allocations.
2289 // If options.fill_cache is true, it inserts the blocks into cache. If it's
2290 // false and scratch is non-null and the blocks are uncompressed, it copies
2291 // the buffers to the heap. In any case, the CachableEntry<Block> returned
2292 // will own the data bytes.
2293 // If compression is enabled and there is no compressed block cache,
2294 // adjacent blocks are read out in one IO (combined read).
2295 // batch - A MultiGetRange with only those keys with unique data blocks not
2296 //         found in cache
2297 // handles - A vector of block handles. Some of them may be NULL handles.
2298 // scratch - An optional contiguous buffer to read compressed blocks into
2299 void BlockBasedTable::RetrieveMultipleBlocks(
2300     const ReadOptions& options, const MultiGetRange* batch,
2301     const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
2302     autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
2303     autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results,
2304     char* scratch, const UncompressionDict& uncompression_dict) const {
2305   RandomAccessFileReader* file = rep_->file.get();
2306   const Footer& footer = rep_->footer;
2307   const ImmutableCFOptions& ioptions = rep_->ioptions;
2308   SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData);
2309   size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
2310   MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
2311 
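  // Direct I/O and mmap reads do not use the combined MultiRead path below;
  // read each block individually through RetrieveBlock instead.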
2312   if (file->use_direct_io() || ioptions.allow_mmap_reads) {
2313     size_t idx_in_batch = 0;
2314     for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2315          ++mget_iter, ++idx_in_batch) {
2316       BlockCacheLookupContext lookup_data_block_context(
2317           TableReaderCaller::kUserMultiGet);
2318       const BlockHandle& handle = (*handles)[idx_in_batch];
2319       if (handle.IsNull()) {
2320         continue;
2321       }
2322 
2323       (*statuses)[idx_in_batch] =
2324           RetrieveBlock(nullptr, options, handle, uncompression_dict,
2325                         &(*results)[idx_in_batch], BlockType::kData,
2326                         mget_iter->get_context, &lookup_data_block_context,
2327                         /* for_compaction */ false, /* use_cache */ true);
2328     }
2329     return;
2330   }
2331 
2332   autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
2333   size_t buf_offset = 0;
2334   size_t idx_in_batch = 0;
2335 
2336   uint64_t prev_offset = 0;
2337   size_t prev_len = 0;
2338   autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
2339   autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
2340   for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2341        ++mget_iter, ++idx_in_batch) {
2342     const BlockHandle& handle = (*handles)[idx_in_batch];
2343     if (handle.IsNull()) {
2344       continue;
2345     }
2346 
2347     size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
2348 
2349     // If the current block is adjacent to the previous one and, at the same
2350     // time, compression is enabled and there is no compressed cache, combine
2351     // the two block reads into one.
2352     if (scratch != nullptr && prev_end == handle.offset()) {
2353       req_offset_for_block.emplace_back(prev_len);
2354       prev_len += block_size(handle);
2355     } else {
2356       // No compression, or the current block is not adjacent to the previous
2357       // one. Step 1: create a new request for the previous blocks.
2358       if (prev_len != 0) {
2359         FSReadRequest req;
2360         req.offset = prev_offset;
2361         req.len = prev_len;
2362         if (scratch == nullptr) {
2363           req.scratch = new char[req.len];
2364         } else {
2365           req.scratch = scratch + buf_offset;
2366           buf_offset += req.len;
2367         }
2368         req.status = IOStatus::OK();
2369         read_reqs.emplace_back(req);
2370       }
2371 
2372       // Step 2: remember the previous block info.
2373       prev_offset = handle.offset();
2374       prev_len = block_size(handle);
2375       req_offset_for_block.emplace_back(0);
2376     }
2377     req_idx_for_block.emplace_back(read_reqs.size());
2378   }
2379   // Handle the last block and process the pending last request
2380   if (prev_len != 0) {
2381     FSReadRequest req;
2382     req.offset = prev_offset;
2383     req.len = prev_len;
2384     if (scratch == nullptr) {
2385       req.scratch = new char[req.len];
2386     } else {
2387       req.scratch = scratch + buf_offset;
2388     }
2389     req.status = IOStatus::OK();
2390     read_reqs.emplace_back(req);
2391   }
2392 
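  // Issue all accumulated read requests in a single MultiRead call; each
  // FSReadRequest gets its result and status filled in.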
2393   file->MultiRead(&read_reqs[0], read_reqs.size());
2394 
2395   idx_in_batch = 0;
2396   size_t valid_batch_idx = 0;
2397   for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2398        ++mget_iter, ++idx_in_batch) {
2399     const BlockHandle& handle = (*handles)[idx_in_batch];
2400 
2401     if (handle.IsNull()) {
2402       continue;
2403     }
2404 
2405     assert(valid_batch_idx < req_idx_for_block.size());
2406     assert(valid_batch_idx < req_offset_for_block.size());
2407     assert(req_idx_for_block[valid_batch_idx] < read_reqs.size());
2408     size_t& req_idx = req_idx_for_block[valid_batch_idx];
2409     size_t& req_offset = req_offset_for_block[valid_batch_idx];
2410     valid_batch_idx++;
2411     FSReadRequest& req = read_reqs[req_idx];
2412     Status s = req.status;
2413     if (s.ok()) {
2414       if (req.result.size() != req.len) {
2415         s = Status::Corruption(
2416             "truncated block read from " + rep_->file->file_name() +
2417             " offset " + ToString(handle.offset()) + ", expected " +
2418             ToString(req.len) + " bytes, got " + ToString(req.result.size()));
2419       }
2420     }
2421 
2422     BlockContents raw_block_contents;
2423     size_t cur_read_end = req_offset + block_size(handle);
2424     if (cur_read_end > req.result.size()) {
2425       s = Status::Corruption(
2426           "truncated block read from " + rep_->file->file_name() + " offset " +
2427           ToString(handle.offset()) + ", expected " + ToString(req.len) +
2428           " bytes, got " + ToString(req.result.size()));
2429     }
2430 
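    // When reads were combined, a single request covers multiple blocks, so
    // req.result.size() differs from this block's size and the read buffer is
    // shared between blocks.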
2431     bool blocks_share_read_buffer = (req.result.size() != block_size(handle));
2432     if (s.ok()) {
2433       if (scratch == nullptr && !blocks_share_read_buffer) {
2434         // We allocated a buffer for this block. Give ownership of it to
2435         // BlockContents so it can free the memory
2436         assert(req.result.data() == req.scratch);
2437         std::unique_ptr<char[]> raw_block(req.scratch + req_offset);
2438         raw_block_contents = BlockContents(std::move(raw_block), handle.size());
2439       } else {
2440         // We used the scratch buffer, which is shared by the blocks.
2441         // raw_block_contents does not take ownership of it.
2442         raw_block_contents =
2443             BlockContents(Slice(req.scratch + req_offset, handle.size()));
2444       }
2445 
2446 #ifndef NDEBUG
2447       raw_block_contents.is_raw_block = true;
2448 #endif
2449       if (options.verify_checksums) {
2450         PERF_TIMER_GUARD(block_checksum_time);
2451         const char* data = req.result.data();
2452         uint32_t expected =
2453             DecodeFixed32(data + req_offset + handle.size() + 1);
2454         // Since the scratch buffer may be shared, the offset of the data
2455         // block within it might not be 0. req.result.data() only points to
2456         // the start address of each read request, so we need to add the
2457         // block's offset within that request. The checksum is stored in the
2458         // block trailer, at offset handle.size() + 1 from the block start.
2459         s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(),
2460                                               req.result.data() + req_offset,
2461                                               handle.size() + 1, expected);
2462         TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
2463       }
2464     }
2465 
2466     if (s.ok()) {
2467       // This handles a rare case: compression is enabled and there is no
2468       // compressed block cache (so combined reads are used and scratch !=
2469       // nullptr), yet some blocks are actually not compressed because their
2470       // compression space savings fell below the threshold. In this case, if
2471       // the block shares the scratch memory, we need to copy it to the heap
2472       // so that it can be added to the regular block cache.
2473       CompressionType compression_type =
2474           raw_block_contents.get_compression_type();
2475       if (scratch != nullptr && compression_type == kNoCompression) {
2476         Slice raw = Slice(req.scratch + req_offset, block_size(handle));
2477         raw_block_contents = BlockContents(
2478             CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw),
2479             handle.size());
2480 #ifndef NDEBUG
2481         raw_block_contents.is_raw_block = true;
2482 #endif
2483       }
2484     }
2485 
2486     if (s.ok()) {
2487       if (options.fill_cache) {
2488         BlockCacheLookupContext lookup_data_block_context(
2489             TableReaderCaller::kUserMultiGet);
2490         CachableEntry<Block>* block_entry = &(*results)[idx_in_batch];
2491         // MaybeReadBlockAndLoadToCache will insert into the block caches if
2492         // necessary. Since we're passing the raw block contents, it will
2493         // avoid looking up the block cache
2494         s = MaybeReadBlockAndLoadToCache(
2495             nullptr, options, handle, uncompression_dict, block_entry,
2496             BlockType::kData, mget_iter->get_context,
2497             &lookup_data_block_context, &raw_block_contents);
2498 
2499         // block_entry's value could be null if no block cache is present,
2500         // i.e. BlockBasedTableOptions::no_block_cache is true and no
2501         // compressed block cache is configured. In that case, fall through
2502         // and set up the block explicitly.
2503         if (block_entry->GetValue() != nullptr) {
2504           continue;
2505         }
2506       }
2507 
2508       CompressionType compression_type =
2509           raw_block_contents.get_compression_type();
2510       BlockContents contents;
2511       if (compression_type != kNoCompression) {
2512         UncompressionContext context(compression_type);
2513         UncompressionInfo info(context, uncompression_dict, compression_type);
2514         s = UncompressBlockContents(info, req.result.data() + req_offset,
2515                                     handle.size(), &contents, footer.version(),
2516                                     rep_->ioptions, memory_allocator);
2517       } else {
2518         // There are two cases here: 1) the caller uses the scratch buffer;
2519         // 2) we use the request buffer. If the scratch buffer is used, we
2520         // ensure above that all raw blocks are copied to the heap as
2521         // individual blocks. If the scratch buffer is not used, there is no
2522         // combined read either, so the raw block can be used directly.
2523         contents = std::move(raw_block_contents);
2524       }
2525       if (s.ok()) {
2526         (*results)[idx_in_batch].SetOwnedValue(
2527             new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
2528                       ioptions.statistics));
2529       }
2530     }
2531     (*statuses)[idx_in_batch] = s;
2532   }
2533 }
2534 
2535 template <typename TBlocklike>
2536 Status BlockBasedTable::RetrieveBlock(
2537     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2538     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2539     CachableEntry<TBlocklike>* block_entry, BlockType block_type,
2540     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2541     bool for_compaction, bool use_cache) const {
2542   assert(block_entry);
2543   assert(block_entry->IsEmpty());
2544 
2545   Status s;
2546   if (use_cache) {
2547     s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
2548                                      uncompression_dict, block_entry,
2549                                      block_type, get_context, lookup_context,
2550                                      /*contents=*/nullptr);
2551 
2552     if (!s.ok()) {
2553       return s;
2554     }
2555 
2556     if (block_entry->GetValue() != nullptr) {
2557       assert(s.ok());
2558       return s;
2559     }
2560   }
2561 
2562   assert(block_entry->IsEmpty());
2563 
2564   const bool no_io = ro.read_tier == kBlockCacheTier;
2565   if (no_io) {
2566     return Status::Incomplete("no blocking io");
2567   }
2568 
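  // Filter and compression dictionary blocks are never compressed, so
  // decompression only needs to be considered for the remaining block types.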
2569   const bool maybe_compressed =
2570       block_type != BlockType::kFilter &&
2571       block_type != BlockType::kCompressionDictionary &&
2572       rep_->blocks_maybe_compressed;
2573   const bool do_uncompress = maybe_compressed;
2574   std::unique_ptr<TBlocklike> block;
2575 
2576   {
2577     StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
2578                  READ_BLOCK_GET_MICROS);
2579     s = ReadBlockFromFile(
2580         rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
2581         rep_->ioptions, do_uncompress, maybe_compressed, block_type,
2582         uncompression_dict, rep_->persistent_cache_options,
2583         rep_->get_global_seqno(block_type),
2584         block_type == BlockType::kData
2585             ? rep_->table_options.read_amp_bytes_per_bit
2586             : 0,
2587         GetMemoryAllocator(rep_->table_options), for_compaction,
2588         rep_->blocks_definitely_zstd_compressed,
2589         rep_->table_options.filter_policy.get());
2590   }
2591 
2592   if (!s.ok()) {
2593     return s;
2594   }
2595 
2596   block_entry->SetOwnedValue(block.release());
2597 
2598   assert(s.ok());
2599   return s;
2600 }
2601 
2602 // Explicitly instantiate templates for both "blocklike" types we use.
2603 // This makes it possible to keep the template definitions in the .cc file.
2604 template Status BlockBasedTable::RetrieveBlock<BlockContents>(
2605     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2606     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2607     CachableEntry<BlockContents>* block_entry, BlockType block_type,
2608     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2609     bool for_compaction, bool use_cache) const;
2610 
2611 template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
2612     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2613     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2614     CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type,
2615     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2616     bool for_compaction, bool use_cache) const;
2617 
2618 template Status BlockBasedTable::RetrieveBlock<Block>(
2619     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2620     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2621     CachableEntry<Block>* block_entry, BlockType block_type,
2622     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2623     bool for_compaction, bool use_cache) const;
2624 
2625 template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
2626     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2627     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2628     CachableEntry<UncompressionDict>* block_entry, BlockType block_type,
2629     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2630     bool for_compaction, bool use_cache) const;
2631 
2632 BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
2633     const BlockBasedTable* table,
2634     std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
2635     : table_(table), block_map_(block_map) {}
2636 
2637 InternalIteratorBase<IndexValue>*
2638 BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
2639     const BlockHandle& handle) {
2640   // Return a block iterator on the index partition
2641   auto block = block_map_->find(handle.offset());
2642   // Failing to find the partition here is a possible scenario, since the
2643   // block cache might not have had space for it.
2644   if (block != block_map_->end()) {
2645     const Rep* rep = table_->get_rep();
2646     assert(rep);
2647 
2648     Statistics* kNullStats = nullptr;
2649     // We don't return pinned data from index blocks, so no need
2650     // to set `block_contents_pinned`.
2651     return block->second.GetValue()->NewIndexIterator(
2652         &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2653         nullptr, kNullStats, true, rep->index_has_first_key,
2654         rep->index_key_includes_seq, rep->index_value_is_full);
2655   }
2656   // Create an empty iterator
2657   return new IndexBlockIter();
2658 }
2659 
2660 // This will be broken if the user specifies an unusual implementation
2661 // of Options.comparator, or if the user specifies an unusual
2662 // definition of prefixes in BlockBasedTableOptions.filter_policy.
2663 // In particular, we require the following three properties:
2664 //
2665 // 1) key.starts_with(prefix(key))
2666 // 2) Compare(prefix(key), key) <= 0.
2667 // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
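// (For example, a fixed-length prefix extractor combined with the default
// bytewise comparator satisfies all three properties.)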
2668 //
2669 // Otherwise, this method guarantees no I/O will be incurred.
2670 //
2671 // REQUIRES: this method shouldn't be called while the DB lock is held.
2672 bool BlockBasedTable::PrefixMayMatch(
2673     const Slice& internal_key, const ReadOptions& read_options,
2674     const SliceTransform* options_prefix_extractor,
2675     const bool need_upper_bound_check,
2676     BlockCacheLookupContext* lookup_context) const {
2677   if (!rep_->filter_policy) {
2678     return true;
2679   }
2680 
2681   const SliceTransform* prefix_extractor;
2682 
2683   if (rep_->table_prefix_extractor == nullptr) {
2684     if (need_upper_bound_check) {
2685       return true;
2686     }
2687     prefix_extractor = options_prefix_extractor;
2688   } else {
2689     prefix_extractor = rep_->table_prefix_extractor.get();
2690   }
2691   auto user_key = ExtractUserKey(internal_key);
2692   if (!prefix_extractor->InDomain(user_key)) {
2693     return true;
2694   }
2695 
2696   bool may_match = true;
2697   Status s;
2698 
2699   // First, try to check with the full filter
2700   FilterBlockReader* const filter = rep_->filter.get();
2701   bool filter_checked = true;
2702   if (filter != nullptr) {
2703     if (!filter->IsBlockBased()) {
2704       const Slice* const const_ikey_ptr = &internal_key;
2705       may_match = filter->RangeMayExist(
2706           read_options.iterate_upper_bound, user_key, prefix_extractor,
2707           rep_->internal_comparator.user_comparator(), const_ikey_ptr,
2708           &filter_checked, need_upper_bound_check, lookup_context);
2709     } else {
2710       // if prefix_extractor changed for block based filter, skip filter
2711       if (need_upper_bound_check) {
2712         return true;
2713       }
2714       auto prefix = prefix_extractor->Transform(user_key);
2715       InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
2716       auto internal_prefix = internal_key_prefix.Encode();
2717 
2718       // To prevent any io operation in this method, we set `read_tier` to make
2719       // sure we always read index or filter only when they have already been
2720       // loaded to memory.
2721       ReadOptions no_io_read_options;
2722       no_io_read_options.read_tier = kBlockCacheTier;
2723 
2724       // Then, try to find it within each block.
2725       // We already know prefix_extractor and prefix_extractor_name must match
2726       // because `CheckPrefixMayMatch` first checks `check_filter_ == true`.
2727       std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
2728           no_io_read_options,
2729           /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
2730           /*get_context=*/nullptr, lookup_context));
2731       iiter->Seek(internal_prefix);
2732 
2733       if (!iiter->Valid()) {
2734         // we're past end of file
2735         // if it's incomplete, it means that we avoided I/O
2736         // and we're not really sure that we're past the end
2737         // of the file
2738         may_match = iiter->status().IsIncomplete();
2739       } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key())
2740                                                : iiter->key())
2741                      .starts_with(ExtractUserKey(internal_prefix))) {
2742         // we need to check for this subtle case because our only
2743         // guarantee is that "the key is a string >= last key in that data
2744         // block" according to the doc/table_format.txt spec.
2745         //
2746         // Suppose iiter->key() starts with the desired prefix; it is not
2747         // necessarily the case that the corresponding data block will
2748         // contain the prefix, since iiter->key() need not be in the
2749         // block.  However, the next data block may contain the prefix, so
2750         // we return true to play it safe.
2751         may_match = true;
2752       } else if (filter->IsBlockBased()) {
2753         // iiter->key() does NOT start with the desired prefix.  Because
2754         // Seek() finds the first key that is >= the seek target, this
2755         // means that iiter->key() > prefix.  Thus, any data blocks coming
2756         // after the data block corresponding to iiter->key() cannot
2757         // possibly contain the key.  Thus, the corresponding data block
2758         // is the only one that could potentially contain the prefix.
2759         BlockHandle handle = iiter->value().handle;
2760         may_match = filter->PrefixMayMatch(
2761             prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
2762             /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
2763       }
2764     }
2765   }
2766 
2767   if (filter_checked) {
2768     Statistics* statistics = rep_->ioptions.statistics;
2769     RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
2770     if (!may_match) {
2771       RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
2772     }
2773   }
2774 
2775   return may_match;
2776 }
2777 
2778 template <class TBlockIter, typename TValue>
2779 void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
2780   SeekImpl(&target);
2781 }
2782 
2783 template <class TBlockIter, typename TValue>
2784 void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
2785   SeekImpl(nullptr);
2786 }
2787 
2788 template <class TBlockIter, typename TValue>
2789 void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
2790     const Slice* target) {
2791   is_out_of_bound_ = false;
2792   is_at_first_key_from_index_ = false;
2793   if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
2794     ResetDataIter();
2795     return;
2796   }
2797 
2798   bool need_seek_index = true;
2799   if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
2800     // Reseek.
2801     prev_block_offset_ = index_iter_->value().handle.offset();
2802 
2803     if (target) {
2804       // We can avoid an index seek if:
2805       // 1. The new seek key is larger than the current key
2806       // 2. The new seek key is within the upper bound of the block
2807       // Since we don't necessarily know the internal key for either
2808       // the current key or the upper bound, we check user keys and
2809       // exclude the equality case. Considering internal keys can
2810       // improve for the boundary cases, but it would complicate the
2811       // code.
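      // Illustrative example: if the current data block spans user keys
      // ["a", "m") and the index key above it is "m", a reseek to a user key
      // that is greater than the current iterator key and less than "m" can
      // reuse the current block without another index seek.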
2812       if (user_comparator_.Compare(ExtractUserKey(*target),
2813                                    block_iter_.user_key()) > 0 &&
2814           user_comparator_.Compare(ExtractUserKey(*target),
2815                                    index_iter_->user_key()) < 0) {
2816         need_seek_index = false;
2817       }
2818     }
2819   }
2820 
2821   if (need_seek_index) {
2822     if (target) {
2823       index_iter_->Seek(*target);
2824     } else {
2825       index_iter_->SeekToFirst();
2826     }
2827 
2828     if (!index_iter_->Valid()) {
2829       ResetDataIter();
2830       return;
2831     }
2832   }
2833 
2834   IndexValue v = index_iter_->value();
2835   const bool same_block = block_iter_points_to_real_block_ &&
2836                           v.handle.offset() == prev_block_offset_;
2837 
2838   // TODO(kolmike): Remove the != kBlockCacheTier condition.
2839   if (!v.first_internal_key.empty() && !same_block &&
2840       (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
2841       read_options_.read_tier != kBlockCacheTier) {
2842     // Index contains the first key of the block, and it's >= target.
2843     // We can defer reading the block.
2844     is_at_first_key_from_index_ = true;
2845     // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
2846     // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
2847     // as that will be done later when the data block is actually read.
2848     ResetDataIter();
2849   } else {
2850     // Need to use the data block.
2851     if (!same_block) {
2852       InitDataBlock();
2853     } else {
2854       // When the user does a reseek, the iterate_upper_bound might have
2855       // changed. CheckDataBlockWithinUpperBound() needs to be called
2856       // explicitly if the reseek ends up in the same data block.
2857       // If the reseek ends up in a different block, InitDataBlock() will do
2858       // the iterator upper bound check.
2859       CheckDataBlockWithinUpperBound();
2860     }
2861 
2862     if (target) {
2863       block_iter_.Seek(*target);
2864     } else {
2865       block_iter_.SeekToFirst();
2866     }
2867     FindKeyForward();
2868   }
2869 
2870   CheckOutOfBound();
2871 
2872   if (target) {
2873     assert(!Valid() || ((block_type_ == BlockType::kIndex &&
2874                          !table_->get_rep()->index_key_includes_seq)
2875                             ? (user_comparator_.Compare(ExtractUserKey(*target),
2876                                                         key()) <= 0)
2877                             : (icomp_.Compare(*target, key()) <= 0)));
2878   }
2879 }
2880 
2881 template <class TBlockIter, typename TValue>
2882 void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
2883     const Slice& target) {
2884   is_out_of_bound_ = false;
2885   is_at_first_key_from_index_ = false;
2886   // For now, totally disable prefix seek in auto prefix mode because we
2887   // don't have the logic for it yet.
2888   if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
2889     ResetDataIter();
2890     return;
2891   }
2892 
2893   SavePrevIndexValue();
2894 
2895   // Call Seek() rather than SeekForPrev() in the index block, because the
2896   // target data block will likely contain the position for `target`, the
2897   // same as for Seek(), rather than the block before it.
2898   // For example, if we have three data blocks, each containing two keys:
2899   //   [2, 4]  [6, 8] [10, 12]
2900   //  (the keys in the index block would be [4, 8, 12])
2901   // and the user calls SeekForPrev(7), we need to go to the second block,
2902   // just like if they call Seek(7).
2903   // The only case where the block is different is when they seek to a position
2904   // on the boundary. For example, if they SeekForPrev(5), we should go to the
2905   // first block, rather than the second. However, we don't have the information
2906   // to distinguish the two unless we read the second block. In this case, we'll
2907   // end up reading two blocks.
2908   index_iter_->Seek(target);
2909 
2910   if (!index_iter_->Valid()) {
2911     auto seek_status = index_iter_->status();
2912     // Check for IO error
2913     if (!seek_status.IsNotFound() && !seek_status.ok()) {
2914       ResetDataIter();
2915       return;
2916     }
2917 
2918     // With prefix index, Seek() returns NotFound if the prefix doesn't exist
2919     if (seek_status.IsNotFound()) {
2920       // Any key less than the target is fine for prefix seek
2921       ResetDataIter();
2922       return;
2923     } else {
2924       index_iter_->SeekToLast();
2925     }
2926     // Check for IO error
2927     if (!index_iter_->Valid()) {
2928       ResetDataIter();
2929       return;
2930     }
2931   }
2932 
2933   InitDataBlock();
2934 
2935   block_iter_.SeekForPrev(target);
2936 
2937   FindKeyBackward();
2938   CheckDataBlockWithinUpperBound();
2939   assert(!block_iter_.Valid() ||
2940          icomp_.Compare(target, block_iter_.key()) >= 0);
2941 }
2942 
2943 template <class TBlockIter, typename TValue>
2944 void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
2945   is_out_of_bound_ = false;
2946   is_at_first_key_from_index_ = false;
2947   SavePrevIndexValue();
2948   index_iter_->SeekToLast();
2949   if (!index_iter_->Valid()) {
2950     ResetDataIter();
2951     return;
2952   }
2953   InitDataBlock();
2954   block_iter_.SeekToLast();
2955   FindKeyBackward();
2956   CheckDataBlockWithinUpperBound();
2957 }
2958 
2959 template <class TBlockIter, typename TValue>
2960 void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
2961   if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
2962     return;
2963   }
2964   assert(block_iter_points_to_real_block_);
2965   block_iter_.Next();
2966   FindKeyForward();
2967   CheckOutOfBound();
2968 }
2969 
2970 template <class TBlockIter, typename TValue>
2971 bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
2972     IterateResult* result) {
2973   Next();
2974   bool is_valid = Valid();
2975   if (is_valid) {
2976     result->key = key();
2977     result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
2978   }
2979   return is_valid;
2980 }
2981 
2982 template <class TBlockIter, typename TValue>
2983 void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
2984   if (is_at_first_key_from_index_) {
2985     is_at_first_key_from_index_ = false;
2986 
2987     index_iter_->Prev();
2988     if (!index_iter_->Valid()) {
2989       return;
2990     }
2991 
2992     InitDataBlock();
2993     block_iter_.SeekToLast();
2994   } else {
2995     assert(block_iter_points_to_real_block_);
2996     block_iter_.Prev();
2997   }
2998 
2999   FindKeyBackward();
3000 }
3001 
3002 template <class TBlockIter, typename TValue>
3003 void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
3004   BlockHandle data_block_handle = index_iter_->value().handle;
3005   if (!block_iter_points_to_real_block_ ||
3006       data_block_handle.offset() != prev_block_offset_ ||
3007       // if previous attempt of reading the block missed cache, try again
3008       block_iter_.status().IsIncomplete()) {
3009     if (block_iter_points_to_real_block_) {
3010       ResetDataIter();
3011     }
3012     auto* rep = table_->get_rep();
3013 
3014     // Prefetch additional data for range scans (iterators). Enabled only for
3015     // user reads.
3016     // Implicit auto readahead:
3017     //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
3018     // Explicit user requested readahead:
3019     //   Enabled from the very first IO when ReadOptions.readahead_size is set.
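    // On the buffered I/O path below, readahead_size_ is doubled after each
    // implicit prefetch, capped at BlockBasedTable::kMaxAutoReadaheadSize.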
3020     if (lookup_context_.caller != TableReaderCaller::kCompaction) {
3021       if (read_options_.readahead_size == 0) {
3022         // Implicit auto readahead
3023         num_file_reads_++;
3024         if (num_file_reads_ >
3025             BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
3026           if (!rep->file->use_direct_io() &&
3027               (data_block_handle.offset() +
3028                    static_cast<size_t>(block_size(data_block_handle)) >
3029                readahead_limit_)) {
3030             // Buffered I/O
3031             // Discarding the return status of Prefetch calls intentionally, as
3032             // we can fall back to reading from disk if Prefetch fails.
3033             rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
3034             readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
3035                                                    readahead_size_);
3036             // Keep exponentially increasing readahead size until
3037             // kMaxAutoReadaheadSize.
3038             readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
3039                                        readahead_size_ * 2);
3040           } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
3041             // Direct I/O
3042             // Let FilePrefetchBuffer take care of the readahead.
3043             rep->CreateFilePrefetchBuffer(
3044                 BlockBasedTable::kInitAutoReadaheadSize,
3045                 BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
3046           }
3047         }
3048       } else if (!prefetch_buffer_) {
3049         // Explicit user requested readahead
3050         // The actual condition is:
3051         // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
3052         rep->CreateFilePrefetchBuffer(read_options_.readahead_size,
3053                                       read_options_.readahead_size,
3054                                       &prefetch_buffer_);
3055       }
3056     } else if (!prefetch_buffer_) {
3057       rep->CreateFilePrefetchBuffer(compaction_readahead_size_,
3058                                     compaction_readahead_size_,
3059                                     &prefetch_buffer_);
3060     }
3061 
3062     Status s;
3063     table_->NewDataBlockIterator<TBlockIter>(
3064         read_options_, data_block_handle, &block_iter_, block_type_,
3065         /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(),
3066         /*for_compaction=*/lookup_context_.caller ==
3067             TableReaderCaller::kCompaction);
3068     block_iter_points_to_real_block_ = true;
3069     CheckDataBlockWithinUpperBound();
3070   }
3071 }
3072 
3073 template <class TBlockIter, typename TValue>
3074 bool BlockBasedTableIterator<TBlockIter, TValue>::MaterializeCurrentBlock() {
3075   assert(is_at_first_key_from_index_);
3076   assert(!block_iter_points_to_real_block_);
3077   assert(index_iter_->Valid());
3078 
3079   is_at_first_key_from_index_ = false;
3080   InitDataBlock();
3081   assert(block_iter_points_to_real_block_);
3082   block_iter_.SeekToFirst();
3083 
3084   if (!block_iter_.Valid() ||
3085       icomp_.Compare(block_iter_.key(),
3086                      index_iter_->value().first_internal_key) != 0) {
3087     // Uh oh.
3088     block_iter_.Invalidate(Status::Corruption(
3089         "first key in index doesn't match first key in block"));
3090     return false;
3091   }
3092 
3093   return true;
3094 }
3095 
3096 template <class TBlockIter, typename TValue>
3097 void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
3098   // This method's code is kept short to make it likely to be inlined.
3099 
3100   assert(!is_out_of_bound_);
3101   assert(block_iter_points_to_real_block_);
3102 
3103   if (!block_iter_.Valid()) {
3104     // This is the only call site of FindBlockForward(), but it's extracted into
3105     // a separate method to keep FindKeyForward() short and likely to be
3106     // inlined. When transitioning to a different block, we call
3107     // FindBlockForward(), which is much longer and is probably not inlined.
3108     FindBlockForward();
3109   } else {
3110     // This is the fast path that avoids a function call.
3111   }
3112 }
3113 
3114 template <class TBlockIter, typename TValue>
3115 void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
3116   // TODO: the while loop is inherited from two-level-iterator. We don't know
3117   // whether a block can be empty, so it cannot yet be replaced by an "if".
3118   do {
3119     if (!block_iter_.status().ok()) {
3120       return;
3121     }
3122     // Whether next data block is out of upper bound, if there is one.
3123     const bool next_block_is_out_of_bound =
3124         read_options_.iterate_upper_bound != nullptr &&
3125         block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
3126     assert(!next_block_is_out_of_bound ||
3127            user_comparator_.Compare(*read_options_.iterate_upper_bound,
3128                                     index_iter_->user_key()) <= 0);
3129     ResetDataIter();
3130     index_iter_->Next();
3131     if (next_block_is_out_of_bound) {
3132       // The next block is out of bound. No need to read it.
3133       TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
3134       // We need to make sure this is not the last data block before setting
3135       // is_out_of_bound_, since the index key for the last data block can be
3136       // larger than smallest key of the next file on the same level.
3137       if (index_iter_->Valid()) {
3138         is_out_of_bound_ = true;
3139       }
3140       return;
3141     }
3142 
3143     if (!index_iter_->Valid()) {
3144       return;
3145     }
3146 
3147     IndexValue v = index_iter_->value();
3148 
3149     // TODO(kolmike): Remove the != kBlockCacheTier condition.
3150     if (!v.first_internal_key.empty() &&
3151         read_options_.read_tier != kBlockCacheTier) {
3152       // Index contains the first key of the block. Defer reading the block.
3153       is_at_first_key_from_index_ = true;
3154       return;
3155     }
3156 
3157     InitDataBlock();
3158     block_iter_.SeekToFirst();
3159   } while (!block_iter_.Valid());
3160 }
3161 
3162 template <class TBlockIter, typename TValue>
3163 void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
3164   while (!block_iter_.Valid()) {
3165     if (!block_iter_.status().ok()) {
3166       return;
3167     }
3168 
3169     ResetDataIter();
3170     index_iter_->Prev();
3171 
3172     if (index_iter_->Valid()) {
3173       InitDataBlock();
3174       block_iter_.SeekToLast();
3175     } else {
3176       return;
3177     }
3178   }
3179 
3180   // We could have checked the lower bound here too, but we opt not to for
3181   // code simplicity.
3182 }
3183 
3184 template <class TBlockIter, typename TValue>
3185 void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
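  // Sets is_out_of_bound_ when the current user key is at or past the
  // exclusive iterate_upper_bound.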
3186   if (read_options_.iterate_upper_bound != nullptr && Valid()) {
3187     is_out_of_bound_ = user_comparator_.Compare(
3188                            *read_options_.iterate_upper_bound, user_key()) <= 0;
3189   }
3190 }
3191 
3192 template <class TBlockIter, typename TValue>
3193 void BlockBasedTableIterator<TBlockIter,
3194                              TValue>::CheckDataBlockWithinUpperBound() {
3195   if (read_options_.iterate_upper_bound != nullptr &&
3196       block_iter_points_to_real_block_) {
3197     data_block_within_upper_bound_ =
3198         (user_comparator_.Compare(*read_options_.iterate_upper_bound,
3199                                   index_iter_->user_key()) > 0);
3200   }
3201 }
3202 
3203 InternalIterator* BlockBasedTable::NewIterator(
3204     const ReadOptions& read_options, const SliceTransform* prefix_extractor,
3205     Arena* arena, bool skip_filters, TableReaderCaller caller,
3206     size_t compaction_readahead_size) {
3207   BlockCacheLookupContext lookup_context{caller};
3208   bool need_upper_bound_check =
3209       read_options.auto_prefix_mode ||
3210       PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
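  // If an arena is provided, the iterator is placement-new'ed into memory
  // allocated from it, so its lifetime is tied to the arena; otherwise it is
  // heap-allocated and owned by the caller.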
3211   if (arena == nullptr) {
3212     return new BlockBasedTableIterator<DataBlockIter>(
3213         this, read_options, rep_->internal_comparator,
3214         NewIndexIterator(
3215             read_options,
3216             need_upper_bound_check &&
3217                 rep_->index_type == BlockBasedTableOptions::kHashSearch,
3218             /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
3219         !skip_filters && !read_options.total_order_seek &&
3220             prefix_extractor != nullptr,
3221         need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
3222         compaction_readahead_size);
3223   } else {
3224     auto* mem =
3225         arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
3226     return new (mem) BlockBasedTableIterator<DataBlockIter>(
3227         this, read_options, rep_->internal_comparator,
3228         NewIndexIterator(
3229             read_options,
3230             need_upper_bound_check &&
3231                 rep_->index_type == BlockBasedTableOptions::kHashSearch,
3232             /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
3233         !skip_filters && !read_options.total_order_seek &&
3234             prefix_extractor != nullptr,
3235         need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
3236         compaction_readahead_size);
3237   }
3238 }
3239 
3240 FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
3241     const ReadOptions& read_options) {
3242   if (rep_->fragmented_range_dels == nullptr) {
3243     return nullptr;
3244   }
3245   SequenceNumber snapshot = kMaxSequenceNumber;
3246   if (read_options.snapshot != nullptr) {
3247     snapshot = read_options.snapshot->GetSequenceNumber();
3248   }
3249   return new FragmentedRangeTombstoneIterator(
3250       rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
3251 }
3252 
3253 bool BlockBasedTable::FullFilterKeyMayMatch(
3254     const ReadOptions& read_options, FilterBlockReader* filter,
3255     const Slice& internal_key, const bool no_io,
3256     const SliceTransform* prefix_extractor, GetContext* get_context,
3257     BlockCacheLookupContext* lookup_context) const {
3258   if (filter == nullptr || filter->IsBlockBased()) {
3259     return true;
3260   }
3261   Slice user_key = ExtractUserKey(internal_key);
3262   const Slice* const const_ikey_ptr = &internal_key;
3263   bool may_match = true;
3264   if (rep_->whole_key_filtering) {
3265     size_t ts_sz =
3266         rep_->internal_comparator.user_comparator()->timestamp_size();
3267     Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
3268     may_match =
3269         filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
3270                             no_io, const_ikey_ptr, get_context, lookup_context);
3271   } else if (!read_options.total_order_seek && prefix_extractor &&
3272              rep_->table_properties->prefix_extractor_name.compare(
3273                  prefix_extractor->Name()) == 0 &&
3274              prefix_extractor->InDomain(user_key) &&
3275              !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
3276                                      prefix_extractor, kNotValid, no_io,
3277                                      const_ikey_ptr, get_context,
3278                                      lookup_context)) {
3279     may_match = false;
3280   }
3281   if (may_match) {
3282     RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
3283     PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
3284   }
3285   return may_match;
3286 }
3287 
3288 void BlockBasedTable::FullFilterKeysMayMatch(
3289     const ReadOptions& read_options, FilterBlockReader* filter,
3290     MultiGetRange* range, const bool no_io,
3291     const SliceTransform* prefix_extractor,
3292     BlockCacheLookupContext* lookup_context) const {
3293   if (filter == nullptr || filter->IsBlockBased()) {
3294     return;
3295   }
3296   if (rep_->whole_key_filtering) {
3297     filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
3298                          lookup_context);
3299   } else if (!read_options.total_order_seek && prefix_extractor &&
3300              rep_->table_properties->prefix_extractor_name.compare(
3301                  prefix_extractor->Name()) == 0) {
3302     filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
3303                              lookup_context);
3304   }
3305 }
3306 
3307 Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
3308                             GetContext* get_context,
3309                             const SliceTransform* prefix_extractor,
3310                             bool skip_filters) {
3311   assert(key.size() >= 8);  // key must be internal key
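  // An internal key is the user key followed by an 8-byte trailer packing the
  // sequence number and value type, hence the minimum size of 8.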
3312   assert(get_context != nullptr);
3313   Status s;
3314   const bool no_io = read_options.read_tier == kBlockCacheTier;
3315 
3316   FilterBlockReader* const filter =
3317       !skip_filters ? rep_->filter.get() : nullptr;
3318 
3319   // First check the full filter.
3320   // If the full filter is not useful, then go into each block.
3321   uint64_t tracing_get_id = get_context->get_tracing_get_id();
3322   BlockCacheLookupContext lookup_context{
3323       TableReaderCaller::kUserGet, tracing_get_id,
3324       /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
3325   if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3326     // Trace the key since it contains both user key and sequence number.
3327     lookup_context.referenced_key = key.ToString();
3328     lookup_context.get_from_user_specified_snapshot =
3329         read_options.snapshot != nullptr;
3330   }
3331   const bool may_match =
3332       FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor,
3333                             get_context, &lookup_context);
3334   if (!may_match) {
3335     RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
3336     PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
3337   } else {
3338     IndexBlockIter iiter_on_stack;
3339     // If the table's prefix_extractor differs from the one in options,
3340     // disable BlockPrefixIndex. Only do this check for the kHashSearch index.
3341     bool need_upper_bound_check = false;
3342     if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
3343       need_upper_bound_check = PrefixExtractorChanged(
3344           rep_->table_properties.get(), prefix_extractor);
3345     }
3346     auto iiter =
3347         NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
3348                          get_context, &lookup_context);
3349     std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3350     if (iiter != &iiter_on_stack) {
3351       iiter_unique_ptr.reset(iiter);
3352     }
3353 
3354     size_t ts_sz =
3355         rep_->internal_comparator.user_comparator()->timestamp_size();
3356     bool matched = false;  // if such user key matched a key in SST
3357     bool done = false;
3358     for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
3359       IndexValue v = iiter->value();
3360 
3361       bool not_exist_in_filter =
3362           filter != nullptr && filter->IsBlockBased() == true &&
3363           !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
3364                                prefix_extractor, v.handle.offset(), no_io,
3365                                /*const_ikey_ptr=*/nullptr, get_context,
3366                                &lookup_context);
3367 
3368       if (not_exist_in_filter) {
3369         // Not found
3370         // TODO: think about interaction with Merge. If a user key cannot
3371         // cross one data block, we should be fine.
3372         RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
3373         PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
3374         break;
3375       }
3376 
3377       if (!v.first_internal_key.empty() && !skip_filters &&
3378           UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3379                   .Compare(ExtractUserKey(key),
3380                            ExtractUserKey(v.first_internal_key)) < 0) {
3381         // The requested key falls between highest key in previous block and
3382         // lowest key in current block.
3383         break;
3384       }
3385 
3386       BlockCacheLookupContext lookup_data_block_context{
3387           TableReaderCaller::kUserGet, tracing_get_id,
3388           /*get_from_user_specified_snapshot=*/read_options.snapshot !=
3389               nullptr};
3390       bool does_referenced_key_exist = false;
3391       DataBlockIter biter;
3392       uint64_t referenced_data_size = 0;
3393       NewDataBlockIterator<DataBlockIter>(
3394           read_options, v.handle, &biter, BlockType::kData, get_context,
3395           &lookup_data_block_context,
3396           /*s=*/Status(), /*prefetch_buffer*/ nullptr);
3397 
3398       if (no_io && biter.status().IsIncomplete()) {
3399         // couldn't get block from block_cache
3400         // Mark the key as possibly existing, because when "no_io" is set we
3401         // can only guarantee absence if the block could actually be checked.
3402         get_context->MarkKeyMayExist();
3403         break;
3404       }
3405       if (!biter.status().ok()) {
3406         s = biter.status();
3407         break;
3408       }
3409 
3410       bool may_exist = biter.SeekForGet(key);
3411       // If user-specified timestamp is supported, we cannot end the search
3412       // just because hash index lookup indicates the key+ts does not exist.
3413       if (!may_exist && ts_sz == 0) {
3414         // HashSeek cannot find the key in this block and the iter is not at
3415         // the end of the block, i.e. it cannot be in the following blocks
3416         // either. In this case, the seek_key cannot be found, so we break
3417         // from the top level for-loop.
3418         done = true;
3419       } else {
3420         // Call the *saver function on each entry/block until it returns false
3421         for (; biter.Valid(); biter.Next()) {
3422           ParsedInternalKey parsed_key;
3423           if (!ParseInternalKey(biter.key(), &parsed_key)) {
3424             s = Status::Corruption(Slice());
3425           }
3426 
3427           if (!get_context->SaveValue(
3428                   parsed_key, biter.value(), &matched,
3429                   biter.IsValuePinned() ? &biter : nullptr)) {
3430             if (get_context->State() == GetContext::GetState::kFound) {
3431               does_referenced_key_exist = true;
3432               referenced_data_size = biter.key().size() + biter.value().size();
3433             }
3434             done = true;
3435             break;
3436           }
3437         }
3438         s = biter.status();
3439       }
3440       // Write the block cache access record.
3441       if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3442         // Avoid making copy of block_key, cf_name, and referenced_key when
3443         // constructing the access record.
3444         Slice referenced_key;
3445         if (does_referenced_key_exist) {
3446           referenced_key = biter.key();
3447         } else {
3448           referenced_key = key;
3449         }
3450         BlockCacheTraceRecord access_record(
3451             rep_->ioptions.env->NowMicros(),
3452             /*block_key=*/"", lookup_data_block_context.block_type,
3453             lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
3454             /*cf_name=*/"", rep_->level_for_tracing(),
3455             rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
3456             lookup_data_block_context.is_cache_hit,
3457             lookup_data_block_context.no_insert,
3458             lookup_data_block_context.get_id,
3459             lookup_data_block_context.get_from_user_specified_snapshot,
3460             /*referenced_key=*/"", referenced_data_size,
3461             lookup_data_block_context.num_keys_in_block,
3462             does_referenced_key_exist);
3463         block_cache_tracer_->WriteBlockAccess(
3464             access_record, lookup_data_block_context.block_key,
3465             rep_->cf_name_for_tracing(), referenced_key);
3466       }
3467 
3468       if (done) {
3469         // Avoid the extra Next which is expensive in two-level indexes
3470         break;
3471       }
3472     }
3473     if (matched && filter != nullptr && !filter->IsBlockBased()) {
3474       RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
3475       PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
3476                                 rep_->level);
3477     }
3478     if (s.ok() && !iiter->status().IsNotFound()) {
3479       s = iiter->status();
3480     }
3481   }
3482 
3483   return s;
3484 }
3485 
3486 using MultiGetRange = MultiGetContext::Range;
3487 void BlockBasedTable::MultiGet(const ReadOptions& read_options,
3488                                const MultiGetRange* mget_range,
3489                                const SliceTransform* prefix_extractor,
3490                                bool skip_filters) {
3491   FilterBlockReader* const filter =
3492       !skip_filters ? rep_->filter.get() : nullptr;
3493   MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
3494                                mget_range->end());
3495 
3496   // First check the full filter.
3497   // If the full filter is not useful, then go into each block.
3498   const bool no_io = read_options.read_tier == kBlockCacheTier;
3499   uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
3500   if (!sst_file_range.empty() && sst_file_range.begin()->get_context) {
3501     tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
3502   }
3503   BlockCacheLookupContext lookup_context{
3504       TableReaderCaller::kUserMultiGet, tracing_mget_id,
3505       /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
3506   FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
3507                          prefix_extractor, &lookup_context);
3508 
3509   if (skip_filters || !sst_file_range.empty()) {
3510     IndexBlockIter iiter_on_stack;
3511     // If the table's prefix_extractor differs from the one in options,
3512     // disable BlockPrefixIndex. Only do this check for the kHashSearch index.
3513     bool need_upper_bound_check = false;
3514     if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
3515       need_upper_bound_check = PrefixExtractorChanged(
3516           rep_->table_properties.get(), prefix_extractor);
3517     }
3518     auto iiter =
3519         NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
3520                          sst_file_range.begin()->get_context, &lookup_context);
3521     std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3522     if (iiter != &iiter_on_stack) {
3523       iiter_unique_ptr.reset(iiter);
3524     }
3525 
3526     uint64_t offset = std::numeric_limits<uint64_t>::max();
3527     autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
3528     autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results;
3529     autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
3530     char stack_buf[kMultiGetReadStackBufSize];
3531     std::unique_ptr<char[]> block_buf;
3532     {
3533       MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
3534                                      sst_file_range.end());
3535 
3536       CachableEntry<UncompressionDict> uncompression_dict;
3537       Status uncompression_dict_status;
3538       if (rep_->uncompression_dict_reader) {
3539         uncompression_dict_status =
3540             rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
3541                 nullptr /* prefetch_buffer */, no_io,
3542                 sst_file_range.begin()->get_context, &lookup_context,
3543                 &uncompression_dict);
3544       }
3545 
3546       const UncompressionDict& dict = uncompression_dict.GetValue()
3547                                           ? *uncompression_dict.GetValue()
3548                                           : UncompressionDict::GetEmptyDict();
3549 
3550       size_t total_len = 0;
3551       ReadOptions ro = read_options;
3552       ro.read_tier = kBlockCacheTier;
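      // The first pass below only probes the block cache (kBlockCacheTier
      // forbids I/O); handles of blocks that miss are collected and then read
      // in a single batch by RetrieveMultipleBlocks.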
3553 
3554       for (auto miter = data_block_range.begin();
3555            miter != data_block_range.end(); ++miter) {
3556         const Slice& key = miter->ikey;
3557         iiter->Seek(miter->ikey);
3558 
3559         IndexValue v;
3560         if (iiter->Valid()) {
3561           v = iiter->value();
3562         }
3563         if (!iiter->Valid() ||
3564             (!v.first_internal_key.empty() && !skip_filters &&
3565              UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3566                      .Compare(ExtractUserKey(key),
3567                               ExtractUserKey(v.first_internal_key)) < 0)) {
3568           // The requested key falls between highest key in previous block and
3569           // lowest key in current block.
3570           *(miter->s) = iiter->status();
3571           data_block_range.SkipKey(miter);
3572           sst_file_range.SkipKey(miter);
3573           continue;
3574         }
3575 
3576         if (!uncompression_dict_status.ok()) {
3577           *(miter->s) = uncompression_dict_status;
3578           data_block_range.SkipKey(miter);
3579           sst_file_range.SkipKey(miter);
3580           continue;
3581         }
3582 
3583         statuses.emplace_back();
3584         results.emplace_back();
3585         if (v.handle.offset() == offset) {
3586           // We're going to reuse the block for this key later on. No need to
3587           // look it up now. Place a null handle
3588           block_handles.emplace_back(BlockHandle::NullBlockHandle());
3589           continue;
3590         }
3591         // Lookup the cache for the given data block referenced by an index
3592         // iterator value (i.e BlockHandle). If it exists in the cache,
3593         // initialize block to the contents of the data block.
3594         offset = v.handle.offset();
3595         BlockHandle handle = v.handle;
3596         BlockCacheLookupContext lookup_data_block_context(
3597             TableReaderCaller::kUserMultiGet);
3598         Status s = RetrieveBlock(
3599             nullptr, ro, handle, dict, &(results.back()), BlockType::kData,
3600             miter->get_context, &lookup_data_block_context,
3601             /* for_compaction */ false, /* use_cache */ true);
3602         if (s.IsIncomplete()) {
3603           s = Status::OK();
3604         }
3605         if (s.ok() && !results.back().IsEmpty()) {
3606           // Found it in the cache. Add NULL handle to indicate there is
3607           // nothing to read from disk
3608           block_handles.emplace_back(BlockHandle::NullBlockHandle());
3609         } else {
3610           block_handles.emplace_back(handle);
3611           total_len += block_size(handle);
3612         }
3613       }
3614 
3615       if (total_len) {
3616         char* scratch = nullptr;
3617         // If the blocks need to be uncompressed and we don't need the
3618         // compressed blocks, then we can use a contiguous block of
3619         // memory to read in all the blocks as it will be temporary
3620         // storage
3621         // 1. If blocks are compressed and compressed block cache is there,
3622         //    alloc heap bufs
3623         // 2. If blocks are uncompressed, alloc heap bufs
3624         // 3. If blocks are compressed and no compressed block cache, use
3625         //    stack buf
3626         if (rep_->table_options.block_cache_compressed == nullptr &&
3627             rep_->blocks_maybe_compressed) {
3628           if (total_len <= kMultiGetReadStackBufSize) {
3629             scratch = stack_buf;
3630           } else {
3631             scratch = new char[total_len];
3632             block_buf.reset(scratch);
3633           }
3634         }
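        // A non-null scratch tells RetrieveMultipleBlocks that adjacent block
        // reads may be coalesced into this shared buffer; with a null scratch
        // it allocates a separate heap buffer per read request.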
3635         RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
3636                                &statuses, &results, scratch, dict);
3637       }
3638     }
3639 
3640     DataBlockIter first_biter;
3641     DataBlockIter next_biter;
3642     size_t idx_in_batch = 0;
3643     for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
3644          ++miter) {
3645       Status s;
3646       GetContext* get_context = miter->get_context;
3647       const Slice& key = miter->ikey;
3648       bool matched = false;  // if such user key matched a key in SST
3649       bool done = false;
3650       bool first_block = true;
3651       do {
3652         DataBlockIter* biter = nullptr;
3653         bool reusing_block = true;
3654         uint64_t referenced_data_size = 0;
3655         bool does_referenced_key_exist = false;
3656         BlockCacheLookupContext lookup_data_block_context(
3657             TableReaderCaller::kUserMultiGet, tracing_mget_id,
3658             /*get_from_user_specified_snapshot=*/read_options.snapshot !=
3659                 nullptr);
3660         if (first_block) {
3661           if (!block_handles[idx_in_batch].IsNull() ||
3662               !results[idx_in_batch].IsEmpty()) {
3663             first_biter.Invalidate(Status::OK());
3664             NewDataBlockIterator<DataBlockIter>(
3665                 read_options, results[idx_in_batch], &first_biter,
3666                 statuses[idx_in_batch]);
3667             reusing_block = false;
3668           }
3669           biter = &first_biter;
3670           idx_in_batch++;
3671         } else {
3672           IndexValue v = iiter->value();
3673           if (!v.first_internal_key.empty() && !skip_filters &&
3674               UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3675                       .Compare(ExtractUserKey(key),
3676                                ExtractUserKey(v.first_internal_key)) < 0) {
3677             // The requested key falls between highest key in previous block and
3678             // lowest key in current block.
3679             break;
3680           }
3681 
3682           next_biter.Invalidate(Status::OK());
3683           NewDataBlockIterator<DataBlockIter>(
3684               read_options, iiter->value().handle, &next_biter,
3685               BlockType::kData, get_context, &lookup_data_block_context,
3686               Status(), nullptr);
3687           biter = &next_biter;
3688           reusing_block = false;
3689         }
3690 
3691         if (read_options.read_tier == kBlockCacheTier &&
3692             biter->status().IsIncomplete()) {
3693           // Couldn't get the block from the block cache without I/O.
3694           // Mark the key as possibly existing, because with "no_io" set we
3695           // can only guarantee that a key is absent, not that it is present.
3696           get_context->MarkKeyMayExist();
3697           break;
3698         }
3699         if (!biter->status().ok()) {
3700           s = biter->status();
3701           break;
3702         }
3703 
3704         bool may_exist = biter->SeekForGet(key);
3705         if (!may_exist) {
3706           // HashSeek cannot find the key in this block, and the iter is not
3707           // at the end of the block, i.e. the key cannot be in the following
3708           // blocks either. In this case, the seek_key cannot be found, so we
3709           // break from the top level for-loop.
3710           break;
3711         }
3712 
3713         // Call get_context->SaveValue() on each entry until it returns false
3714         for (; biter->Valid(); biter->Next()) {
3715           ParsedInternalKey parsed_key;
3716           Cleanable dummy;
3717           Cleanable* value_pinner = nullptr;
3718           if (!ParseInternalKey(biter->key(), &parsed_key)) {
3719             s = Status::Corruption(Slice());
3720           }
3721           if (biter->IsValuePinned()) {
3722             if (reusing_block) {
3723               Cache* block_cache = rep_->table_options.block_cache.get();
3724               assert(biter->cache_handle() != nullptr);
3725               block_cache->Ref(biter->cache_handle());
3726               dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
3727                                     biter->cache_handle());
3728               value_pinner = &dummy;
3729             } else {
3730               value_pinner = biter;
3731             }
3732           }
3733           if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
3734                                       value_pinner)) {
3735             if (get_context->State() == GetContext::GetState::kFound) {
3736               does_referenced_key_exist = true;
3737               referenced_data_size =
3738                   biter->key().size() + biter->value().size();
3739             }
3740             done = true;
3741             break;
3742           }
3743           s = biter->status();
3744         }
3745         // Write the block cache access.
3746         if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3747           // Avoid making copy of block_key, cf_name, and referenced_key when
3748           // constructing the access record.
3749           Slice referenced_key;
3750           if (does_referenced_key_exist) {
3751             referenced_key = biter->key();
3752           } else {
3753             referenced_key = key;
3754           }
3755           BlockCacheTraceRecord access_record(
3756               rep_->ioptions.env->NowMicros(),
3757               /*block_key=*/"", lookup_data_block_context.block_type,
3758               lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
3759               /*cf_name=*/"", rep_->level_for_tracing(),
3760               rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
3761               lookup_data_block_context.is_cache_hit,
3762               lookup_data_block_context.no_insert,
3763               lookup_data_block_context.get_id,
3764               lookup_data_block_context.get_from_user_specified_snapshot,
3765               /*referenced_key=*/"", referenced_data_size,
3766               lookup_data_block_context.num_keys_in_block,
3767               does_referenced_key_exist);
3768           block_cache_tracer_->WriteBlockAccess(
3769               access_record, lookup_data_block_context.block_key,
3770               rep_->cf_name_for_tracing(), referenced_key);
3771         }
3772         s = biter->status();
3773         if (done) {
3774           // Avoid the extra Next which is expensive in two-level indexes
3775           break;
3776         }
3777         if (first_block) {
3778           iiter->Seek(key);
3779         }
3780         first_block = false;
3781         iiter->Next();
3782       } while (iiter->Valid());
3783 
3784       if (matched && filter != nullptr && !filter->IsBlockBased()) {
3785         RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
3786         PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
3787                                   rep_->level);
3788       }
3789       if (s.ok()) {
3790         s = iiter->status();
3791       }
3792       *(miter->s) = s;
3793     }
3794   }
3795 }
3796 
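// Prefetch() warms the block cache with the data blocks that overlap the key
// range [begin, end]; passing nullptr for either bound extends the range to
// the corresponding end of the file. A minimal caller-side sketch
// (hypothetical code, not part of this file; `table` is assumed to be an
// already-opened BlockBasedTable*, with bounds in the same key format the
// index iterator expects):
//
//   Slice begin("key000");
//   Slice end("key999");
//   Status s = table->Prefetch(&begin, &end);
//   if (!s.ok()) {
//     // e.g. Status::InvalidArgument when begin compares after end,
//     // or an I/O error surfaced while reading a data block.
//   }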
3797 Status BlockBasedTable::Prefetch(const Slice* const begin,
3798                                  const Slice* const end) {
3799   auto& comparator = rep_->internal_comparator;
3800   UserComparatorWrapper user_comparator(comparator.user_comparator());
3801   // pre-condition
3802   if (begin && end && comparator.Compare(*begin, *end) > 0) {
3803     return Status::InvalidArgument(*begin, *end);
3804   }
3805   BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
3806   IndexBlockIter iiter_on_stack;
3807   auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
3808                                 &iiter_on_stack, /*get_context=*/nullptr,
3809                                 &lookup_context);
3810   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3811   if (iiter != &iiter_on_stack) {
3812     iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
3813   }
3814 
3815   if (!iiter->status().ok()) {
3816     // error opening index iterator
3817     return iiter->status();
3818   }
3819 
3820   // indicates if we are on the last page that needs to be prefetched
3821   bool prefetching_boundary_page = false;
3822 
3823   for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
3824        iiter->Next()) {
3825     BlockHandle block_handle = iiter->value().handle;
3826     const bool is_user_key = !rep_->index_key_includes_seq;
3827     if (end &&
3828         ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
3829          (is_user_key &&
3830           user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
3831       if (prefetching_boundary_page) {
3832         break;
3833       }
3834 
3835       // The index entry represents the last key in the data block.
3836       // We should load this page into memory as well, but no more.
3837       prefetching_boundary_page = true;
3838     }
3839 
3840     // Load the block specified by the block_handle into the block cache
3841     DataBlockIter biter;
3842 
3843     NewDataBlockIterator<DataBlockIter>(
3844         ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
3845         /*get_context=*/nullptr, &lookup_context, Status(),
3846         /*prefetch_buffer=*/nullptr);
3847 
3848     if (!biter.status().ok()) {
3849       // there was an unexpected error while pre-fetching
3850       return biter.status();
3851     }
3852   }
3853 
3854   return Status::OK();
3855 }
3856 
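// VerifyChecksum() re-reads and checksum-verifies every metadata and data
// block in the file, bypassing the block cache. A minimal sketch of driving
// it directly (hypothetical code, not part of this file; `table` is assumed
// to be an already-opened BlockBasedTable*):
//
//   Status s = table->VerifyChecksum(ReadOptions(),
//                                    TableReaderCaller::kUserVerifyChecksum);
//   if (!s.ok()) {
//     // Corruption (bad checksum) or an I/O error was encountered.
//   }
//
// In practice this is usually reached through the DB-level checksum
// verification API rather than being called on a table reader directly.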
3857 Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
3858                                        TableReaderCaller caller) {
3859   Status s;
3860   // Check Meta blocks
3861   std::unique_ptr<Block> metaindex;
3862   std::unique_ptr<InternalIterator> metaindex_iter;
3863   s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex,
3864                          &metaindex_iter);
3865   if (s.ok()) {
3866     s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
3867     if (!s.ok()) {
3868       return s;
3869     }
3870   } else {
3871     return s;
3872   }
3873   // Check Data blocks
3874   IndexBlockIter iiter_on_stack;
3875   BlockCacheLookupContext context{caller};
3876   InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
3877       read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
3878       /*get_context=*/nullptr, &context);
3879   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3880   if (iiter != &iiter_on_stack) {
3881     iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
3882   }
3883   if (!iiter->status().ok()) {
3884     // error opening index iterator
3885     return iiter->status();
3886   }
3887   s = VerifyChecksumInBlocks(read_options, iiter);
3888   return s;
3889 }
3890 
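// VerifyChecksumInBlocks() walks the index and re-reads every data block
// through a FilePrefetchBuffer whose size comes from
// ReadOptions::readahead_size, falling back to kMaxAutoReadaheadSize (256 KB)
// when unset. A hedged sketch of tuning this from the caller side
// (hypothetical code, not part of this file):
//
//   ReadOptions ro;
//   ro.readahead_size = 4 * 1024 * 1024;  // fewer, larger sequential reads
//   // ro is then forwarded to VerifyChecksum(), which passes it down here.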
3891 Status BlockBasedTable::VerifyChecksumInBlocks(
3892     const ReadOptions& read_options,
3893     InternalIteratorBase<IndexValue>* index_iter) {
3894   Status s;
3895   // We are scanning the whole file, so there is no need to exponentially
3896   // increase the readahead buffer size.
3897   size_t readahead_size = (read_options.readahead_size != 0)
3898                               ? read_options.readahead_size
3899                               : kMaxAutoReadaheadSize;
3900   // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
3901   // needed there.
3902   FilePrefetchBuffer prefetch_buffer(
3903       rep_->file.get(), readahead_size /* readahead_size */,
3904       readahead_size /* max_readahead_size */,
3905       !rep_->ioptions.allow_mmap_reads /* enable */);
3906 
3907   for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
3908     s = index_iter->status();
3909     if (!s.ok()) {
3910       break;
3911     }
3912     BlockHandle handle = index_iter->value().handle;
3913     BlockContents contents;
3914     BlockFetcher block_fetcher(
3915         rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
3916         &contents, rep_->ioptions, false /* decompress */,
3917         false /*maybe_compressed*/, BlockType::kData,
3918         UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3919     s = block_fetcher.ReadBlockContents();
3920     if (!s.ok()) {
3921       break;
3922     }
3923   }
3924   return s;
3925 }
3926 
3927 BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
3928     const Slice& meta_block_name) {
3929   if (meta_block_name.starts_with(kFilterBlockPrefix) ||
3930       meta_block_name.starts_with(kFullFilterBlockPrefix) ||
3931       meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
3932     return BlockType::kFilter;
3933   }
3934 
3935   if (meta_block_name == kPropertiesBlock) {
3936     return BlockType::kProperties;
3937   }
3938 
3939   if (meta_block_name == kCompressionDictBlock) {
3940     return BlockType::kCompressionDictionary;
3941   }
3942 
3943   if (meta_block_name == kRangeDelBlock) {
3944     return BlockType::kRangeDeletion;
3945   }
3946 
3947   if (meta_block_name == kHashIndexPrefixesBlock) {
3948     return BlockType::kHashIndexPrefixes;
3949   }
3950 
3951   if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
3952     return BlockType::kHashIndexMetadata;
3953   }
3954 
3955   assert(false);
3956   return BlockType::kInvalid;
3957 }
3958 
3959 Status BlockBasedTable::VerifyChecksumInMetaBlocks(
3960     InternalIteratorBase<Slice>* index_iter) {
3961   Status s;
3962   for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
3963     s = index_iter->status();
3964     if (!s.ok()) {
3965       break;
3966     }
3967     BlockHandle handle;
3968     Slice input = index_iter->value();
3969     s = handle.DecodeFrom(&input);
3970     BlockContents contents;
3971     const Slice meta_block_name = index_iter->key();
3972     BlockFetcher block_fetcher(
3973         rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
3974         ReadOptions(), handle, &contents, rep_->ioptions,
3975         false /* decompress */, false /*maybe_compressed*/,
3976         GetBlockTypeForMetaBlockByName(meta_block_name),
3977         UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3978     s = block_fetcher.ReadBlockContents();
3979     if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
3980       TableProperties* table_properties;
3981       s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
3982                                            index_iter->value(),
3983                                            &table_properties);
3984       delete table_properties;
3985     }
3986     if (!s.ok()) {
3987       break;
3988     }
3989   }
3990   return s;
3991 }
3992 
3993 bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
3994   assert(rep_ != nullptr);
3995 
3996   Cache* const cache = rep_->table_options.block_cache.get();
3997   if (cache == nullptr) {
3998     return false;
3999   }
4000 
4001   char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
4002   Slice cache_key =
4003       GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
4004                   cache_key_storage);
4005 
4006   Cache::Handle* const cache_handle = cache->Lookup(cache_key);
4007   if (cache_handle == nullptr) {
4008     return false;
4009   }
4010 
4011   cache->Release(cache_handle);
4012 
4013   return true;
4014 }
4015 
4016 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
4017                                       const Slice& key) {
4018   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
4019       options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
4020       /*get_context=*/nullptr, /*lookup_context=*/nullptr));
4021   iiter->Seek(key);
4022   assert(iiter->Valid());
4023 
4024   return TEST_BlockInCache(iiter->value().handle);
4025 }
4026 
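// CreateIndexReader() picks the concrete IndexReader implementation based on
// the index_type recorded when the file was written. A hedged configuration
// sketch from the write side (hypothetical code, not part of this file):
//
//   BlockBasedTableOptions table_options;
//   table_options.index_type = BlockBasedTableOptions::kHashSearch;
//   Options options;
//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
//   // kHashSearch additionally needs a prefix extractor; without one the
//   // reader falls back to the binary search index (see below).
//   options.prefix_extractor.reset(NewFixedPrefixTransform(3));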
4027 // REQUIRES: The following fields of rep_ should have already been populated:
4028 //  1. file
4029 //  2. index_handle
4030 //  3. options
4031 //  4. internal_comparator
4032 //  5. index_type
4033 Status BlockBasedTable::CreateIndexReader(
4034     FilePrefetchBuffer* prefetch_buffer,
4035     InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
4036     bool pin, BlockCacheLookupContext* lookup_context,
4037     std::unique_ptr<IndexReader>* index_reader) {
4038   // kHashSearch requires a non-empty prefix_extractor, but we bypass checking
4039   // prefix_extractor here since we have no access to MutableCFOptions.
4040   // The need_upper_bound_check flag in BlockBasedTable::NewIndexIterator covers
4041   // this: if prefix_extractor does not match prefix_extractor_name from the
4042   // table properties, the hash index is turned off via total_order_seek = true.
4043 
4044   switch (rep_->index_type) {
4045     case BlockBasedTableOptions::kTwoLevelIndexSearch: {
4046       return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
4047                                           prefetch, pin, lookup_context,
4048                                           index_reader);
4049     }
4050     case BlockBasedTableOptions::kBinarySearch:
4051       FALLTHROUGH_INTENDED;
4052     case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
4053       return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
4054                                              prefetch, pin, lookup_context,
4055                                              index_reader);
4056     }
4057     case BlockBasedTableOptions::kHashSearch: {
4058       std::unique_ptr<Block> metaindex_guard;
4059       std::unique_ptr<InternalIterator> metaindex_iter_guard;
4060       auto meta_index_iter = preloaded_meta_index_iter;
4061       bool should_fallback = false;
4062       if (rep_->internal_prefix_transform.get() == nullptr) {
4063         ROCKS_LOG_WARN(rep_->ioptions.info_log,
4064                        "No prefix extractor passed in. Fall back to binary"
4065                        " search index.");
4066         should_fallback = true;
4067       } else if (meta_index_iter == nullptr) {
4068         auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
4069                                     &metaindex_iter_guard);
4070         if (!s.ok()) {
4071           // we simply fall back to binary search in case there is any
4072           // problem with prefix hash index loading.
4073           ROCKS_LOG_WARN(rep_->ioptions.info_log,
4074                          "Unable to read the metaindex block."
4075                          " Fall back to binary search index.");
4076           should_fallback = true;
4077         }
4078         meta_index_iter = metaindex_iter_guard.get();
4079       }
4080 
4081       if (should_fallback) {
4082         return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
4083                                                prefetch, pin, lookup_context,
4084                                                index_reader);
4085       } else {
4086         return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
4087                                        use_cache, prefetch, pin, lookup_context,
4088                                        index_reader);
4089       }
4090     }
4091     default: {
4092       std::string error_message =
4093           "Unrecognized index type: " + ToString(rep_->index_type);
4094       return Status::InvalidArgument(error_message.c_str());
4095     }
4096   }
4097 }
4098 
4099 uint64_t BlockBasedTable::ApproximateOffsetOf(
4100     const InternalIteratorBase<IndexValue>& index_iter) const {
4101   uint64_t result = 0;
4102   if (index_iter.Valid()) {
4103     BlockHandle handle = index_iter.value().handle;
4104     result = handle.offset();
4105   } else {
4106     // The iterator is past the last key in the file. Use the data size from
4107     // table_properties if available; otherwise approximate the offset with the
4108     // metaindex block's offset (which is right near the end of the file).
4109     if (rep_->table_properties) {
4110       result = rep_->table_properties->data_size;
4111     }
4112     // table_properties is not present in the table, or its data_size is 0.
4113     if (result == 0) {
4114       result = rep_->footer.metaindex_handle().offset();
4115     }
4116   }
4117 
4118   return result;
4119 }
4120 
4121 uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
4122                                               TableReaderCaller caller) {
4123   BlockCacheLookupContext context(caller);
4124   IndexBlockIter iiter_on_stack;
4125   ReadOptions ro;
4126   ro.total_order_seek = true;
4127   auto index_iter =
4128       NewIndexIterator(ro, /*disable_prefix_seek=*/true,
4129                        /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
4130                        /*lookup_context=*/&context);
4131   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
4132   if (index_iter != &iiter_on_stack) {
4133     iiter_unique_ptr.reset(index_iter);
4134   }
4135 
4136   index_iter->Seek(key);
4137   return ApproximateOffsetOf(*index_iter);
4138 }
4139 
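// ApproximateSize() estimates how many bytes of this SST file lie between two
// keys by diffing the approximate offsets of their index entries; accuracy is
// limited to data block granularity. Applications usually reach this through
// the DB-level API. A hedged sketch (hypothetical code, not part of this
// file; `db` is an open DB*):
//
//   // Estimate the on-disk size of the key range ["a", "m").
//   Range r("a", "m");
//   uint64_t size = 0;
//   db->GetApproximateSizes(&r, 1, &size);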
4140 uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
4141                                           TableReaderCaller caller) {
4142   assert(rep_->internal_comparator.Compare(start, end) <= 0);
4143 
4144   BlockCacheLookupContext context(caller);
4145   IndexBlockIter iiter_on_stack;
4146   ReadOptions ro;
4147   ro.total_order_seek = true;
4148   auto index_iter =
4149       NewIndexIterator(ro, /*disable_prefix_seek=*/true,
4150                        /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
4151                        /*lookup_context=*/&context);
4152   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
4153   if (index_iter != &iiter_on_stack) {
4154     iiter_unique_ptr.reset(index_iter);
4155   }
4156 
4157   index_iter->Seek(start);
4158   uint64_t start_offset = ApproximateOffsetOf(*index_iter);
4159   index_iter->Seek(end);
4160   uint64_t end_offset = ApproximateOffsetOf(*index_iter);
4161 
4162   assert(end_offset >= start_offset);
4163   return end_offset - start_offset;
4164 }
4165 
4166 bool BlockBasedTable::TEST_FilterBlockInCache() const {
4167   assert(rep_ != nullptr);
4168   return TEST_BlockInCache(rep_->filter_handle);
4169 }
4170 
4171 bool BlockBasedTable::TEST_IndexBlockInCache() const {
4172   assert(rep_ != nullptr);
4173 
4174   return TEST_BlockInCache(rep_->footer.index_handle());
4175 }
4176 
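// GetKVPairsFromDataBlocks() materializes the whole file as per-block vectors
// of (key, value) string copies, so it is intended for tooling and tests
// rather than the read path. A hedged usage sketch (hypothetical code, not
// part of this file; `table` is assumed to be an already-opened
// BlockBasedTable*):
//
//   std::vector<KVPairBlock> blocks;
//   Status s = table->GetKVPairsFromDataBlocks(&blocks);
//   if (s.ok()) {
//     for (const KVPairBlock& block : blocks) {
//       for (const auto& kv : block) {
//         // kv.first is the internal key, kv.second the value, both copied.
//       }
//     }
//   }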
4177 Status BlockBasedTable::GetKVPairsFromDataBlocks(
4178     std::vector<KVPairBlock>* kv_pair_blocks) {
4179   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
4180       NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
4181                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
4182                        /*lookup_context=*/nullptr));
4183 
4184   Status s = blockhandles_iter->status();
4185   if (!s.ok()) {
4186     // Cannot read Index Block
4187     return s;
4188   }
4189 
4190   for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
4191        blockhandles_iter->Next()) {
4192     s = blockhandles_iter->status();
4193 
4194     if (!s.ok()) {
4195       break;
4196     }
4197 
4198     std::unique_ptr<InternalIterator> datablock_iter;
4199     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
4200         ReadOptions(), blockhandles_iter->value().handle,
4201         /*input_iter=*/nullptr, /*type=*/BlockType::kData,
4202         /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
4203         /*prefetch_buffer=*/nullptr));
4204     s = datablock_iter->status();
4205 
4206     if (!s.ok()) {
4207       // Error reading the block - Skipped
4208       continue;
4209     }
4210 
4211     KVPairBlock kv_pair_block;
4212     for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
4213          datablock_iter->Next()) {
4214       s = datablock_iter->status();
4215       if (!s.ok()) {
4216         // Error reading the block - Skipped
4217         break;
4218       }
4219       const Slice& key = datablock_iter->key();
4220       const Slice& value = datablock_iter->value();
4221       std::string key_copy = std::string(key.data(), key.size());
4222       std::string value_copy = std::string(value.data(), value.size());
4223 
4224       kv_pair_block.push_back(
4225           std::make_pair(std::move(key_copy), std::move(value_copy)));
4226     }
4227     kv_pair_blocks->push_back(std::move(kv_pair_block));
4228   }
4229   return Status::OK();
4230 }
4231 
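// DumpTable() writes a human-readable dump of the footer, metaindex, table
// properties, filter, index, compression dictionary, range deletions and all
// data blocks to out_file. A hedged sketch of invoking it directly
// (hypothetical code, not part of this file; in practice the sst_dump tool is
// the usual entry point):
//
//   std::unique_ptr<WritableFile> out;
//   Status s = Env::Default()->NewWritableFile("/tmp/sst_dump.txt", &out,
//                                              EnvOptions());
//   if (s.ok()) {
//     s = table->DumpTable(out.get());  // `table` is a BlockBasedTable*
//   }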
4232 Status BlockBasedTable::DumpTable(WritableFile* out_file) {
4233   // Output Footer
4234   out_file->Append(
4235       "Footer Details:\n"
4236       "--------------------------------------\n"
4237       "  ");
4238   out_file->Append(rep_->footer.ToString().c_str());
4239   out_file->Append("\n");
4240 
4241   // Output MetaIndex
4242   out_file->Append(
4243       "Metaindex Details:\n"
4244       "--------------------------------------\n");
4245   std::unique_ptr<Block> metaindex;
4246   std::unique_ptr<InternalIterator> metaindex_iter;
4247   Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex,
4248                                 &metaindex_iter);
4249   if (s.ok()) {
4250     for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
4251          metaindex_iter->Next()) {
4252       s = metaindex_iter->status();
4253       if (!s.ok()) {
4254         return s;
4255       }
4256       if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) {
4257         out_file->Append("  Properties block handle: ");
4258         out_file->Append(metaindex_iter->value().ToString(true).c_str());
4259         out_file->Append("\n");
4260       } else if (metaindex_iter->key() ==
4261                  ROCKSDB_NAMESPACE::kCompressionDictBlock) {
4262         out_file->Append("  Compression dictionary block handle: ");
4263         out_file->Append(metaindex_iter->value().ToString(true).c_str());
4264         out_file->Append("\n");
4265       } else if (strstr(metaindex_iter->key().ToString().c_str(),
4266                         "filter.rocksdb.") != nullptr) {
4267         out_file->Append("  Filter block handle: ");
4268         out_file->Append(metaindex_iter->value().ToString(true).c_str());
4269         out_file->Append("\n");
4270       } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) {
4271         out_file->Append("  Range deletion block handle: ");
4272         out_file->Append(metaindex_iter->value().ToString(true).c_str());
4273         out_file->Append("\n");
4274       }
4275     }
4276     out_file->Append("\n");
4277   } else {
4278     return s;
4279   }
4280 
4281   // Output TableProperties
4282   const ROCKSDB_NAMESPACE::TableProperties* table_properties;
4283   table_properties = rep_->table_properties.get();
4284 
4285   if (table_properties != nullptr) {
4286     out_file->Append(
4287         "Table Properties:\n"
4288         "--------------------------------------\n"
4289         "  ");
4290     out_file->Append(table_properties->ToString("\n  ", ": ").c_str());
4291     out_file->Append("\n");
4292   }
4293 
4294   if (rep_->filter) {
4295     out_file->Append(
4296         "Filter Details:\n"
4297         "--------------------------------------\n"
4298         "  ");
4299     out_file->Append(rep_->filter->ToString().c_str());
4300     out_file->Append("\n");
4301   }
4302 
4303   // Output Index block
4304   s = DumpIndexBlock(out_file);
4305   if (!s.ok()) {
4306     return s;
4307   }
4308 
4309   // Output compression dictionary
4310   if (rep_->uncompression_dict_reader) {
4311     CachableEntry<UncompressionDict> uncompression_dict;
4312     s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
4313         nullptr /* prefetch_buffer */, false /* no_io */,
4314         nullptr /* get_context */, nullptr /* lookup_context */,
4315         &uncompression_dict);
4316     if (!s.ok()) {
4317       return s;
4318     }
4319 
4320     assert(uncompression_dict.GetValue());
4321 
4322     const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
4323     out_file->Append(
4324         "Compression Dictionary:\n"
4325         "--------------------------------------\n");
4326     out_file->Append("  size (bytes): ");
4327     out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size()));
4328     out_file->Append("\n\n");
4329     out_file->Append("  HEX    ");
4330     out_file->Append(raw_dict.ToString(true).c_str());
4331     out_file->Append("\n\n");
4332   }
4333 
4334   // Output range deletions block
4335   auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
4336   if (range_del_iter != nullptr) {
4337     range_del_iter->SeekToFirst();
4338     if (range_del_iter->Valid()) {
4339       out_file->Append(
4340           "Range deletions:\n"
4341           "--------------------------------------\n"
4342           "  ");
4343       for (; range_del_iter->Valid(); range_del_iter->Next()) {
4344         DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
4345       }
4346       out_file->Append("\n");
4347     }
4348     delete range_del_iter;
4349   }
4350   // Output Data blocks
4351   s = DumpDataBlocks(out_file);
4352 
4353   return s;
4354 }
4355 
4356 Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
4357   out_file->Append(
4358       "Index Details:\n"
4359       "--------------------------------------\n");
4360   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
4361       NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
4362                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
4363                        /*lookup_context=*/nullptr));
4364   Status s = blockhandles_iter->status();
4365   if (!s.ok()) {
4366     out_file->Append("Can not read Index Block \n\n");
4367     return s;
4368   }
4369 
4370   out_file->Append("  Block key hex dump: Data block handle\n");
4371   out_file->Append("  Block key ascii\n\n");
4372   for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
4373        blockhandles_iter->Next()) {
4374     s = blockhandles_iter->status();
4375     if (!s.ok()) {
4376       break;
4377     }
4378     Slice key = blockhandles_iter->key();
4379     Slice user_key;
4380     InternalKey ikey;
4381     if (!rep_->index_key_includes_seq) {
4382       user_key = key;
4383     } else {
4384       ikey.DecodeFrom(key);
4385       user_key = ikey.user_key();
4386     }
4387 
4388     out_file->Append("  HEX    ");
4389     out_file->Append(user_key.ToString(true).c_str());
4390     out_file->Append(": ");
4391     out_file->Append(blockhandles_iter->value()
4392                          .ToString(true, rep_->index_has_first_key)
4393                          .c_str());
4394     out_file->Append("\n");
4395 
4396     std::string str_key = user_key.ToString();
4397     std::string res_key("");
4398     char cspace = ' ';
4399     for (size_t i = 0; i < str_key.size(); i++) {
4400       res_key.append(&str_key[i], 1);
4401       res_key.append(1, cspace);
4402     }
4403     out_file->Append("  ASCII  ");
4404     out_file->Append(res_key.c_str());
4405     out_file->Append("\n  ------\n");
4406   }
4407   out_file->Append("\n");
4408   return Status::OK();
4409 }
4410 
4411 Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
4412   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
4413       NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
4414                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
4415                        /*lookup_context=*/nullptr));
4416   Status s = blockhandles_iter->status();
4417   if (!s.ok()) {
4418     out_file->Append("Can not read Index Block \n\n");
4419     return s;
4420   }
4421 
4422   uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
4423   uint64_t datablock_size_max = 0;
4424   uint64_t datablock_size_sum = 0;
4425 
4426   size_t block_id = 1;
4427   for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
4428        block_id++, blockhandles_iter->Next()) {
4429     s = blockhandles_iter->status();
4430     if (!s.ok()) {
4431       break;
4432     }
4433 
4434     BlockHandle bh = blockhandles_iter->value().handle;
4435     uint64_t datablock_size = bh.size();
4436     datablock_size_min = std::min(datablock_size_min, datablock_size);
4437     datablock_size_max = std::max(datablock_size_max, datablock_size);
4438     datablock_size_sum += datablock_size;
4439 
4440     out_file->Append("Data Block # ");
4441     out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id));
4442     out_file->Append(" @ ");
4443     out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str());
4444     out_file->Append("\n");
4445     out_file->Append("--------------------------------------\n");
4446 
4447     std::unique_ptr<InternalIterator> datablock_iter;
4448     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
4449         ReadOptions(), blockhandles_iter->value().handle,
4450         /*input_iter=*/nullptr, /*type=*/BlockType::kData,
4451         /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
4452         /*prefetch_buffer=*/nullptr));
4453     s = datablock_iter->status();
4454 
4455     if (!s.ok()) {
4456       out_file->Append("Error reading the block - Skipped \n\n");
4457       continue;
4458     }
4459 
4460     for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
4461          datablock_iter->Next()) {
4462       s = datablock_iter->status();
4463       if (!s.ok()) {
4464         out_file->Append("Error reading the block - Skipped \n");
4465         break;
4466       }
4467       DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
4468     }
4469     out_file->Append("\n");
4470   }
4471 
4472   uint64_t num_datablocks = block_id - 1;
4473   if (num_datablocks) {
4474     double datablock_size_avg =
4475         static_cast<double>(datablock_size_sum) / num_datablocks;
4476     out_file->Append("Data Block Summary:\n");
4477     out_file->Append("--------------------------------------");
4478     out_file->Append("\n  # data blocks: ");
4479     out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks));
4480     out_file->Append("\n  min data block size: ");
4481     out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min));
4482     out_file->Append("\n  max data block size: ");
4483     out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max));
4484     out_file->Append("\n  avg data block size: ");
4485     out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg));
4486     out_file->Append("\n");
4487   }
4488 
4489   return Status::OK();
4490 }
4491 
4492 void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
4493                                    WritableFile* out_file) {
4494   InternalKey ikey;
4495   ikey.DecodeFrom(key);
4496 
4497   out_file->Append("  HEX    ");
4498   out_file->Append(ikey.user_key().ToString(true).c_str());
4499   out_file->Append(": ");
4500   out_file->Append(value.ToString(true).c_str());
4501   out_file->Append("\n");
4502 
4503   std::string str_key = ikey.user_key().ToString();
4504   std::string str_value = value.ToString();
4505   std::string res_key(""), res_value("");
4506   char cspace = ' ';
4507   for (size_t i = 0; i < str_key.size(); i++) {
4508     if (str_key[i] == '\0') {
4509       res_key.append("\\0", 2);
4510     } else {
4511       res_key.append(&str_key[i], 1);
4512     }
4513     res_key.append(1, cspace);
4514   }
4515   for (size_t i = 0; i < str_value.size(); i++) {
4516     if (str_value[i] == '\0') {
4517       res_value.append("\\0", 2);
4518     } else {
4519       res_value.append(&str_value[i], 1);
4520     }
4521     res_value.append(1, cspace);
4522   }
4523 
4524   out_file->Append("  ASCII  ");
4525   out_file->Append(res_key.c_str());
4526   out_file->Append(": ");
4527   out_file->Append(res_value.c_str());
4528   out_file->Append("\n  ------\n");
4529 }
4530 
4531 }  // namespace ROCKSDB_NAMESPACE
4532