1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 #include "table/block_based/block_based_table_reader.h"
10 #include <algorithm>
11 #include <array>
12 #include <limits>
13 #include <string>
14 #include <utility>
15 #include <vector>
16 
17 #include "db/dbformat.h"
18 #include "db/pinned_iterators_manager.h"
19 
20 #include "file/file_prefetch_buffer.h"
21 #include "file/random_access_file_reader.h"
22 
23 #include "rocksdb/cache.h"
24 #include "rocksdb/comparator.h"
25 #include "rocksdb/env.h"
26 #include "rocksdb/file_system.h"
27 #include "rocksdb/filter_policy.h"
28 #include "rocksdb/iterator.h"
29 #include "rocksdb/options.h"
30 #include "rocksdb/statistics.h"
31 #include "rocksdb/table.h"
32 #include "rocksdb/table_properties.h"
33 
34 #include "table/block_based/block.h"
35 #include "table/block_based/block_based_filter_block.h"
36 #include "table/block_based/block_based_table_factory.h"
37 #include "table/block_based/block_prefix_index.h"
38 #include "table/block_based/filter_block.h"
39 #include "table/block_based/full_filter_block.h"
40 #include "table/block_based/partitioned_filter_block.h"
41 #include "table/block_fetcher.h"
42 #include "table/format.h"
43 #include "table/get_context.h"
44 #include "table/internal_iterator.h"
45 #include "table/meta_blocks.h"
46 #include "table/multiget_context.h"
47 #include "table/persistent_cache_helper.h"
48 #include "table/sst_file_writer_collectors.h"
49 #include "table/two_level_iterator.h"
50 
51 #include "monitoring/perf_context_imp.h"
52 #include "test_util/sync_point.h"
53 #include "util/coding.h"
54 #include "util/crc32c.h"
55 #include "util/stop_watch.h"
56 #include "util/string_util.h"
57 #include "util/xxhash.h"
58 
59 namespace rocksdb {
60 
61 extern const uint64_t kBlockBasedTableMagicNumber;
62 extern const std::string kHashIndexPrefixesBlock;
63 extern const std::string kHashIndexPrefixesMetadataBlock;
64 
65 typedef BlockBasedTable::IndexReader IndexReader;
66 
67 // Found that 256 KB readahead size provides the best performance, based on
68 // experiments, for auto readahead. Experiment data is in PR #3282.
69 const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;
70 
71 BlockBasedTable::~BlockBasedTable() {
72   delete rep_;
73 }
74 
75 std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);
76 
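// BlocklikeTraits<T> maps each block type that can live in the block cache
// (BlockContents, ParsedFullFilterBlock, Block, UncompressionDict) to a
// factory that builds the in-memory object from raw BlockContents read from
// the file. Only Block exposes restart points; the other specializations
// report zero restarts.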
77 template <typename TBlocklike>
78 class BlocklikeTraits;
79 
80 template <>
81 class BlocklikeTraits<BlockContents> {
82  public:
83   static BlockContents* Create(BlockContents&& contents,
84                                SequenceNumber /* global_seqno */,
85                                size_t /* read_amp_bytes_per_bit */,
86                                Statistics* /* statistics */,
87                                bool /* using_zstd */,
88                                const FilterPolicy* /* filter_policy */) {
89     return new BlockContents(std::move(contents));
90   }
91 
92   static uint32_t GetNumRestarts(const BlockContents& /* contents */) {
93     return 0;
94   }
95 };
96 
97 template <>
98 class BlocklikeTraits<ParsedFullFilterBlock> {
99  public:
100   static ParsedFullFilterBlock* Create(BlockContents&& contents,
101                                        SequenceNumber /* global_seqno */,
102                                        size_t /* read_amp_bytes_per_bit */,
103                                        Statistics* /* statistics */,
104                                        bool /* using_zstd */,
105                                        const FilterPolicy* filter_policy) {
106     return new ParsedFullFilterBlock(filter_policy, std::move(contents));
107   }
108 
109   static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) {
110     return 0;
111   }
112 };
113 
114 template <>
115 class BlocklikeTraits<Block> {
116  public:
117   static Block* Create(BlockContents&& contents, SequenceNumber global_seqno,
118                        size_t read_amp_bytes_per_bit, Statistics* statistics,
119                        bool /* using_zstd */,
120                        const FilterPolicy* /* filter_policy */) {
121     return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
122                      statistics);
123   }
124 
125   static uint32_t GetNumRestarts(const Block& block) {
126     return block.NumRestarts();
127   }
128 };
129 
130 template <>
131 class BlocklikeTraits<UncompressionDict> {
132  public:
133   static UncompressionDict* Create(BlockContents&& contents,
134                                    SequenceNumber /* global_seqno */,
135                                    size_t /* read_amp_bytes_per_bit */,
136                                    Statistics* /* statistics */,
137                                    bool using_zstd,
138                                    const FilterPolicy* /* filter_policy */) {
139     return new UncompressionDict(contents.data, std::move(contents.allocation),
140                                  using_zstd);
141   }
142 
143   static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) {
144     return 0;
145   }
146 };
147 
148 namespace {
149 // Read the block identified by "handle" from "file".
150 // The only relevant option is options.verify_checksums for now.
151 // On failure return non-OK.
152 // On success fill *result and return OK - caller owns *result
153 // @param uncompression_dict Data for presetting the compression library's
154 //    dictionary.
155 template <typename TBlocklike>
156 Status ReadBlockFromFile(
157     RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
158     const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
159     std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions,
160     bool do_uncompress, bool maybe_compressed, BlockType block_type,
161     const UncompressionDict& uncompression_dict,
162     const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
163     size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
164     bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) {
165   assert(result);
166 
167   BlockContents contents;
168   BlockFetcher block_fetcher(
169       file, prefetch_buffer, footer, options, handle, &contents, ioptions,
170       do_uncompress, maybe_compressed, block_type, uncompression_dict,
171       cache_options, memory_allocator, nullptr, for_compaction);
172   Status s = block_fetcher.ReadBlockContents();
173   if (s.ok()) {
174     result->reset(BlocklikeTraits<TBlocklike>::Create(
175         std::move(contents), global_seqno, read_amp_bytes_per_bit,
176         ioptions.statistics, using_zstd, filter_policy));
177   }
178 
179   return s;
180 }
181 
182 inline MemoryAllocator* GetMemoryAllocator(
183     const BlockBasedTableOptions& table_options) {
184   return table_options.block_cache.get()
185              ? table_options.block_cache->memory_allocator()
186              : nullptr;
187 }
188 
189 inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
190     const BlockBasedTableOptions& table_options) {
191   return table_options.block_cache_compressed.get()
192              ? table_options.block_cache_compressed->memory_allocator()
193              : nullptr;
194 }
195 
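// The helpers below are used as cleanup callbacks: DeleteCachedEntry matches
// the Cache deleter signature, while ForceReleaseCachedEntry and
// ReleaseCachedEntry match the generic (arg, handle) cleanup signature, so a
// block cache reference can be dropped once the consumer of the entry is done
// with it.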
196 // Delete the entry residing in the cache.
197 template <class Entry>
198 void DeleteCachedEntry(const Slice& /*key*/, void* value) {
199   auto entry = reinterpret_cast<Entry*>(value);
200   delete entry;
201 }
202 
203 // Release the cached entry and decrement its ref count.
204 void ForceReleaseCachedEntry(void* arg, void* h) {
205   Cache* cache = reinterpret_cast<Cache*>(arg);
206   Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
207   cache->Release(handle, true /* force_erase */);
208 }
209 
210 // Release the cached entry and decrement its ref count.
211 // Do not force erase
212 void ReleaseCachedEntry(void* arg, void* h) {
213   Cache* cache = reinterpret_cast<Cache*>(arg);
214   Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
215   cache->Release(handle, false /* force_erase */);
216 }
217 
218 // For hash based index, return true if prefix_extractor and
219 // prefix_extractor_block mismatch, false otherwise. This flag will be used
220 // as total_order_seek via NewIndexIterator
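// For example, a table built with prefix_extractor_name
// "rocksdb.FixedPrefix.4" that is later opened with a different (or no)
// prefix extractor returns true here, and the hash index is effectively
// bypassed in favor of total-order seek.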
221 bool PrefixExtractorChanged(const TableProperties* table_properties,
222                             const SliceTransform* prefix_extractor) {
223   // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
224   // Turn off hash index if prefix_extractor is not set; if prefix_extractor
225   // is set but prefix_extractor_block is not set, also disable hash index
226   if (prefix_extractor == nullptr || table_properties == nullptr ||
227       table_properties->prefix_extractor_name.empty()) {
228     return true;
229   }
230 
231   // prefix_extractor and prefix_extractor_block are both non-empty
232   if (table_properties->prefix_extractor_name.compare(
233           prefix_extractor->Name()) != 0) {
234     return true;
235   } else {
236     return false;
237   }
238 }
239 
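// Make a copy of a raw buffer in memory obtained from the cache's allocator,
// e.g. when the block data points into a prefetch buffer or an mmapped file
// region and therefore cannot be handed to the block cache as-is.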
240 CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
241   CacheAllocationPtr heap_buf;
242   heap_buf = AllocateBlock(buf.size(), allocator);
243   memcpy(heap_buf.get(), buf.data(), buf.size());
244   return heap_buf;
245 }
246 
247 }  // namespace
248 
249 // Encapsulates common functionality for the various index reader
250 // implementations. Provides access to the index block regardless of whether
251 // it is owned by the reader or stored in the cache, or whether it is pinned
252 // in the cache or not.
253 class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
254  public:
255   IndexReaderCommon(const BlockBasedTable* t,
256                     CachableEntry<Block>&& index_block)
257       : table_(t), index_block_(std::move(index_block)) {
258     assert(table_ != nullptr);
259   }
260 
261  protected:
262   static Status ReadIndexBlock(const BlockBasedTable* table,
263                                FilePrefetchBuffer* prefetch_buffer,
264                                const ReadOptions& read_options, bool use_cache,
265                                GetContext* get_context,
266                                BlockCacheLookupContext* lookup_context,
267                                CachableEntry<Block>* index_block);
268 
269   const BlockBasedTable* table() const { return table_; }
270 
271   const InternalKeyComparator* internal_comparator() const {
272     assert(table_ != nullptr);
273     assert(table_->get_rep() != nullptr);
274 
275     return &table_->get_rep()->internal_comparator;
276   }
277 
278   bool index_has_first_key() const {
279     assert(table_ != nullptr);
280     assert(table_->get_rep() != nullptr);
281     return table_->get_rep()->index_has_first_key;
282   }
283 
284   bool index_key_includes_seq() const {
285     assert(table_ != nullptr);
286     assert(table_->get_rep() != nullptr);
287     return table_->get_rep()->index_key_includes_seq;
288   }
289 
290   bool index_value_is_full() const {
291     assert(table_ != nullptr);
292     assert(table_->get_rep() != nullptr);
293     return table_->get_rep()->index_value_is_full;
294   }
295 
296   bool cache_index_blocks() const {
297     assert(table_ != nullptr);
298     assert(table_->get_rep() != nullptr);
299     return table_->get_rep()->table_options.cache_index_and_filter_blocks;
300   }
301 
302   Status GetOrReadIndexBlock(bool no_io, GetContext* get_context,
303                              BlockCacheLookupContext* lookup_context,
304                              CachableEntry<Block>* index_block) const;
305 
306   size_t ApproximateIndexBlockMemoryUsage() const {
307     assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
308     return index_block_.GetOwnValue()
309                ? index_block_.GetValue()->ApproximateMemoryUsage()
310                : 0;
311   }
312 
313  private:
314   const BlockBasedTable* table_;
315   CachableEntry<Block> index_block_;
316 };
317 
318 Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
319     const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
320     const ReadOptions& read_options, bool use_cache, GetContext* get_context,
321     BlockCacheLookupContext* lookup_context,
322     CachableEntry<Block>* index_block) {
323   PERF_TIMER_GUARD(read_index_block_nanos);
324 
325   assert(table != nullptr);
326   assert(index_block != nullptr);
327   assert(index_block->IsEmpty());
328 
329   const Rep* const rep = table->get_rep();
330   assert(rep != nullptr);
331 
332   const Status s = table->RetrieveBlock(
333       prefetch_buffer, read_options, rep->footer.index_handle(),
334       UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
335       get_context, lookup_context, /* for_compaction */ false, use_cache);
336 
337   return s;
338 }
339 
340 Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
341     bool no_io, GetContext* get_context,
342     BlockCacheLookupContext* lookup_context,
343     CachableEntry<Block>* index_block) const {
344   assert(index_block != nullptr);
345 
346   if (!index_block_.IsEmpty()) {
347     index_block->SetUnownedValue(index_block_.GetValue());
348     return Status::OK();
349   }
350 
351   ReadOptions read_options;
352   if (no_io) {
353     read_options.read_tier = kBlockCacheTier;
354   }
355 
356   return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
357                         cache_index_blocks(), get_context, lookup_context,
358                         index_block);
359 }
360 
361 // Index that allows binary search lookup in a two-level index structure.
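// The top level indexes the index partitions; each partition is itself a
// regular index block over data blocks. Partitions read by CacheDependencies()
// may additionally be pinned in partition_map_ so that later lookups can use
// them without going back to the block cache.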
362 class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
363  public:
364   // Read the partition index from the file and create an instance for
365   // `PartitionIndexReader`.
366   // On success, index_reader will be populated; otherwise it will remain
367   // unmodified.
368   static Status Create(const BlockBasedTable* table,
369                        FilePrefetchBuffer* prefetch_buffer, bool use_cache,
370                        bool prefetch, bool pin,
371                        BlockCacheLookupContext* lookup_context,
372                        std::unique_ptr<IndexReader>* index_reader) {
373     assert(table != nullptr);
374     assert(table->get_rep());
375     assert(!pin || prefetch);
376     assert(index_reader != nullptr);
377 
378     CachableEntry<Block> index_block;
379     if (prefetch || !use_cache) {
380       const Status s =
381           ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
382                          /*get_context=*/nullptr, lookup_context, &index_block);
383       if (!s.ok()) {
384         return s;
385       }
386 
387       if (use_cache && !pin) {
388         index_block.Reset();
389       }
390     }
391 
392     index_reader->reset(
393         new PartitionIndexReader(table, std::move(index_block)));
394 
395     return Status::OK();
396   }
397 
398   // return a two-level iterator: first level is on the partition index
399   InternalIteratorBase<IndexValue>* NewIterator(
400       const ReadOptions& read_options, bool /* disable_prefix_seek */,
401       IndexBlockIter* iter, GetContext* get_context,
402       BlockCacheLookupContext* lookup_context) override {
403     const bool no_io = (read_options.read_tier == kBlockCacheTier);
404     CachableEntry<Block> index_block;
405     const Status s =
406         GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
407     if (!s.ok()) {
408       if (iter != nullptr) {
409         iter->Invalidate(s);
410         return iter;
411       }
412 
413       return NewErrorInternalIterator<IndexValue>(s);
414     }
415 
416     InternalIteratorBase<IndexValue>* it = nullptr;
417 
418     Statistics* kNullStats = nullptr;
419     // Filters are already checked before seeking the index
420     if (!partition_map_.empty()) {
421       // We don't return pinned data from index blocks, so no need
422       // to set `block_contents_pinned`.
423       it = NewTwoLevelIterator(
424           new BlockBasedTable::PartitionedIndexIteratorState(table(),
425                                                              &partition_map_),
426           index_block.GetValue()->NewIndexIterator(
427               internal_comparator(), internal_comparator()->user_comparator(),
428               nullptr, kNullStats, true, index_has_first_key(),
429               index_key_includes_seq(), index_value_is_full()));
430     } else {
431       ReadOptions ro;
432       ro.fill_cache = read_options.fill_cache;
433       // We don't return pinned data from index blocks, so no need
434       // to set `block_contents_pinned`.
435       it = new BlockBasedTableIterator<IndexBlockIter, IndexValue>(
436           table(), ro, *internal_comparator(),
437           index_block.GetValue()->NewIndexIterator(
438               internal_comparator(), internal_comparator()->user_comparator(),
439               nullptr, kNullStats, true, index_has_first_key(),
440               index_key_includes_seq(), index_value_is_full()),
441           false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
442           lookup_context ? lookup_context->caller
443                          : TableReaderCaller::kUncategorized);
444     }
445 
446     assert(it != nullptr);
447     index_block.TransferTo(it);
448 
449     return it;
450 
451     // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
452     // on-stack BlockIter while the state is on heap. Currently it assumes
453     // the first level iter is always on heap and will attempt to delete it
454     // in its destructor.
455   }
456 
457   void CacheDependencies(bool pin) override {
458     // Before reading the partitions, prefetch them to avoid lots of IOs
459     BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
460     const BlockBasedTable::Rep* rep = table()->rep_;
461     IndexBlockIter biter;
462     BlockHandle handle;
463     Statistics* kNullStats = nullptr;
464 
465     CachableEntry<Block> index_block;
466     Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
467                                    &lookup_context, &index_block);
468     if (!s.ok()) {
469       ROCKS_LOG_WARN(rep->ioptions.info_log,
470                      "Error retrieving top-level index block while trying to "
471                      "cache index partitions: %s",
472                      s.ToString().c_str());
473       return;
474     }
475 
476     // We don't return pinned data from index blocks, so no need
477     // to set `block_contents_pinned`.
478     index_block.GetValue()->NewIndexIterator(
479         internal_comparator(), internal_comparator()->user_comparator(), &biter,
480         kNullStats, true, index_has_first_key(), index_key_includes_seq(),
481         index_value_is_full());
482     // Index partitions are assumed to be consecutive. Prefetch them all.
483     // Read the first block offset
484     biter.SeekToFirst();
485     if (!biter.Valid()) {
486       // Empty index.
487       return;
488     }
489     handle = biter.value().handle;
490     uint64_t prefetch_off = handle.offset();
491 
492     // Read the last block's offset
493     biter.SeekToLast();
494     if (!biter.Valid()) {
495       // Empty index.
496       return;
497     }
498     handle = biter.value().handle;
499     uint64_t last_off = handle.offset() + block_size(handle);
500     uint64_t prefetch_len = last_off - prefetch_off;
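    // The prefetch window spans from the first partition's offset through the
    // end of the last partition (block_size() includes the block trailer), so
    // a single prefetch covers all index partitions.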
501     std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
502     rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer);
503     s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off,
504                                   static_cast<size_t>(prefetch_len));
505 
506     // After prefetch, read the partitions one by one
507     biter.SeekToFirst();
508     auto ro = ReadOptions();
509     for (; biter.Valid(); biter.Next()) {
510       handle = biter.value().handle;
511       CachableEntry<Block> block;
512       // TODO: Support counter batch update for partitioned index and
513       // filter blocks
514       s = table()->MaybeReadBlockAndLoadToCache(
515           prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
516           &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context,
517           /*contents=*/nullptr);
518 
519       assert(s.ok() || block.GetValue() == nullptr);
520       if (s.ok() && block.GetValue() != nullptr) {
521         if (block.IsCached()) {
522           if (pin) {
523             partition_map_[handle.offset()] = std::move(block);
524           }
525         }
526       }
527     }
528   }
529 
530   size_t ApproximateMemoryUsage() const override {
531     size_t usage = ApproximateIndexBlockMemoryUsage();
532 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
533     usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this));
534 #else
535     usage += sizeof(*this);
536 #endif  // ROCKSDB_MALLOC_USABLE_SIZE
537     // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
538     return usage;
539   }
540 
541  private:
542   PartitionIndexReader(const BlockBasedTable* t,
543                        CachableEntry<Block>&& index_block)
544       : IndexReaderCommon(t, std::move(index_block)) {}
545 
546   std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
547 };
548 
549 // Index that allows binary search lookup for the first key of each block.
550 // This class can be viewed as a thin wrapper for `Block` class which already
551 // supports binary search.
552 class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
553  public:
554   // Read index from the file and create an instance for
555   // `BinarySearchIndexReader`.
556   // On success, index_reader will be populated; otherwise it will remain
557   // unmodified.
558   static Status Create(const BlockBasedTable* table,
559                        FilePrefetchBuffer* prefetch_buffer, bool use_cache,
560                        bool prefetch, bool pin,
561                        BlockCacheLookupContext* lookup_context,
562                        std::unique_ptr<IndexReader>* index_reader) {
563     assert(table != nullptr);
564     assert(table->get_rep());
565     assert(!pin || prefetch);
566     assert(index_reader != nullptr);
567 
568     CachableEntry<Block> index_block;
569     if (prefetch || !use_cache) {
570       const Status s =
571           ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
572                          /*get_context=*/nullptr, lookup_context, &index_block);
573       if (!s.ok()) {
574         return s;
575       }
576 
577       if (use_cache && !pin) {
578         index_block.Reset();
579       }
580     }
581 
582     index_reader->reset(
583         new BinarySearchIndexReader(table, std::move(index_block)));
584 
585     return Status::OK();
586   }
587 
588   InternalIteratorBase<IndexValue>* NewIterator(
589       const ReadOptions& read_options, bool /* disable_prefix_seek */,
590       IndexBlockIter* iter, GetContext* get_context,
591       BlockCacheLookupContext* lookup_context) override {
592     const bool no_io = (read_options.read_tier == kBlockCacheTier);
593     CachableEntry<Block> index_block;
594     const Status s =
595         GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
596     if (!s.ok()) {
597       if (iter != nullptr) {
598         iter->Invalidate(s);
599         return iter;
600       }
601 
602       return NewErrorInternalIterator<IndexValue>(s);
603     }
604 
605     Statistics* kNullStats = nullptr;
606     // We don't return pinned data from index blocks, so no need
607     // to set `block_contents_pinned`.
608     auto it = index_block.GetValue()->NewIndexIterator(
609         internal_comparator(), internal_comparator()->user_comparator(), iter,
610         kNullStats, true, index_has_first_key(), index_key_includes_seq(),
611         index_value_is_full());
612 
613     assert(it != nullptr);
614     index_block.TransferTo(it);
615 
616     return it;
617   }
618 
619   size_t ApproximateMemoryUsage() const override {
620     size_t usage = ApproximateIndexBlockMemoryUsage();
621 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
622     usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this));
623 #else
624     usage += sizeof(*this);
625 #endif  // ROCKSDB_MALLOC_USABLE_SIZE
626     return usage;
627   }
628 
629  private:
630   BinarySearchIndexReader(const BlockBasedTable* t,
631                           CachableEntry<Block>&& index_block)
632       : IndexReaderCommon(t, std::move(index_block)) {}
633 };
634 
635 // Index that leverages an internal hash table to quicken the lookup for a given
636 // key.
637 class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
638  public:
639   static Status Create(const BlockBasedTable* table,
640                        FilePrefetchBuffer* prefetch_buffer,
641                        InternalIterator* meta_index_iter, bool use_cache,
642                        bool prefetch, bool pin,
643                        BlockCacheLookupContext* lookup_context,
644                        std::unique_ptr<IndexReader>* index_reader) {
645     assert(table != nullptr);
646     assert(index_reader != nullptr);
647     assert(!pin || prefetch);
648 
649     const BlockBasedTable::Rep* rep = table->get_rep();
650     assert(rep != nullptr);
651 
652     CachableEntry<Block> index_block;
653     if (prefetch || !use_cache) {
654       const Status s =
655           ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
656                          /*get_context=*/nullptr, lookup_context, &index_block);
657       if (!s.ok()) {
658         return s;
659       }
660 
661       if (use_cache && !pin) {
662         index_block.Reset();
663       }
664     }
665 
666     // Note, failure to create prefix hash index does not need to be a
667     // hard error. We can still fall back to the original binary search index.
668     // So, Create will succeed regardless, from this point on.
669 
670     index_reader->reset(new HashIndexReader(table, std::move(index_block)));
671 
672     // Get prefixes block
673     BlockHandle prefixes_handle;
674     Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
675                              &prefixes_handle);
676     if (!s.ok()) {
677       // TODO: log error
678       return Status::OK();
679     }
680 
681     // Get index metadata block
682     BlockHandle prefixes_meta_handle;
683     s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
684                       &prefixes_meta_handle);
685     if (!s.ok()) {
686       // TODO: log error
687       return Status::OK();
688     }
689 
690     RandomAccessFileReader* const file = rep->file.get();
691     const Footer& footer = rep->footer;
692     const ImmutableCFOptions& ioptions = rep->ioptions;
693     const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
694     MemoryAllocator* const memory_allocator =
695         GetMemoryAllocator(rep->table_options);
696 
697     // Read contents for the blocks
698     BlockContents prefixes_contents;
699     BlockFetcher prefixes_block_fetcher(
700         file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
701         &prefixes_contents, ioptions, true /*decompress*/,
702         true /*maybe_compressed*/, BlockType::kHashIndexPrefixes,
703         UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
704     s = prefixes_block_fetcher.ReadBlockContents();
705     if (!s.ok()) {
706       return s;
707     }
708     BlockContents prefixes_meta_contents;
709     BlockFetcher prefixes_meta_block_fetcher(
710         file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
711         &prefixes_meta_contents, ioptions, true /*decompress*/,
712         true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
713         UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
714     s = prefixes_meta_block_fetcher.ReadBlockContents();
715     if (!s.ok()) {
716       // TODO: log error
717       return Status::OK();
718     }
719 
720     BlockPrefixIndex* prefix_index = nullptr;
721     s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
722                                  prefixes_contents.data,
723                                  prefixes_meta_contents.data, &prefix_index);
724     // TODO: log error
725     if (s.ok()) {
726       HashIndexReader* const hash_index_reader =
727           static_cast<HashIndexReader*>(index_reader->get());
728       hash_index_reader->prefix_index_.reset(prefix_index);
729     }
730 
731     return Status::OK();
732   }
733 
734   InternalIteratorBase<IndexValue>* NewIterator(
735       const ReadOptions& read_options, bool disable_prefix_seek,
736       IndexBlockIter* iter, GetContext* get_context,
737       BlockCacheLookupContext* lookup_context) override {
738     const bool no_io = (read_options.read_tier == kBlockCacheTier);
739     CachableEntry<Block> index_block;
740     const Status s =
741         GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
742     if (!s.ok()) {
743       if (iter != nullptr) {
744         iter->Invalidate(s);
745         return iter;
746       }
747 
748       return NewErrorInternalIterator<IndexValue>(s);
749     }
750 
751     Statistics* kNullStats = nullptr;
752     const bool total_order_seek =
753         read_options.total_order_seek || disable_prefix_seek;
754     // We don't return pinned data from index blocks, so no need
755     // to set `block_contents_pinned`.
756     auto it = index_block.GetValue()->NewIndexIterator(
757         internal_comparator(), internal_comparator()->user_comparator(), iter,
758         kNullStats, total_order_seek, index_has_first_key(),
759         index_key_includes_seq(), index_value_is_full(),
760         false /* block_contents_pinned */, prefix_index_.get());
761 
762     assert(it != nullptr);
763     index_block.TransferTo(it);
764 
765     return it;
766   }
767 
768   size_t ApproximateMemoryUsage() const override {
769     size_t usage = ApproximateIndexBlockMemoryUsage();
770 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
771     usage += malloc_usable_size(const_cast<HashIndexReader*>(this));
772 #else
773     if (prefix_index_) {
774       usage += prefix_index_->ApproximateMemoryUsage();
775     }
776     usage += sizeof(*this);
777 #endif  // ROCKSDB_MALLOC_USABLE_SIZE
778     return usage;
779   }
780 
781  private:
782   HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
783       : IndexReaderCommon(t, std::move(index_block)) {}
784 
785   std::unique_ptr<BlockPrefixIndex> prefix_index_;
786 };
787 
788 void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
789                                             GetContext* get_context,
790                                             size_t usage) const {
791   Statistics* const statistics = rep_->ioptions.statistics;
792 
793   PERF_COUNTER_ADD(block_cache_hit_count, 1);
794   PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
795                             static_cast<uint32_t>(rep_->level));
796 
797   if (get_context) {
798     ++get_context->get_context_stats_.num_cache_hit;
799     get_context->get_context_stats_.num_cache_bytes_read += usage;
800   } else {
801     RecordTick(statistics, BLOCK_CACHE_HIT);
802     RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
803   }
804 
805   switch (block_type) {
806     case BlockType::kFilter:
807       PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);
808 
809       if (get_context) {
810         ++get_context->get_context_stats_.num_cache_filter_hit;
811       } else {
812         RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
813       }
814       break;
815 
816     case BlockType::kCompressionDictionary:
817       // TODO: introduce perf counter for compression dictionary hit count
818       if (get_context) {
819         ++get_context->get_context_stats_.num_cache_compression_dict_hit;
820       } else {
821         RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
822       }
823       break;
824 
825     case BlockType::kIndex:
826       PERF_COUNTER_ADD(block_cache_index_hit_count, 1);
827 
828       if (get_context) {
829         ++get_context->get_context_stats_.num_cache_index_hit;
830       } else {
831         RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
832       }
833       break;
834 
835     default:
836       // TODO: introduce dedicated tickers/statistics/counters
837       // for range tombstones
838       if (get_context) {
839         ++get_context->get_context_stats_.num_cache_data_hit;
840       } else {
841         RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
842       }
843       break;
844   }
845 }
846 
847 void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
848                                              GetContext* get_context) const {
849   Statistics* const statistics = rep_->ioptions.statistics;
850 
851   // TODO: introduce aggregate (not per-level) block cache miss count
852   PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
853                             static_cast<uint32_t>(rep_->level));
854 
855   if (get_context) {
856     ++get_context->get_context_stats_.num_cache_miss;
857   } else {
858     RecordTick(statistics, BLOCK_CACHE_MISS);
859   }
860 
861   // TODO: introduce perf counters for misses per block type
862   switch (block_type) {
863     case BlockType::kFilter:
864       if (get_context) {
865         ++get_context->get_context_stats_.num_cache_filter_miss;
866       } else {
867         RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
868       }
869       break;
870 
871     case BlockType::kCompressionDictionary:
872       if (get_context) {
873         ++get_context->get_context_stats_.num_cache_compression_dict_miss;
874       } else {
875         RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
876       }
877       break;
878 
879     case BlockType::kIndex:
880       if (get_context) {
881         ++get_context->get_context_stats_.num_cache_index_miss;
882       } else {
883         RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
884       }
885       break;
886 
887     default:
888       // TODO: introduce dedicated tickers/statistics/counters
889       // for range tombstones
890       if (get_context) {
891         ++get_context->get_context_stats_.num_cache_data_miss;
892       } else {
893         RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
894       }
895       break;
896   }
897 }
898 
899 void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
900                                                   GetContext* get_context,
901                                                   size_t usage) const {
902   Statistics* const statistics = rep_->ioptions.statistics;
903 
904   // TODO: introduce perf counters for block cache insertions
905   if (get_context) {
906     ++get_context->get_context_stats_.num_cache_add;
907     get_context->get_context_stats_.num_cache_bytes_write += usage;
908   } else {
909     RecordTick(statistics, BLOCK_CACHE_ADD);
910     RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
911   }
912 
913   switch (block_type) {
914     case BlockType::kFilter:
915       if (get_context) {
916         ++get_context->get_context_stats_.num_cache_filter_add;
917         get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
918       } else {
919         RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
920         RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
921       }
922       break;
923 
924     case BlockType::kCompressionDictionary:
925       if (get_context) {
926         ++get_context->get_context_stats_.num_cache_compression_dict_add;
927         get_context->get_context_stats_
928             .num_cache_compression_dict_bytes_insert += usage;
929       } else {
930         RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
931         RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
932                    usage);
933       }
934       break;
935 
936     case BlockType::kIndex:
937       if (get_context) {
938         ++get_context->get_context_stats_.num_cache_index_add;
939         get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
940       } else {
941         RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
942         RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
943       }
944       break;
945 
946     default:
947       // TODO: introduce dedicated tickers/statistics/counters
948       // for range tombstones
949       if (get_context) {
950         ++get_context->get_context_stats_.num_cache_data_add;
951         get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
952       } else {
953         RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
954         RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
955       }
956       break;
957   }
958 }
959 
960 Cache::Handle* BlockBasedTable::GetEntryFromCache(
961     Cache* block_cache, const Slice& key, BlockType block_type,
962     GetContext* get_context) const {
963   auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);
964 
965   if (cache_handle != nullptr) {
966     UpdateCacheHitMetrics(block_type, get_context,
967                           block_cache->GetUsage(cache_handle));
968   } else {
969     UpdateCacheMissMetrics(block_type, get_context);
970   }
971 
972   return cache_handle;
973 }
974 
975 // Helper function to setup the cache key's prefix for the Table.
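// Each of the three caches (block cache, persistent cache and compressed
// block cache) gets its own key prefix for this table, generated in
// GenerateCachePrefix below. Per-block cache keys are then formed by
// appending the block offset to the prefix (see GetCacheKey).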
976 void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
977   assert(kMaxCacheKeyPrefixSize >= 10);
978   rep->cache_key_prefix_size = 0;
979   rep->compressed_cache_key_prefix_size = 0;
980   if (rep->table_options.block_cache != nullptr) {
981     GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
982                         &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
983   }
984   if (rep->table_options.persistent_cache != nullptr) {
985     GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
986                         &rep->persistent_cache_key_prefix[0],
987                         &rep->persistent_cache_key_prefix_size);
988   }
989   if (rep->table_options.block_cache_compressed != nullptr) {
990     GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
991                         rep->file->file(), &rep->compressed_cache_key_prefix[0],
992                         &rep->compressed_cache_key_prefix_size);
993   }
994 }
995 
996 void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
997                                           char* buffer, size_t* size) {
998   // generate an id from the file
999   *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
1000 
1001   // If the prefix wasn't generated or was too long,
1002   // create one from the cache.
1003   if (cc != nullptr && *size == 0) {
1004     char* end = EncodeVarint64(buffer, cc->NewId());
1005     *size = static_cast<size_t>(end - buffer);
1006   }
1007 }
1008 
1009 void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file,
1010                                           char* buffer, size_t* size) {
1011   // generate an id from the file
1012   *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
1013 
1014   // If the prefix wasn't generated or was too long,
1015   // create one from the cache.
1016   if (cc != nullptr && *size == 0) {
1017     char* end = EncodeVarint64(buffer, cc->NewId());
1018     *size = static_cast<size_t>(end - buffer);
1019   }
1020 }
1021 
1022 namespace {
1023 // Return true if table_properties has `user_prop_name` with a `true` value,
1024 // or if it doesn't contain this property (for backward compatibility).
1025 bool IsFeatureSupported(const TableProperties& table_properties,
1026                         const std::string& user_prop_name, Logger* info_log) {
1027   auto& props = table_properties.user_collected_properties;
1028   auto pos = props.find(user_prop_name);
1029   // Older version doesn't have this value set. Skip this check.
1030   if (pos != props.end()) {
1031     if (pos->second == kPropFalse) {
1032       return false;
1033     } else if (pos->second != kPropTrue) {
1034       ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s",
1035                      user_prop_name.c_str(), pos->second.c_str());
1036     }
1037   }
1038   return true;
1039 }
1040 
1041 // Caller has to ensure seqno is not nullptr.
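// Resolves the global sequence number of an SST file from its user-collected
// properties: regular files and v1 external files must not carry a global
// seqno, while v2+ external files may; if largest_seqno is known, the two must
// agree (a zero global seqno is taken to mean largest_seqno).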
1042 Status GetGlobalSequenceNumber(const TableProperties& table_properties,
1043                                SequenceNumber largest_seqno,
1044                                SequenceNumber* seqno) {
1045   const auto& props = table_properties.user_collected_properties;
1046   const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
1047   const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
1048 
1049   *seqno = kDisableGlobalSequenceNumber;
1050   if (version_pos == props.end()) {
1051     if (seqno_pos != props.end()) {
1052       std::array<char, 200> msg_buf;
1053       // This is not an external sst file, global_seqno is not supported.
1054       snprintf(
1055           msg_buf.data(), msg_buf.max_size(),
1056           "A non-external sst file have global seqno property with value %s",
1057           seqno_pos->second.c_str());
1058       return Status::Corruption(msg_buf.data());
1059     }
1060     return Status::OK();
1061   }
1062 
1063   uint32_t version = DecodeFixed32(version_pos->second.c_str());
1064   if (version < 2) {
1065     if (seqno_pos != props.end() || version != 1) {
1066       std::array<char, 200> msg_buf;
1067       // This is a v1 external sst file, global_seqno is not supported.
1068       snprintf(msg_buf.data(), msg_buf.max_size(),
1069                "An external sst file with version %u have global seqno "
1070                "property with value %s",
1071                version, seqno_pos->second.c_str());
1072       return Status::Corruption(msg_buf.data());
1073     }
1074     return Status::OK();
1075   }
1076 
1077   // Since we have a plan to deprecate global_seqno, we do not return failure
1078   // if seqno_pos == props.end(). We rely on version_pos to detect whether the
1079   // SST is external.
1080   SequenceNumber global_seqno(0);
1081   if (seqno_pos != props.end()) {
1082     global_seqno = DecodeFixed64(seqno_pos->second.c_str());
1083   }
1084   // SstTableReader opens the table reader with kMaxSequenceNumber as largest_seqno
1085   // to denote it is unknown.
1086   if (largest_seqno < kMaxSequenceNumber) {
1087     if (global_seqno == 0) {
1088       global_seqno = largest_seqno;
1089     }
1090     if (global_seqno != largest_seqno) {
1091       std::array<char, 200> msg_buf;
1092       snprintf(
1093           msg_buf.data(), msg_buf.max_size(),
1094           "An external sst file with version %u have global seqno property "
1095           "with value %s, while largest seqno in the file is %llu",
1096           version, seqno_pos->second.c_str(),
1097           static_cast<unsigned long long>(largest_seqno));
1098       return Status::Corruption(msg_buf.data());
1099     }
1100   }
1101   *seqno = global_seqno;
1102 
1103   if (global_seqno > kMaxSequenceNumber) {
1104     std::array<char, 200> msg_buf;
1105     snprintf(msg_buf.data(), msg_buf.max_size(),
1106              "An external sst file with version %u have global seqno property "
1107              "with value %llu, which is greater than kMaxSequenceNumber",
1108              version, static_cast<unsigned long long>(global_seqno));
1109     return Status::Corruption(msg_buf.data());
1110   }
1111 
1112   return Status::OK();
1113 }
1114 }  // namespace
1115 
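// Block cache keys have the layout [cache key prefix][varint64 block offset]:
// the prefix identifies the file and the offset identifies the block within
// it.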
1116 Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
1117                                    size_t cache_key_prefix_size,
1118                                    const BlockHandle& handle, char* cache_key) {
1119   assert(cache_key != nullptr);
1120   assert(cache_key_prefix_size != 0);
1121   assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
1122   memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
1123   char* end =
1124       EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
1125   return Slice(cache_key, static_cast<size_t>(end - cache_key));
1126 }
1127 
1128 Status BlockBasedTable::Open(
1129     const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
1130     const BlockBasedTableOptions& table_options,
1131     const InternalKeyComparator& internal_comparator,
1132     std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
1133     std::unique_ptr<TableReader>* table_reader,
1134     const SliceTransform* prefix_extractor,
1135     const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
1136     const int level, const bool immortal_table,
1137     const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
1138     BlockCacheTracer* const block_cache_tracer) {
1139   table_reader->reset();
1140 
1141   Status s;
1142   Footer footer;
1143   std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
1144 
1145   // prefetch both index and filters, down to all partitions
1146   const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
1147   const bool preload_all = !table_options.cache_index_and_filter_blocks;
1148 
1149   if (!ioptions.allow_mmap_reads) {
1150     s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
1151                      preload_all, &prefetch_buffer);
1152   } else {
1153     // Should not prefetch for mmap mode.
1154     prefetch_buffer.reset(new FilePrefetchBuffer(
1155         nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
1156   }
1157 
1158   // Read in the following order:
1159   //    1. Footer
1160   //    2. [metaindex block]
1161   //    3. [meta block: properties]
1162   //    4. [meta block: range deletion tombstone]
1163   //    5. [meta block: compression dictionary]
1164   //    6. [meta block: index]
1165   //    7. [meta block: filter]
1166   s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
1167                          kBlockBasedTableMagicNumber);
1168   if (!s.ok()) {
1169     return s;
1170   }
1171   if (!BlockBasedTableSupportedVersion(footer.version())) {
1172     return Status::Corruption(
1173         "Unknown Footer version. Maybe this file was created with newer "
1174         "version of RocksDB?");
1175   }
1176 
1177   // We've successfully read the footer. We are ready to serve requests.
1178   // Better not to mutate rep_ after creation; e.g., the internal_prefix_transform
1179   // raw pointer will be used to create HashIndexReader, whose reset may
1180   // access a dangling pointer.
1181   BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
1182   Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
1183                                       internal_comparator, skip_filters, level,
1184                                       immortal_table);
1185   rep->file = std::move(file);
1186   rep->footer = footer;
1187   rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
1188   // We need to wrap data with internal_prefix_transform to make sure it can
1189   // handle prefix correctly.
1190   rep->internal_prefix_transform.reset(
1191       new InternalKeySliceTransform(prefix_extractor));
1192   SetupCacheKeyPrefix(rep);
1193   std::unique_ptr<BlockBasedTable> new_table(
1194       new BlockBasedTable(rep, block_cache_tracer));
1195 
1196   // page cache options
1197   rep->persistent_cache_options =
1198       PersistentCacheOptions(rep->table_options.persistent_cache,
1199                              std::string(rep->persistent_cache_key_prefix,
1200                                          rep->persistent_cache_key_prefix_size),
1201                              rep->ioptions.statistics);
1202 
1203   // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
1204   // handle to null, otherwise it may be seen as uninitialized during the below
1205   // meta-block reads.
1206   rep->compression_dict_handle = BlockHandle::NullBlockHandle();
1207 
1208   // Read metaindex
1209   std::unique_ptr<Block> metaindex;
1210   std::unique_ptr<InternalIterator> metaindex_iter;
1211   s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex,
1212                                     &metaindex_iter);
1213   if (!s.ok()) {
1214     return s;
1215   }
1216 
1217   // Populates table_properties and some fields that depend on it,
1218   // such as index_type.
1219   s = new_table->ReadPropertiesBlock(prefetch_buffer.get(),
1220                                      metaindex_iter.get(), largest_seqno);
1221   if (!s.ok()) {
1222     return s;
1223   }
1224   s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(),
1225                                    internal_comparator, &lookup_context);
1226   if (!s.ok()) {
1227     return s;
1228   }
1229   s = new_table->PrefetchIndexAndFilterBlocks(
1230       prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
1231       prefetch_all, table_options, level, &lookup_context);
1232 
1233   if (s.ok()) {
1234     // Update tail prefetch stats
1235     assert(prefetch_buffer.get() != nullptr);
1236     if (tail_prefetch_stats != nullptr) {
1237       assert(prefetch_buffer->min_offset_read() < file_size);
1238       tail_prefetch_stats->RecordEffectiveSize(
1239           static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
1240     }
1241 
1242     *table_reader = std::move(new_table);
1243   }
1244 
1245   return s;
1246 }
1247 
1248 Status BlockBasedTable::PrefetchTail(
1249     RandomAccessFileReader* file, uint64_t file_size,
1250     TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
1251     const bool preload_all,
1252     std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
1253   size_t tail_prefetch_size = 0;
1254   if (tail_prefetch_stats != nullptr) {
1255     // Multiple threads may get a 0 (no history) when running in parallel,
1256     // but it will get cleared after the first of them finishes.
1257     tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
1258   }
1259   if (tail_prefetch_size == 0) {
1260     // Before reading the footer, read ahead backwards to prefetch data. Do more
1261     // readahead if we're going to read index/filter.
1262     // TODO: This may incorrectly select small readahead in case partitioned
1263     // index/filter is enabled and top-level partition pinning is enabled.
1264     // That's because we need to issue readahead before we read the properties,
1265     // at which point we don't yet know the index type.
1266     tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
1267   }
1268   size_t prefetch_off;
1269   size_t prefetch_len;
1270   if (file_size < tail_prefetch_size) {
1271     prefetch_off = 0;
1272     prefetch_len = static_cast<size_t>(file_size);
1273   } else {
1274     prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
1275     prefetch_len = tail_prefetch_size;
1276   }
1277   TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
1278                            &tail_prefetch_size);
1279   Status s;
1280   // TODO should not have this special logic in the future.
1281   if (!file->use_direct_io()) {
1282     prefetch_buffer->reset(new FilePrefetchBuffer(
1283         nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
1284     s = file->Prefetch(prefetch_off, prefetch_len);
1285   } else {
1286     prefetch_buffer->reset(new FilePrefetchBuffer(
1287         nullptr, 0, 0, true /* enable */, true /* track_min_offset */));
1288     s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
1289   }
1290   return s;
1291 }
1292 
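// Helper used when (re)verifying a block that was read outside of BlockFetcher,
// e.g. the properties block re-verified with a rewritten global seqno in
// TryReadPropertiesWithGlobalSeqno(), or the blocks fetched by
// RetrieveMultipleBlocks(). `len` covers the block contents plus the 1-byte
// compression-type byte of the trailer, and `expected` is the 32-bit checksum
// stored right after that byte (unmasked first in the kCRC32c case).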
1293 Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
1294                       uint32_t expected) {
1295   Status s;
1296   uint32_t actual = 0;
1297   switch (type) {
1298     case kNoChecksum:
1299       break;
1300     case kCRC32c:
1301       expected = crc32c::Unmask(expected);
1302       actual = crc32c::Value(buf, len);
1303       break;
1304     case kxxHash:
1305       actual = XXH32(buf, static_cast<int>(len), 0);
1306       break;
1307     case kxxHash64:
1308       actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
1309                                      uint64_t{0xffffffff});
1310       break;
1311     default:
1312       s = Status::Corruption("unknown checksum type");
1313   }
1314   if (s.ok() && actual != expected) {
1315     s = Status::Corruption("block checksum mismatch");
1316   }
1317   return s;
1318 }
1319 
1320 Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
1321     FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
1322     TableProperties** table_properties) {
1323   assert(table_properties != nullptr);
1324   // If this is an external SST file ingested with write_global_seqno set to
1325   // true, then we expect a checksum mismatch, because the checksum was written
1326   // by SstFileWriter while the global seqno in the properties block may have
1327   // been changed during ingestion. In this case, we read the properties
1328   // block, copy it to a memory buffer, change the global seqno back to its
1329   // original value, i.e. 0, and verify the checksum again.
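  // The global seqno is stored as a fixed-width 64-bit property, so it can be
  // patched in place (EncodeFixed64 below) at its recorded offset within the
  // block before the checksum is recomputed over the whole block.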
1330   BlockHandle props_block_handle;
1331   CacheAllocationPtr tmp_buf;
1332   Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
1333                             rep_->footer, rep_->ioptions, table_properties,
1334                             false /* verify_checksum */, &props_block_handle,
1335                             &tmp_buf, false /* compression_type_missing */,
1336                             nullptr /* memory_allocator */);
1337   if (s.ok() && tmp_buf) {
1338     const auto seqno_pos_iter =
1339         (*table_properties)
1340             ->properties_offsets.find(
1341                 ExternalSstFilePropertyNames::kGlobalSeqno);
1342     size_t block_size = static_cast<size_t>(props_block_handle.size());
1343     if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
1344       uint64_t global_seqno_offset = seqno_pos_iter->second;
1345       EncodeFixed64(
1346           tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
1347     }
1348     uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
1349     s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(),
1350                                 block_size + 1, value);
1351   }
1352   return s;
1353 }
1354 
1355 Status BlockBasedTable::ReadPropertiesBlock(
1356     FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1357     const SequenceNumber largest_seqno) {
1358   bool found_properties_block = true;
1359   Status s;
1360   s = SeekToPropertiesBlock(meta_iter, &found_properties_block);
1361 
1362   if (!s.ok()) {
1363     ROCKS_LOG_WARN(rep_->ioptions.info_log,
1364                    "Error when seeking to properties block from file: %s",
1365                    s.ToString().c_str());
1366   } else if (found_properties_block) {
1367     s = meta_iter->status();
1368     TableProperties* table_properties = nullptr;
1369     if (s.ok()) {
1370       s = ReadProperties(
1371           meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
1372           rep_->ioptions, &table_properties, true /* verify_checksum */,
1373           nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
1374           false /* compression_type_missing */, nullptr /* memory_allocator */);
1375     }
1376 
1377     if (s.IsCorruption()) {
1378       s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
1379                                            &table_properties);
1380     }
1381     std::unique_ptr<TableProperties> props_guard;
1382     if (table_properties != nullptr) {
1383       props_guard.reset(table_properties);
1384     }
1385 
1386     if (!s.ok()) {
1387       ROCKS_LOG_WARN(rep_->ioptions.info_log,
1388                      "Encountered error while reading data from properties "
1389                      "block %s",
1390                      s.ToString().c_str());
1391     } else {
1392       assert(table_properties != nullptr);
1393       rep_->table_properties.reset(props_guard.release());
1394       rep_->blocks_maybe_compressed =
1395           rep_->table_properties->compression_name !=
1396           CompressionTypeToString(kNoCompression);
1397       rep_->blocks_definitely_zstd_compressed =
1398           (rep_->table_properties->compression_name ==
1399                CompressionTypeToString(kZSTD) ||
1400            rep_->table_properties->compression_name ==
1401                CompressionTypeToString(kZSTDNotFinalCompression));
1402     }
1403   } else {
1404     ROCKS_LOG_ERROR(rep_->ioptions.info_log,
1405                     "Cannot find Properties block from file.");
1406   }
1407 #ifndef ROCKSDB_LITE
1408   if (rep_->table_properties) {
1409     ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
1410                         &(rep_->table_prefix_extractor));
1411   }
1412 #endif  // ROCKSDB_LITE
1413 
1414   // If the table properties were read, use them to configure the reader.
1415   if (rep_->table_properties) {
1416     rep_->whole_key_filtering &=
1417         IsFeatureSupported(*(rep_->table_properties),
1418                            BlockBasedTablePropertyNames::kWholeKeyFiltering,
1419                            rep_->ioptions.info_log);
1420     rep_->prefix_filtering &=
1421         IsFeatureSupported(*(rep_->table_properties),
1422                            BlockBasedTablePropertyNames::kPrefixFiltering,
1423                            rep_->ioptions.info_log);
1424 
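    // Derived from the properties: index keys contain the full internal key
    // (with sequence number) unless index_key_is_user_key is set, and index
    // values are stored in full unless they are delta encoded.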
1425     rep_->index_key_includes_seq =
1426         rep_->table_properties->index_key_is_user_key == 0;
1427     rep_->index_value_is_full =
1428         rep_->table_properties->index_value_is_delta_encoded == 0;
1429 
1430     // Update index_type with the true type.
1431     // If table properties don't contain index type, we assume that the table
1432     // is in a very old format and has the kBinarySearch index type.
1433     auto& props = rep_->table_properties->user_collected_properties;
1434     auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
1435     if (pos != props.end()) {
1436       rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
1437           DecodeFixed32(pos->second.c_str()));
1438     }
1439 
1440     rep_->index_has_first_key =
1441         rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
1442 
1443     s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
1444                                 &(rep_->global_seqno));
1445     if (!s.ok()) {
1446       ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
1447     }
1448   }
1449   return s;
1450 }
1451 
1452 Status BlockBasedTable::ReadRangeDelBlock(
1453     FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1454     const InternalKeyComparator& internal_comparator,
1455     BlockCacheLookupContext* lookup_context) {
1456   Status s;
1457   bool found_range_del_block;
1458   BlockHandle range_del_handle;
1459   s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
1460   if (!s.ok()) {
1461     ROCKS_LOG_WARN(
1462         rep_->ioptions.info_log,
1463         "Error when seeking to range delete tombstones block from file: %s",
1464         s.ToString().c_str());
1465   } else if (found_range_del_block && !range_del_handle.IsNull()) {
1466     ReadOptions read_options;
1467     std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
1468         read_options, range_del_handle,
1469         /*input_iter=*/nullptr, BlockType::kRangeDeletion,
1470         /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
1471     assert(iter != nullptr);
1472     s = iter->status();
1473     if (!s.ok()) {
1474       ROCKS_LOG_WARN(
1475           rep_->ioptions.info_log,
1476           "Encountered error while reading data from range del block %s",
1477           s.ToString().c_str());
1478     } else {
1479       rep_->fragmented_range_dels =
1480           std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
1481                                                          internal_comparator);
1482     }
1483   }
1484   return s;
1485 }
1486 
1487 Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
1488     FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1489     BlockBasedTable* new_table, bool prefetch_all,
1490     const BlockBasedTableOptions& table_options, const int level,
1491     BlockCacheLookupContext* lookup_context) {
1492   Status s;
1493 
1494   // Find filter handle and filter type
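  // The filter block is located in the metaindex under a key of the form
  // "<prefix><policy name>", where the prefix encodes the filter type
  // (full, partitioned, or block-based); the first matching entry wins.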
1495   if (rep_->filter_policy) {
1496     for (auto filter_type :
1497          {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
1498           Rep::FilterType::kBlockFilter}) {
1499       std::string prefix;
1500       switch (filter_type) {
1501         case Rep::FilterType::kFullFilter:
1502           prefix = kFullFilterBlockPrefix;
1503           break;
1504         case Rep::FilterType::kPartitionedFilter:
1505           prefix = kPartitionedFilterBlockPrefix;
1506           break;
1507         case Rep::FilterType::kBlockFilter:
1508           prefix = kFilterBlockPrefix;
1509           break;
1510         default:
1511           assert(0);
1512       }
1513       std::string filter_block_key = prefix;
1514       filter_block_key.append(rep_->filter_policy->Name());
1515       if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
1516               .ok()) {
1517         rep_->filter_type = filter_type;
1518         break;
1519       }
1520     }
1521   }
1522 
1523   // Find compression dictionary handle
1524   bool found_compression_dict = false;
1525   s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
1526                                  &rep_->compression_dict_handle);
1527   if (!s.ok()) {
1528     return s;
1529   }
1530 
1531   BlockBasedTableOptions::IndexType index_type = rep_->index_type;
1532 
1533   const bool use_cache = table_options.cache_index_and_filter_blocks;
1534 
1535   // pin both index and filters, down to all partitions
1536   const bool pin_all =
1537       rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
1538 
1539   // prefetch the first level of index
1540   const bool prefetch_index =
1541       prefetch_all ||
1542       (table_options.pin_top_level_index_and_filter &&
1543        index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
1544   // pin the first level of index
1545   const bool pin_index =
1546       pin_all || (table_options.pin_top_level_index_and_filter &&
1547                   index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
1548 
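  // Example (illustrative): with pin_top_level_index_and_filter=true and a
  // two-level (partitioned) index, the top-level index block is prefetched and
  // pinned even when prefetch_all/pin_all are false; its partitions are still
  // read on demand unless prefetch_all is set.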
1549   std::unique_ptr<IndexReader> index_reader;
1550   s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
1551                                    prefetch_index, pin_index, lookup_context,
1552                                    &index_reader);
1553   if (!s.ok()) {
1554     return s;
1555   }
1556 
1557   rep_->index_reader = std::move(index_reader);
1558 
1559   // The partitions of a partitioned index are always stored in the block
1560   // cache. They hence follow the pin and prefetch configuration regardless
1561   // of the value of cache_index_and_filter_blocks.
1562   if (prefetch_all) {
1563     rep_->index_reader->CacheDependencies(pin_all);
1564   }
1565 
1566   // prefetch the first level of filter
1567   const bool prefetch_filter =
1568       prefetch_all ||
1569       (table_options.pin_top_level_index_and_filter &&
1570        rep_->filter_type == Rep::FilterType::kPartitionedFilter);
1571   // Partitioned filters cannot be enabled without a partitioned index.
1572   assert(!prefetch_filter || prefetch_index);
1573   // pin the first level of filter
1574   const bool pin_filter =
1575       pin_all || (table_options.pin_top_level_index_and_filter &&
1576                   rep_->filter_type == Rep::FilterType::kPartitionedFilter);
1577 
1578   if (rep_->filter_policy) {
1579     auto filter = new_table->CreateFilterBlockReader(
1580         prefetch_buffer, use_cache, prefetch_filter, pin_filter,
1581         lookup_context);
1582     if (filter) {
1583       // Refer to the comment above about partitioned indexes always being cached.
1584       if (prefetch_all) {
1585         filter->CacheDependencies(pin_all);
1586       }
1587 
1588       rep_->filter = std::move(filter);
1589     }
1590   }
1591 
1592   if (!rep_->compression_dict_handle.IsNull()) {
1593     std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
1594     s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
1595                                         prefetch_all, pin_all, lookup_context,
1596                                         &uncompression_dict_reader);
1597     if (!s.ok()) {
1598       return s;
1599     }
1600 
1601     rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
1602   }
1603 
1604   assert(s.ok());
1605   return s;
1606 }
1607 
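// Forwards the configured compaction access hint to the underlying file. On
// POSIX file systems this typically translates into an fadvise-style readahead
// hint (an assumption about the FileSystem implementation; the code here only
// selects the FSRandomAccessFile::Hint value).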
1608 void BlockBasedTable::SetupForCompaction() {
1609   switch (rep_->ioptions.access_hint_on_compaction_start) {
1610     case Options::NONE:
1611       break;
1612     case Options::NORMAL:
1613       rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
1614       break;
1615     case Options::SEQUENTIAL:
1616       rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
1617       break;
1618     case Options::WILLNEED:
1619       rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
1620       break;
1621     default:
1622       assert(false);
1623   }
1624 }
1625 
1626 std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
1627     const {
1628   return rep_->table_properties;
1629 }
1630 
1631 size_t BlockBasedTable::ApproximateMemoryUsage() const {
1632   size_t usage = 0;
1633   if (rep_->filter) {
1634     usage += rep_->filter->ApproximateMemoryUsage();
1635   }
1636   if (rep_->index_reader) {
1637     usage += rep_->index_reader->ApproximateMemoryUsage();
1638   }
1639   if (rep_->uncompression_dict_reader) {
1640     usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
1641   }
1642   return usage;
1643 }
1644 
1645 // Load the metaindex block from the file.
1646 // On success, return the loaded metaindex block
1647 // and its iterator.
1648 Status BlockBasedTable::ReadMetaIndexBlock(
1649     FilePrefetchBuffer* prefetch_buffer,
1650     std::unique_ptr<Block>* metaindex_block,
1651     std::unique_ptr<InternalIterator>* iter) {
1652   // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
1653   // it is an empty block.
1654   std::unique_ptr<Block> metaindex;
1655   Status s = ReadBlockFromFile(
1656       rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
1657       rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
1658       true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
1659       UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
1660       kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
1661       GetMemoryAllocator(rep_->table_options), false /* for_compaction */,
1662       rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */);
1663 
1664   if (!s.ok()) {
1665     ROCKS_LOG_ERROR(rep_->ioptions.info_log,
1666                     "Encountered error while reading data from metaindex"
1667                     " block %s",
1668                     s.ToString().c_str());
1669     return s;
1670   }
1671 
1672   *metaindex_block = std::move(metaindex);
1673   // meta block uses bytewise comparator.
1674   iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(),
1675                                                       BytewiseComparator()));
1676   return Status::OK();
1677 }
1678 
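// Lookup order for GetDataBlockFromCache(): the uncompressed block cache is
// consulted first; on a miss, the compressed block cache is probed, and a hit
// there is uncompressed and (if read_options.fill_cache allows) promoted into
// the uncompressed cache before being returned.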
1679 template <typename TBlocklike>
1680 Status BlockBasedTable::GetDataBlockFromCache(
1681     const Slice& block_cache_key, const Slice& compressed_block_cache_key,
1682     Cache* block_cache, Cache* block_cache_compressed,
1683     const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
1684     const UncompressionDict& uncompression_dict, BlockType block_type,
1685     GetContext* get_context) const {
1686   const size_t read_amp_bytes_per_bit =
1687       block_type == BlockType::kData
1688           ? rep_->table_options.read_amp_bytes_per_bit
1689           : 0;
1690   assert(block);
1691   assert(block->IsEmpty());
1692 
1693   Status s;
1694   BlockContents* compressed_block = nullptr;
1695   Cache::Handle* block_cache_compressed_handle = nullptr;
1696 
1697   // Lookup uncompressed cache first
1698   if (block_cache != nullptr) {
1699     auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
1700                                           block_type, get_context);
1701     if (cache_handle != nullptr) {
1702       block->SetCachedValue(
1703           reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
1704           block_cache, cache_handle);
1705       return s;
1706     }
1707   }
1708 
1709   // If not found, search from the compressed block cache.
1710   assert(block->IsEmpty());
1711 
1712   if (block_cache_compressed == nullptr) {
1713     return s;
1714   }
1715 
1716   assert(!compressed_block_cache_key.empty());
1717   block_cache_compressed_handle =
1718       block_cache_compressed->Lookup(compressed_block_cache_key);
1719 
1720   Statistics* statistics = rep_->ioptions.statistics;
1721 
1722   // If we find the block in the compressed cache, uncompress it and insert
1723   // the result into the uncompressed cache.
1724   if (block_cache_compressed_handle == nullptr) {
1725     RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
1726     return s;
1727   }
1728 
1729   // found compressed block
1730   RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
1731   compressed_block = reinterpret_cast<BlockContents*>(
1732       block_cache_compressed->Value(block_cache_compressed_handle));
1733   CompressionType compression_type = compressed_block->get_compression_type();
1734   assert(compression_type != kNoCompression);
1735 
1736   // Retrieve the uncompressed contents into a new buffer
1737   BlockContents contents;
1738   UncompressionContext context(compression_type);
1739   UncompressionInfo info(context, uncompression_dict, compression_type);
1740   s = UncompressBlockContents(
1741       info, compressed_block->data.data(), compressed_block->data.size(),
1742       &contents, rep_->table_options.format_version, rep_->ioptions,
1743       GetMemoryAllocator(rep_->table_options));
1744 
1745   // Insert uncompressed block into block cache
1746   if (s.ok()) {
1747     std::unique_ptr<TBlocklike> block_holder(
1748         BlocklikeTraits<TBlocklike>::Create(
1749             std::move(contents), rep_->get_global_seqno(block_type),
1750             read_amp_bytes_per_bit, statistics,
1751             rep_->blocks_definitely_zstd_compressed,
1752             rep_->table_options.filter_policy.get()));  // uncompressed block
1753 
1754     if (block_cache != nullptr && block_holder->own_bytes() &&
1755         read_options.fill_cache) {
1756       size_t charge = block_holder->ApproximateMemoryUsage();
1757       Cache::Handle* cache_handle = nullptr;
1758       s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1759                               &DeleteCachedEntry<TBlocklike>, &cache_handle);
1760       if (s.ok()) {
1761         assert(cache_handle != nullptr);
1762         block->SetCachedValue(block_holder.release(), block_cache,
1763                               cache_handle);
1764 
1765         UpdateCacheInsertionMetrics(block_type, get_context, charge);
1766       } else {
1767         RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1768       }
1769     } else {
1770       block->SetOwnedValue(block_holder.release());
1771     }
1772   }
1773 
1774   // Release hold on compressed cache entry
1775   block_cache_compressed->Release(block_cache_compressed_handle);
1776   return s;
1777 }
1778 
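// PutDataBlockToCache() uncompresses the raw block if needed, optionally
// stores the still-compressed contents in the compressed block cache, and
// inserts the uncompressed block into the regular block cache, using HIGH
// priority for index/filter/dictionary blocks when
// cache_index_and_filter_blocks_with_high_priority is set.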
1779 template <typename TBlocklike>
1780 Status BlockBasedTable::PutDataBlockToCache(
1781     const Slice& block_cache_key, const Slice& compressed_block_cache_key,
1782     Cache* block_cache, Cache* block_cache_compressed,
1783     CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents,
1784     CompressionType raw_block_comp_type,
1785     const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
1786     MemoryAllocator* memory_allocator, BlockType block_type,
1787     GetContext* get_context) const {
1788   const ImmutableCFOptions& ioptions = rep_->ioptions;
1789   const uint32_t format_version = rep_->table_options.format_version;
1790   const size_t read_amp_bytes_per_bit =
1791       block_type == BlockType::kData
1792           ? rep_->table_options.read_amp_bytes_per_bit
1793           : 0;
1794   const Cache::Priority priority =
1795       rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
1796               (block_type == BlockType::kFilter ||
1797                block_type == BlockType::kCompressionDictionary ||
1798                block_type == BlockType::kIndex)
1799           ? Cache::Priority::HIGH
1800           : Cache::Priority::LOW;
1801   assert(cached_block);
1802   assert(cached_block->IsEmpty());
1803 
1804   Status s;
1805   Statistics* statistics = ioptions.statistics;
1806 
1807   std::unique_ptr<TBlocklike> block_holder;
1808   if (raw_block_comp_type != kNoCompression) {
1809     // Retrieve the uncompressed contents into a new buffer
1810     BlockContents uncompressed_block_contents;
1811     UncompressionContext context(raw_block_comp_type);
1812     UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
1813     s = UncompressBlockContents(info, raw_block_contents->data.data(),
1814                                 raw_block_contents->data.size(),
1815                                 &uncompressed_block_contents, format_version,
1816                                 ioptions, memory_allocator);
1817     if (!s.ok()) {
1818       return s;
1819     }
1820 
1821     block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1822         std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit,
1823         statistics, rep_->blocks_definitely_zstd_compressed,
1824         rep_->table_options.filter_policy.get()));
1825   } else {
1826     block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1827         std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit,
1828         statistics, rep_->blocks_definitely_zstd_compressed,
1829         rep_->table_options.filter_policy.get()));
1830   }
1831 
1832   // Insert compressed block into compressed block cache.
1833   // Release the hold on the compressed cache entry immediately.
1834   if (block_cache_compressed != nullptr &&
1835       raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
1836       raw_block_contents->own_bytes()) {
1837 #ifndef NDEBUG
1838     assert(raw_block_contents->is_raw_block);
1839 #endif  // NDEBUG
1840 
1841     // We cannot directly insert raw_block_contents because it could point to
1842     // an object on the stack.
1843     BlockContents* block_cont_for_comp_cache =
1844         new BlockContents(std::move(*raw_block_contents));
1845     s = block_cache_compressed->Insert(
1846         compressed_block_cache_key, block_cont_for_comp_cache,
1847         block_cont_for_comp_cache->ApproximateMemoryUsage(),
1848         &DeleteCachedEntry<BlockContents>);
1849     if (s.ok()) {
1850       // On success the cache owns block_cont_for_comp_cache; do not delete it here.
1851       RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
1852     } else {
1853       RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
1854       delete block_cont_for_comp_cache;
1855     }
1856   }
1857 
1858   // insert into uncompressed block cache
1859   if (block_cache != nullptr && block_holder->own_bytes()) {
1860     size_t charge = block_holder->ApproximateMemoryUsage();
1861     Cache::Handle* cache_handle = nullptr;
1862     s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1863                             &DeleteCachedEntry<TBlocklike>, &cache_handle,
1864                             priority);
1865     if (s.ok()) {
1866       assert(cache_handle != nullptr);
1867       cached_block->SetCachedValue(block_holder.release(), block_cache,
1868                                    cache_handle);
1869 
1870       UpdateCacheInsertionMetrics(block_type, get_context, charge);
1871     } else {
1872       RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1873     }
1874   } else {
1875     cached_block->SetOwnedValue(block_holder.release());
1876   }
1877 
1878   return s;
1879 }
1880 
1881 std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
1882     FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
1883     bool pin, BlockCacheLookupContext* lookup_context) {
1884   auto& rep = rep_;
1885   auto filter_type = rep->filter_type;
1886   if (filter_type == Rep::FilterType::kNoFilter) {
1887     return std::unique_ptr<FilterBlockReader>();
1888   }
1889 
1890   assert(rep->filter_policy);
1891 
1892   switch (filter_type) {
1893     case Rep::FilterType::kPartitionedFilter:
1894       return PartitionedFilterBlockReader::Create(
1895           this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1896 
1897     case Rep::FilterType::kBlockFilter:
1898       return BlockBasedFilterBlockReader::Create(
1899           this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1900 
1901     case Rep::FilterType::kFullFilter:
1902       return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
1903                                            prefetch, pin, lookup_context);
1904 
1905     default:
1906       // filter_type is either kNoFilter (exited the function at the first if),
1907       // or it must be covered in this switch block
1908       assert(false);
1909       return std::unique_ptr<FilterBlockReader>();
1910   }
1911 }
1912 
1913 // disable_prefix_seek should be set to true when the prefix_extractor found in
1914 // the SST file differs from the one in mutable_cf_options and the index type is HashBasedIndex.
1915 InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
1916     const ReadOptions& read_options, bool disable_prefix_seek,
1917     IndexBlockIter* input_iter, GetContext* get_context,
1918     BlockCacheLookupContext* lookup_context) const {
1919   assert(rep_ != nullptr);
1920   assert(rep_->index_reader != nullptr);
1921 
1922   // We don't return pinned data from index blocks, so no need
1923   // to set `block_contents_pinned`.
1924   return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
1925                                          input_iter, get_context,
1926                                          lookup_context);
1927 }
1928 
1929 // Convert an index iterator value (i.e., an encoded BlockHandle)
1930 // into an iterator over the contents of the corresponding block.
1931 // If input_iter is null, a new iterator is allocated and returned;
1932 // if input_iter is not null, it is updated in place and returned.
1933 template <typename TBlockIter>
1934 TBlockIter* BlockBasedTable::NewDataBlockIterator(
1935     const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
1936     BlockType block_type, GetContext* get_context,
1937     BlockCacheLookupContext* lookup_context, Status s,
1938     FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
1939   PERF_TIMER_GUARD(new_table_block_iter_nanos);
1940 
1941   TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
1942   if (!s.ok()) {
1943     iter->Invalidate(s);
1944     return iter;
1945   }
1946 
1947   CachableEntry<UncompressionDict> uncompression_dict;
1948   if (rep_->uncompression_dict_reader) {
1949     const bool no_io = (ro.read_tier == kBlockCacheTier);
1950     s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
1951         prefetch_buffer, no_io, get_context, lookup_context,
1952         &uncompression_dict);
1953     if (!s.ok()) {
1954       iter->Invalidate(s);
1955       return iter;
1956     }
1957   }
1958 
1959   const UncompressionDict& dict = uncompression_dict.GetValue()
1960                                       ? *uncompression_dict.GetValue()
1961                                       : UncompressionDict::GetEmptyDict();
1962 
1963   CachableEntry<Block> block;
1964   s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type,
1965                     get_context, lookup_context, for_compaction,
1966                     /* use_cache */ true);
1967 
1968   if (!s.ok()) {
1969     assert(block.IsEmpty());
1970     iter->Invalidate(s);
1971     return iter;
1972   }
1973 
1974   assert(block.GetValue() != nullptr);
1975 
1976   // Block contents are pinned, and they remain pinned even after the iterator
1977   // is destroyed (as long as the cleanup functions are moved to another
1978   // object), when:
1979   // 1. block cache handle is set to be released in cleanup function, or
1980   // 2. it's pointing to immortal source. If own_bytes is true then we are
1981   //    not reading data from the original source, whether immortal or not.
1982   //    Otherwise, the block is pinned iff the source is immortal.
1983   const bool block_contents_pinned =
1984       block.IsCached() ||
1985       (!block.GetValue()->own_bytes() && rep_->immortal_table);
1986   iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
1987                                        block_contents_pinned);
1988 
1989   if (!block.IsCached()) {
1990     if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
1991       // insert a dummy record to block cache to track the memory usage
1992       Cache* const block_cache = rep_->table_options.block_cache.get();
1993       Cache::Handle* cache_handle = nullptr;
1994       // There are two other types of cache keys: 1) the SST cache key added in
1995       // `MaybeReadBlockAndLoadToCache` and 2) the dummy cache key added in
1996       // `write_buffer_manager`. Use a longer prefix (41 bytes) to differentiate
1997       // from the SST cache key (31 bytes), and use a non-zero prefix to
1998       // differentiate from `write_buffer_manager`.
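      // The dummy key is rep_->cache_key_prefix zero-padded to 41 bytes,
      // followed by a varint-encoded, process-wide unique counter value, so it
      // cannot collide with a real block cache key.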
1999       const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
2000       char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
2001       // Prefix: use rep_->cache_key_prefix padded by 0s
2002       memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
2003       assert(rep_->cache_key_prefix_size != 0);
2004       assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
2005       memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
2006       char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
2007                                  next_cache_key_id_++);
2008       assert(end - cache_key <=
2009              static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
2010       const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
2011       s = block_cache->Insert(unique_key, nullptr,
2012                               block.GetValue()->ApproximateMemoryUsage(),
2013                               nullptr, &cache_handle);
2014 
2015       if (s.ok()) {
2016         assert(cache_handle != nullptr);
2017         iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
2018                               cache_handle);
2019       }
2020     }
2021   } else {
2022     iter->SetCacheHandle(block.GetCacheHandle());
2023   }
2024 
2025   block.TransferTo(iter);
2026 
2027   return iter;
2028 }
2029 
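// Specializations that wire a raw Block into the right iterator type: data
// blocks get a DataBlockIter keyed by the user comparator, while index blocks
// get an IndexBlockIter configured with the index format flags
// (index_has_first_key, index_key_includes_seq, index_value_is_full) read from
// the table properties.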
2030 template <>
2031 DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
2032     const Rep* rep, Block* block, DataBlockIter* input_iter,
2033     bool block_contents_pinned) {
2034   return block->NewDataIterator(
2035       &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2036       input_iter, rep->ioptions.statistics, block_contents_pinned);
2037 }
2038 
2039 template <>
2040 IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
2041     const Rep* rep, Block* block, IndexBlockIter* input_iter,
2042     bool block_contents_pinned) {
2043   return block->NewIndexIterator(
2044       &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2045       input_iter, rep->ioptions.statistics, /* total_order_seek */ true,
2046       rep->index_has_first_key, rep->index_key_includes_seq,
2047       rep->index_value_is_full, block_contents_pinned);
2048 }
2049 
2050 // Convert an uncompressed data block (i.e., a CachableEntry<Block>)
2051 // into an iterator over the contents of the corresponding block.
2052 // If input_iter is null, a new iterator is allocated and returned;
2053 // if input_iter is not null, it is updated in place and returned.
2054 template <typename TBlockIter>
2055 TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
2056                                                   CachableEntry<Block>& block,
2057                                                   TBlockIter* input_iter,
2058                                                   Status s) const {
2059   PERF_TIMER_GUARD(new_table_block_iter_nanos);
2060 
2061   TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
2062   if (!s.ok()) {
2063     iter->Invalidate(s);
2064     return iter;
2065   }
2066 
2067   assert(block.GetValue() != nullptr);
2068   // Block contents are pinned, and they remain pinned even after the iterator
2069   // is destroyed (as long as the cleanup functions are moved to another
2070   // object), when:
2071   // 1. block cache handle is set to be released in cleanup function, or
2072   // 2. it's pointing to immortal source. If own_bytes is true then we are
2073   //    not reading data from the original source, whether immortal or not.
2074   //    Otherwise, the block is pinned iff the source is immortal.
2075   const bool block_contents_pinned =
2076       block.IsCached() ||
2077       (!block.GetValue()->own_bytes() && rep_->immortal_table);
2078   iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
2079                                        block_contents_pinned);
2080 
2081   if (!block.IsCached()) {
2082     if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
2083       // insert a dummy record to block cache to track the memory usage
2084       Cache* const block_cache = rep_->table_options.block_cache.get();
2085       Cache::Handle* cache_handle = nullptr;
2086       // There are two other types of cache keys: 1) the SST cache key added in
2087       // `MaybeReadBlockAndLoadToCache` and 2) the dummy cache key added in
2088       // `write_buffer_manager`. Use a longer prefix (41 bytes) to differentiate
2089       // from the SST cache key (31 bytes), and use a non-zero prefix to
2090       // differentiate from `write_buffer_manager`.
2091       const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
2092       char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
2093       // Prefix: use rep_->cache_key_prefix padded by 0s
2094       memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
2095       assert(rep_->cache_key_prefix_size != 0);
2096       assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
2097       memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
2098       char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
2099                                  next_cache_key_id_++);
2100       assert(end - cache_key <=
2101              static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
2102       const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
2103       s = block_cache->Insert(unique_key, nullptr,
2104                               block.GetValue()->ApproximateMemoryUsage(),
2105                               nullptr, &cache_handle);
2106       if (s.ok()) {
2107         assert(cache_handle != nullptr);
2108         iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
2109                               cache_handle);
2110       }
2111     }
2112   } else {
2113     iter->SetCacheHandle(block.GetCacheHandle());
2114   }
2115 
2116   block.TransferTo(iter);
2117   return iter;
2118 }
2119 
2120 // If contents is nullptr, this function looks up the block caches for the
2121 // data block referenced by handle, and reads the block from disk if necessary.
2122 // If contents is non-null, it skips the cache lookup and disk read, since
2123 // the caller has already read it. In both cases, if ro.fill_cache is true,
2124 // it inserts the block into the block cache.
2125 template <typename TBlocklike>
2126 Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
2127     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2128     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2129     CachableEntry<TBlocklike>* block_entry, BlockType block_type,
2130     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2131     BlockContents* contents) const {
2132   assert(block_entry != nullptr);
2133   const bool no_io = (ro.read_tier == kBlockCacheTier);
2134   Cache* block_cache = rep_->table_options.block_cache.get();
2135   // No point in caching compressed blocks if the table never goes away.
2136   Cache* block_cache_compressed =
2137       rep_->immortal_table ? nullptr
2138                            : rep_->table_options.block_cache_compressed.get();
2139 
2140   // First, try to get the block from the cache
2141   //
2142   // If either block cache is enabled, we'll try to read from it.
2143   Status s;
2144   char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2145   char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2146   Slice key /* key to the block cache */;
2147   Slice ckey /* key to the compressed block cache */;
2148   bool is_cache_hit = false;
2149   if (block_cache != nullptr || block_cache_compressed != nullptr) {
2150     // create key for block cache
2151     if (block_cache != nullptr) {
2152       key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
2153                         handle, cache_key);
2154     }
2155 
2156     if (block_cache_compressed != nullptr) {
2157       ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
2158                          rep_->compressed_cache_key_prefix_size, handle,
2159                          compressed_cache_key);
2160     }
2161 
2162     if (!contents) {
2163       s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
2164                                 ro, block_entry, uncompression_dict, block_type,
2165                                 get_context);
2166       if (block_entry->GetValue()) {
2167         // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
2168         // compressed block cache.
2169         is_cache_hit = true;
2170       }
2171     }
2172 
2173     // Couldn't find the block in the cache. If I/O is allowed, read it from
2174     // the file.
2175     if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
2176       Statistics* statistics = rep_->ioptions.statistics;
2177       const bool maybe_compressed =
2178           block_type != BlockType::kFilter &&
2179           block_type != BlockType::kCompressionDictionary &&
2180           rep_->blocks_maybe_compressed;
2181       const bool do_uncompress = maybe_compressed && !block_cache_compressed;
2182       CompressionType raw_block_comp_type;
2183       BlockContents raw_block_contents;
2184       if (!contents) {
2185         StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
2186         BlockFetcher block_fetcher(
2187             rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
2188             &raw_block_contents, rep_->ioptions, do_uncompress,
2189             maybe_compressed, block_type, uncompression_dict,
2190             rep_->persistent_cache_options,
2191             GetMemoryAllocator(rep_->table_options),
2192             GetMemoryAllocatorForCompressedBlock(rep_->table_options));
2193         s = block_fetcher.ReadBlockContents();
2194         raw_block_comp_type = block_fetcher.get_compression_type();
2195         contents = &raw_block_contents;
2196       } else {
2197         raw_block_comp_type = contents->get_compression_type();
2198       }
2199 
2200       if (s.ok()) {
2201         SequenceNumber seq_no = rep_->get_global_seqno(block_type);
2202         // If filling cache is allowed and a cache is configured, try to put the
2203         // block to the cache.
2204         s = PutDataBlockToCache(
2205             key, ckey, block_cache, block_cache_compressed, block_entry,
2206             contents, raw_block_comp_type, uncompression_dict, seq_no,
2207             GetMemoryAllocator(rep_->table_options), block_type, get_context);
2208       }
2209     }
2210   }
2211 
2212   // Fill lookup_context.
2213   if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
2214       lookup_context) {
2215     size_t usage = 0;
2216     uint64_t nkeys = 0;
2217     if (block_entry->GetValue()) {
2218       // Approximate the number of keys in the block using restarts.
2219       nkeys =
2220           rep_->table_options.block_restart_interval *
2221           BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue());
2222       usage = block_entry->GetValue()->ApproximateMemoryUsage();
2223     }
2224     TraceType trace_block_type = TraceType::kTraceMax;
2225     switch (block_type) {
2226       case BlockType::kData:
2227         trace_block_type = TraceType::kBlockTraceDataBlock;
2228         break;
2229       case BlockType::kFilter:
2230         trace_block_type = TraceType::kBlockTraceFilterBlock;
2231         break;
2232       case BlockType::kCompressionDictionary:
2233         trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
2234         break;
2235       case BlockType::kRangeDeletion:
2236         trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
2237         break;
2238       case BlockType::kIndex:
2239         trace_block_type = TraceType::kBlockTraceIndexBlock;
2240         break;
2241       default:
2242         // This cannot happen.
2243         assert(false);
2244         break;
2245     }
2246     bool no_insert = no_io || !ro.fill_cache;
2247     if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
2248             trace_block_type, lookup_context->caller)) {
2249       // Defer logging the access to Get() and MultiGet() to trace additional
2250       // information, e.g., referenced_key_exist_in_block.
2251 
2252       // Make a copy of the block key here since it will be logged later.
2253       lookup_context->FillLookupContext(
2254           is_cache_hit, no_insert, trace_block_type,
2255           /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
2256     } else {
2257       // Avoid making a copy of block_key and cf_name when constructing the access
2258       // record.
2259       BlockCacheTraceRecord access_record(
2260           rep_->ioptions.env->NowMicros(),
2261           /*block_key=*/"", trace_block_type,
2262           /*block_size=*/usage, rep_->cf_id_for_tracing(),
2263           /*cf_name=*/"", rep_->level_for_tracing(),
2264           rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
2265           no_insert, lookup_context->get_id,
2266           lookup_context->get_from_user_specified_snapshot,
2267           /*referenced_key=*/"");
2268       block_cache_tracer_->WriteBlockAccess(access_record, key,
2269                                             rep_->cf_name_for_tracing(),
2270                                             lookup_context->referenced_key);
2271     }
2272   }
2273 
2274   assert(s.ok() || block_entry->GetValue() == nullptr);
2275   return s;
2276 }
2277 
2278 // This function reads multiple data blocks from disk using Env::MultiRead()
2279 // and optionally inserts them into the block cache. It uses the scratch
2280 // buffer provided by the caller, which is contiguous. If scratch is nullptr,
2281 // it allocates a separate buffer for each block. Typically, if the blocks
2282 // need to be uncompressed and there is no compressed block cache, callers
2283 // can allocate a temporary scratch buffer in order to minimize memory
2284 // allocations.
2285 // If options.fill_cache is true, it inserts the blocks into cache. If it is
2286 // false and scratch is non-null and the blocks are uncompressed, it copies
2287 // the buffers to the heap. In any case, the CachableEntry<Block> returned will
2288 // own the data bytes.
2289 // If compression is enabled and there is no compressed block cache,
2290 // adjacent blocks are read out in one IO (combined read).
2291 // batch - A MultiGetRange with only those keys with unique data blocks not
2292 //         found in cache
2293 // handles - A vector of block handles. Some of them may be null handles
2294 // scratch - An optional contiguous buffer to read compressed blocks into
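// Illustrative example: three adjacent data blocks requested together, with
// compression enabled and no compressed block cache (scratch != nullptr), are
// coalesced into a single FSReadRequest covering all three blocks plus their
// trailers; a non-adjacent block starts a new request.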
2295 void BlockBasedTable::RetrieveMultipleBlocks(
2296     const ReadOptions& options, const MultiGetRange* batch,
2297     const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
2298     autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
2299     autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results,
2300     char* scratch, const UncompressionDict& uncompression_dict) const {
2301   RandomAccessFileReader* file = rep_->file.get();
2302   const Footer& footer = rep_->footer;
2303   const ImmutableCFOptions& ioptions = rep_->ioptions;
2304   SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData);
2305   size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
2306   MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
2307 
2308   if (file->use_direct_io() || ioptions.allow_mmap_reads) {
2309     size_t idx_in_batch = 0;
2310     for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2311          ++mget_iter, ++idx_in_batch) {
2312       BlockCacheLookupContext lookup_data_block_context(
2313           TableReaderCaller::kUserMultiGet);
2314       const BlockHandle& handle = (*handles)[idx_in_batch];
2315       if (handle.IsNull()) {
2316         continue;
2317       }
2318 
2319       (*statuses)[idx_in_batch] =
2320           RetrieveBlock(nullptr, options, handle, uncompression_dict,
2321                         &(*results)[idx_in_batch], BlockType::kData,
2322                         mget_iter->get_context, &lookup_data_block_context,
2323                         /* for_compaction */ false, /* use_cache */ true);
2324     }
2325     return;
2326   }
2327 
2328   autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
2329   size_t buf_offset = 0;
2330   size_t idx_in_batch = 0;
2331 
2332   uint64_t prev_offset = 0;
2333   size_t prev_len = 0;
2334   autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
2335   autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
2336   for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2337        ++mget_iter, ++idx_in_batch) {
2338     const BlockHandle& handle = (*handles)[idx_in_batch];
2339     if (handle.IsNull()) {
2340       continue;
2341     }
2342 
2343     size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
2344 
2345     // If the current block is adjacent to the previous one and, at the same
2346     // time, compression is enabled and there is no compressed cache, we
2347     // combine the two block reads into one.
2348     if (scratch != nullptr && prev_end == handle.offset()) {
2349       req_offset_for_block.emplace_back(prev_len);
2350       prev_len += block_size(handle);
2351     } else {
2352       // No combined read, or the current block is not adjacent to the
2353       // previous one. Step 1: create a new request for the previous blocks.
2354       if (prev_len != 0) {
2355         FSReadRequest req;
2356         req.offset = prev_offset;
2357         req.len = prev_len;
2358         if (scratch == nullptr) {
2359           req.scratch = new char[req.len];
2360         } else {
2361           req.scratch = scratch + buf_offset;
2362           buf_offset += req.len;
2363         }
2364         req.status = IOStatus::OK();
2365         read_reqs.emplace_back(req);
2366       }
2367 
2368       // Step 2: remember the previous block's info.
2369       prev_offset = handle.offset();
2370       prev_len = block_size(handle);
2371       req_offset_for_block.emplace_back(0);
2372     }
2373     req_idx_for_block.emplace_back(read_reqs.size());
2374   }
2375   // Handle the last block and process the pending last request
2376   if (prev_len != 0) {
2377     FSReadRequest req;
2378     req.offset = prev_offset;
2379     req.len = prev_len;
2380     if (scratch == nullptr) {
2381       req.scratch = new char[req.len];
2382     } else {
2383       req.scratch = scratch + buf_offset;
2384     }
2385     req.status = IOStatus::OK();
2386     read_reqs.emplace_back(req);
2387   }
2388 
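  // Issue all coalesced requests in a single MultiRead() call; individual
  // request statuses are checked below.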
2389   file->MultiRead(&read_reqs[0], read_reqs.size());
2390 
2391   idx_in_batch = 0;
2392   size_t valid_batch_idx = 0;
2393   for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2394        ++mget_iter, ++idx_in_batch) {
2395     const BlockHandle& handle = (*handles)[idx_in_batch];
2396 
2397     if (handle.IsNull()) {
2398       continue;
2399     }
2400 
2401     assert(valid_batch_idx < req_idx_for_block.size());
2402     assert(valid_batch_idx < req_offset_for_block.size());
2403     assert(req_idx_for_block[valid_batch_idx] < read_reqs.size());
2404     size_t& req_idx = req_idx_for_block[valid_batch_idx];
2405     size_t& req_offset = req_offset_for_block[valid_batch_idx];
2406     valid_batch_idx++;
2407     FSReadRequest& req = read_reqs[req_idx];
2408     Status s = req.status;
2409     if (s.ok()) {
2410       if (req.result.size() != req.len) {
2411         s = Status::Corruption(
2412             "truncated block read from " + rep_->file->file_name() +
2413             " offset " + ToString(handle.offset()) + ", expected " +
2414             ToString(req.len) + " bytes, got " + ToString(req.result.size()));
2415       }
2416     }
2417 
2418     BlockContents raw_block_contents;
2419     size_t cur_read_end = req_offset + block_size(handle);
2420     if (cur_read_end > req.result.size()) {
2421       s = Status::Corruption(
2422           "truncated block read from " + rep_->file->file_name() + " offset " +
2423           ToString(handle.offset()) + ", expected " + ToString(req.len) +
2424           " bytes, got " + ToString(req.result.size()));
2425     }
2426 
2427     bool blocks_share_read_buffer = (req.result.size() != block_size(handle));
2428     if (s.ok()) {
2429       if (scratch == nullptr && !blocks_share_read_buffer) {
2430         // We allocated a buffer for this block. Give ownership of it to
2431         // BlockContents so it can free the memory
2432         assert(req.result.data() == req.scratch);
2433         std::unique_ptr<char[]> raw_block(req.scratch + req_offset);
2434         raw_block_contents = BlockContents(std::move(raw_block), handle.size());
2435       } else {
2436         // We used the scratch buffer, which is shared by the blocks;
2437         // raw_block_contents does not take ownership of it.
2438         raw_block_contents =
2439             BlockContents(Slice(req.scratch + req_offset, handle.size()));
2440       }
2441 
2442 #ifndef NDEBUG
2443       raw_block_contents.is_raw_block = true;
2444 #endif
2445       if (options.verify_checksums) {
2446         PERF_TIMER_GUARD(block_checksum_time);
2447         const char* data = req.result.data();
2448         uint32_t expected =
2449             DecodeFixed32(data + req_offset + handle.size() + 1);
2450         // Since the scratch buffer might be shared, the offset of the data
2451         // block in the buffer might not be 0. req.result.data() only points
2452         // to the beginning of each read request, so we add the block's
2453         // offset within that request. The stored checksum sits in the block
2454         // trailer at offset handle.size() + 1.
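        // On-disk layout, for reference: [block data: handle.size() bytes]
        // [1-byte compression type][4-byte checksum]; so the stored checksum
        // decoded above sits at offset handle.size() + 1, and the verification
        // below covers the block data plus the type byte.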
2455         s = rocksdb::VerifyChecksum(footer.checksum(),
2456                                     req.result.data() + req_offset,
2457                                     handle.size() + 1, expected);
2458         TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
2459       }
2460     }
2461 
2462     if (s.ok()) {
2463       // This handles a rare case: compression is enabled and there is no
2464       // compressed block cache (so combined reads are used and scratch !=
2465       // nullptr). At the same time, some blocks are actually not compressed,
2466       // because their compression space saving was smaller than the
2467       // threshold. If such a block shares the scratch memory, we need to copy
2468       // it to the heap so that it can be added to the regular block cache.
2469       CompressionType compression_type =
2470           raw_block_contents.get_compression_type();
2471       if (scratch != nullptr && compression_type == kNoCompression) {
2472         Slice raw = Slice(req.scratch + req_offset, block_size(handle));
2473         raw_block_contents = BlockContents(
2474             CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw),
2475             handle.size());
2476 #ifndef NDEBUG
2477         raw_block_contents.is_raw_block = true;
2478 #endif
2479       }
2480     }
2481 
2482     if (s.ok()) {
2483       if (options.fill_cache) {
2484         BlockCacheLookupContext lookup_data_block_context(
2485             TableReaderCaller::kUserMultiGet);
2486         CachableEntry<Block>* block_entry = &(*results)[idx_in_batch];
2487         // MaybeReadBlockAndLoadToCache will insert into the block caches if
2488         // necessary. Since we're passing the raw block contents, it will
2489         // avoid looking up the block cache
2490         s = MaybeReadBlockAndLoadToCache(
2491             nullptr, options, handle, uncompression_dict, block_entry,
2492             BlockType::kData, mget_iter->get_context,
2493             &lookup_data_block_context, &raw_block_contents);
2494 
2495         // block_entry value could be null if no block cache is present, i.e.
2496         // BlockBasedTableOptions::no_block_cache is true and no compressed
2497         // block cache is configured. In that case, fall through and set up
2498         // the block explicitly.
2499         if (block_entry->GetValue() != nullptr) {
2500           continue;
2501         }
2502       }
2503 
2504       CompressionType compression_type =
2505           raw_block_contents.get_compression_type();
2506       BlockContents contents;
2507       if (compression_type != kNoCompression) {
2508         UncompressionContext context(compression_type);
2509         UncompressionInfo info(context, uncompression_dict, compression_type);
2510         s = UncompressBlockContents(info, req.result.data() + req_offset,
2511                                     handle.size(), &contents, footer.version(),
2512                                     rep_->ioptions, memory_allocator);
2513       } else {
2514         // There are two cases here: 1) caller uses the scratch buffer; 2) we
2515         // use the request buffer. If the scratch buffer is used, we ensure
2516         // that all raw blocks are copied to the heap as single blocks. If the
2517         // scratch buffer is not used, we also have no combined read, so the
2518         // raw block can be used directly.
2519         contents = std::move(raw_block_contents);
2520       }
2521       if (s.ok()) {
2522         (*results)[idx_in_batch].SetOwnedValue(
2523             new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
2524                       ioptions.statistics));
2525       }
2526     }
2527     (*statuses)[idx_in_batch] = s;
2528   }
2529 }
2530 
2531 template <typename TBlocklike>
2532 Status BlockBasedTable::RetrieveBlock(
2533     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2534     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2535     CachableEntry<TBlocklike>* block_entry, BlockType block_type,
2536     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2537     bool for_compaction, bool use_cache) const {
2538   assert(block_entry);
2539   assert(block_entry->IsEmpty());
2540 
2541   Status s;
2542   if (use_cache) {
2543     s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
2544                                      uncompression_dict, block_entry,
2545                                      block_type, get_context, lookup_context,
2546                                      /*contents=*/nullptr);
2547 
2548     if (!s.ok()) {
2549       return s;
2550     }
2551 
2552     if (block_entry->GetValue() != nullptr) {
2553       assert(s.ok());
2554       return s;
2555     }
2556   }
2557 
2558   assert(block_entry->IsEmpty());
2559 
2560   const bool no_io = ro.read_tier == kBlockCacheTier;
2561   if (no_io) {
2562     return Status::Incomplete("no blocking io");
2563   }
2564 
2565   const bool maybe_compressed =
2566       block_type != BlockType::kFilter &&
2567       block_type != BlockType::kCompressionDictionary &&
2568       rep_->blocks_maybe_compressed;
2569   const bool do_uncompress = maybe_compressed;
2570   std::unique_ptr<TBlocklike> block;
2571 
2572   {
2573     StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
2574                  READ_BLOCK_GET_MICROS);
2575     s = ReadBlockFromFile(
2576         rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
2577         rep_->ioptions, do_uncompress, maybe_compressed, block_type,
2578         uncompression_dict, rep_->persistent_cache_options,
2579         rep_->get_global_seqno(block_type),
2580         block_type == BlockType::kData
2581             ? rep_->table_options.read_amp_bytes_per_bit
2582             : 0,
2583         GetMemoryAllocator(rep_->table_options), for_compaction,
2584         rep_->blocks_definitely_zstd_compressed,
2585         rep_->table_options.filter_policy.get());
2586   }
2587 
2588   if (!s.ok()) {
2589     return s;
2590   }
2591 
2592   block_entry->SetOwnedValue(block.release());
2593 
2594   assert(s.ok());
2595   return s;
2596 }
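
// Illustrative usage sketch (assumed caller context, not from the original
// source): inside the table's read path a data block can be obtained via
//   CachableEntry<Block> entry;
//   Status s = RetrieveBlock(/*prefetch_buffer=*/nullptr, read_options, handle,
//                            UncompressionDict::GetEmptyDict(), &entry,
//                            BlockType::kData, get_context, lookup_context,
//                            /*for_compaction=*/false, /*use_cache=*/true);
// where handle, get_context and lookup_context come from the surrounding code.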
2597 
2598 // Explicitly instantiate templates for each of the "blocklike" types we use.
2599 // This makes it possible to keep the template definitions in the .cc file.
2600 template Status BlockBasedTable::RetrieveBlock<BlockContents>(
2601     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2602     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2603     CachableEntry<BlockContents>* block_entry, BlockType block_type,
2604     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2605     bool for_compaction, bool use_cache) const;
2606 
2607 template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
2608     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2609     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2610     CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type,
2611     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2612     bool for_compaction, bool use_cache) const;
2613 
2614 template Status BlockBasedTable::RetrieveBlock<Block>(
2615     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2616     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2617     CachableEntry<Block>* block_entry, BlockType block_type,
2618     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2619     bool for_compaction, bool use_cache) const;
2620 
2621 template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
2622     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2623     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2624     CachableEntry<UncompressionDict>* block_entry, BlockType block_type,
2625     GetContext* get_context, BlockCacheLookupContext* lookup_context,
2626     bool for_compaction, bool use_cache) const;
2627 
2628 BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
2629     const BlockBasedTable* table,
2630     std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
2631     : table_(table), block_map_(block_map) {}
2632 
2633 InternalIteratorBase<IndexValue>*
2634 BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
2635     const BlockHandle& handle) {
2636   // Return a block iterator on the index partition
2637   auto block = block_map_->find(handle.offset());
2638   // The partition may be missing from the map if the block cache did not
2639   // have space for it; in that case we return an empty iterator below.
2640   if (block != block_map_->end()) {
2641     const Rep* rep = table_->get_rep();
2642     assert(rep);
2643 
2644     Statistics* kNullStats = nullptr;
2645     // We don't return pinned data from index blocks, so no need
2646     // to set `block_contents_pinned`.
2647     return block->second.GetValue()->NewIndexIterator(
2648         &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2649         nullptr, kNullStats, true, rep->index_has_first_key,
2650         rep->index_key_includes_seq, rep->index_value_is_full);
2651   }
2652   // Create an empty iterator
2653   return new IndexBlockIter();
2654 }
2655 
2656 // This will be broken if the user specifies an unusual implementation
2657 // of Options.comparator, or if the user specifies an unusual
2658 // definition of prefixes in BlockBasedTableOptions.filter_policy.
2659 // In particular, we require the following three properties:
2660 //
2661 // 1) key.starts_with(prefix(key))
2662 // 2) Compare(prefix(key), key) <= 0.
2663 // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
2664 //
2665 // Otherwise, this method guarantees no I/O will be incurred.
2666 //
2667 // REQUIRES: this method shouldn't be called while the DB lock is held.
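// Illustrative example of the three properties (assumed fixed-length prefix
// extractor of 3 bytes, not from the original comment): prefix("app123") ==
// "app"; then "app123".starts_with("app") holds (1), Compare("app", "app123")
// <= 0 holds (2), and since Compare("app123", "apq000") <= 0 we also have
// Compare("app", "apq") <= 0 (3).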
2668 bool BlockBasedTable::PrefixMayMatch(
2669     const Slice& internal_key, const ReadOptions& read_options,
2670     const SliceTransform* options_prefix_extractor,
2671     const bool need_upper_bound_check,
2672     BlockCacheLookupContext* lookup_context) const {
2673   if (!rep_->filter_policy) {
2674     return true;
2675   }
2676 
2677   const SliceTransform* prefix_extractor;
2678 
2679   if (rep_->table_prefix_extractor == nullptr) {
2680     if (need_upper_bound_check) {
2681       return true;
2682     }
2683     prefix_extractor = options_prefix_extractor;
2684   } else {
2685     prefix_extractor = rep_->table_prefix_extractor.get();
2686   }
2687   auto user_key = ExtractUserKey(internal_key);
2688   if (!prefix_extractor->InDomain(user_key)) {
2689     return true;
2690   }
2691 
2692   bool may_match = true;
2693   Status s;
2694 
2695   // First, try to check with the full filter
2696   FilterBlockReader* const filter = rep_->filter.get();
2697   bool filter_checked = true;
2698   if (filter != nullptr) {
2699     if (!filter->IsBlockBased()) {
2700       const Slice* const const_ikey_ptr = &internal_key;
2701       may_match = filter->RangeMayExist(
2702           read_options.iterate_upper_bound, user_key, prefix_extractor,
2703           rep_->internal_comparator.user_comparator(), const_ikey_ptr,
2704           &filter_checked, need_upper_bound_check, lookup_context);
2705     } else {
2706       // if prefix_extractor changed for block based filter, skip filter
2707       if (need_upper_bound_check) {
2708         return true;
2709       }
2710       auto prefix = prefix_extractor->Transform(user_key);
2711       InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
2712       auto internal_prefix = internal_key_prefix.Encode();
2713 
2714       // To prevent any I/O in this method, we set `read_tier` to make sure
2715       // we only read the index or filter when they have already been loaded
2716       // into memory.
2717       ReadOptions no_io_read_options;
2718       no_io_read_options.read_tier = kBlockCacheTier;
2719 
2720       // Then, try to find it within each block.
2721       // We already know prefix_extractor and prefix_extractor_name must match
2722       // because `CheckPrefixMayMatch` first checks `check_filter_ == true`.
2723       std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
2724           no_io_read_options,
2725           /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
2726           /*get_context=*/nullptr, lookup_context));
2727       iiter->Seek(internal_prefix);
2728 
2729       if (!iiter->Valid()) {
2730         // we're past end of file
2731         // if it's incomplete, it means that we avoided I/O
2732         // and we're not really sure that we're past the end
2733         // of the file
2734         may_match = iiter->status().IsIncomplete();
2735       } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key())
2736                                                : iiter->key())
2737                      .starts_with(ExtractUserKey(internal_prefix))) {
2738         // we need to check for this subtle case because our only
2739         // guarantee is that "the key is a string >= last key in that data
2740         // block" according to the doc/table_format.txt spec.
2741         //
2742         // Suppose iiter->key() starts with the desired prefix; it is not
2743         // necessarily the case that the corresponding data block will
2744         // contain the prefix, since iiter->key() need not be in the
2745         // block.  However, the next data block may contain the prefix, so
2746         // we return true to play it safe.
2747         may_match = true;
2748       } else if (filter->IsBlockBased()) {
2749         // iiter->key() does NOT start with the desired prefix.  Because
2750         // Seek() finds the first key that is >= the seek target, this
2751         // means that iiter->key() > prefix.  Thus, any data blocks coming
2752         // after the data block corresponding to iiter->key() cannot
2753         // possibly contain the key.  Thus, the corresponding data block
2754         // is the only one that could potentially contain the prefix.
2755         BlockHandle handle = iiter->value().handle;
2756         may_match = filter->PrefixMayMatch(
2757             prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
2758             /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
2759       }
2760     }
2761   }
2762 
2763   if (filter_checked) {
2764     Statistics* statistics = rep_->ioptions.statistics;
2765     RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
2766     if (!may_match) {
2767       RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
2768     }
2769   }
2770 
2771   return may_match;
2772 }
2773 
2774 template <class TBlockIter, typename TValue>
2775 void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
2776   SeekImpl(&target);
2777 }
2778 
2779 template <class TBlockIter, typename TValue>
2780 void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
2781   SeekImpl(nullptr);
2782 }
2783 
2784 template <class TBlockIter, typename TValue>
2785 void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
2786     const Slice* target) {
2787   is_out_of_bound_ = false;
2788   is_at_first_key_from_index_ = false;
2789   if (target && !CheckPrefixMayMatch(*target)) {
2790     ResetDataIter();
2791     return;
2792   }
2793 
2794   bool need_seek_index = true;
2795   if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
2796     // Reseek.
2797     prev_block_offset_ = index_iter_->value().handle.offset();
2798 
2799     if (target) {
2800       // We can avoid an index seek if:
2801       // 1. The new seek key is larger than the current key
2802       // 2. The new seek key is within the upper bound of the block
2803       // Since we don't necessarily know the internal key for either
2804       // the current key or the upper bound, we check user keys and
2805       // exclude the equality case. Considering internal keys could
2806       // improve handling of the boundary cases, but it would complicate
2807       // the code.
2808       if (user_comparator_.Compare(ExtractUserKey(*target),
2809                                    block_iter_.user_key()) > 0 &&
2810           user_comparator_.Compare(ExtractUserKey(*target),
2811                                    index_iter_->user_key()) < 0) {
2812         need_seek_index = false;
2813       }
2814     }
2815   }
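  // Illustrative reseek example (assumed keys, not from the original source):
  // if the iterator currently stands at user key "k12" in a block whose index
  // key (upper bound) is "k20", a reseek to "k15" satisfies both conditions
  // above, so the index seek is skipped and only block_iter_.Seek() runs
  // below.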
2816 
2817   if (need_seek_index) {
2818     if (target) {
2819       index_iter_->Seek(*target);
2820     } else {
2821       index_iter_->SeekToFirst();
2822     }
2823 
2824     if (!index_iter_->Valid()) {
2825       ResetDataIter();
2826       return;
2827     }
2828   }
2829 
2830   IndexValue v = index_iter_->value();
2831   const bool same_block = block_iter_points_to_real_block_ &&
2832                           v.handle.offset() == prev_block_offset_;
2833 
2834   // TODO(kolmike): Remove the != kBlockCacheTier condition.
2835   if (!v.first_internal_key.empty() && !same_block &&
2836       (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
2837       read_options_.read_tier != kBlockCacheTier) {
2838     // Index contains the first key of the block, and it's >= target.
2839     // We can defer reading the block.
2840     is_at_first_key_from_index_ = true;
2841     // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
2842     // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
2843     // as that will be done later when the data block is actually read.
2844     ResetDataIter();
2845   } else {
2846     // Need to use the data block.
2847     if (!same_block) {
2848       InitDataBlock();
2849     } else {
2850       // When the user does a reseek, the iterate_upper_bound might have
2851       // changed. CheckDataBlockWithinUpperBound() needs to be called
2852       // explicitly if the reseek ends up in the same data block.
2853       // If the reseek ends up in a different block, InitDataBlock() will do
2854       // the iterator upper bound check.
2855       CheckDataBlockWithinUpperBound();
2856     }
2857 
2858     if (target) {
2859       block_iter_.Seek(*target);
2860     } else {
2861       block_iter_.SeekToFirst();
2862     }
2863     FindKeyForward();
2864   }
2865 
2866   CheckOutOfBound();
2867 
2868   if (target) {
2869     assert(!Valid() || ((block_type_ == BlockType::kIndex &&
2870                          !table_->get_rep()->index_key_includes_seq)
2871                             ? (user_comparator_.Compare(ExtractUserKey(*target),
2872                                                         key()) <= 0)
2873                             : (icomp_.Compare(*target, key()) <= 0)));
2874   }
2875 }
2876 
2877 template <class TBlockIter, typename TValue>
2878 void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
2879     const Slice& target) {
2880   is_out_of_bound_ = false;
2881   is_at_first_key_from_index_ = false;
2882   if (!CheckPrefixMayMatch(target)) {
2883     ResetDataIter();
2884     return;
2885   }
2886 
2887   SavePrevIndexValue();
2888 
2889   // Call Seek() rather than SeekForPrev() in the index block, because the
2890   // target data block will likely contain the position for `target`, the
2891   // same as Seek(), rather than the one before it.
2892   // For example, if we have three data blocks, each containing two keys:
2893   //   [2, 4]  [6, 8] [10, 12]
2894   //  (the keys in the index block would be [4, 8, 12])
2895   // and the user calls SeekForPrev(7), we need to go to the second block,
2896   // just like if they call Seek(7).
2897   // The only case where the block is different is when the seek lands on a
2898   // block boundary. For example, if they SeekForPrev(5), we should go to the
2899   // first block, rather than the second. However, we don't have the
2900   // information to distinguish the two cases unless we read the second block.
2901   // In that case, we end up reading two blocks.
2902   index_iter_->Seek(target);
2903 
2904   if (!index_iter_->Valid()) {
2905     if (!index_iter_->status().ok()) {
2906       ResetDataIter();
2907       return;
2908     }
2909 
2910     index_iter_->SeekToLast();
2911     if (!index_iter_->Valid()) {
2912       ResetDataIter();
2913       return;
2914     }
2915   }
2916 
2917   InitDataBlock();
2918 
2919   block_iter_.SeekForPrev(target);
2920 
2921   FindKeyBackward();
2922   CheckDataBlockWithinUpperBound();
2923   assert(!block_iter_.Valid() ||
2924          icomp_.Compare(target, block_iter_.key()) >= 0);
2925 }
2926 
2927 template <class TBlockIter, typename TValue>
2928 void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
2929   is_out_of_bound_ = false;
2930   is_at_first_key_from_index_ = false;
2931   SavePrevIndexValue();
2932   index_iter_->SeekToLast();
2933   if (!index_iter_->Valid()) {
2934     ResetDataIter();
2935     return;
2936   }
2937   InitDataBlock();
2938   block_iter_.SeekToLast();
2939   FindKeyBackward();
2940   CheckDataBlockWithinUpperBound();
2941 }
2942 
2943 template <class TBlockIter, typename TValue>
2944 void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
2945   if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
2946     return;
2947   }
2948   assert(block_iter_points_to_real_block_);
2949   block_iter_.Next();
2950   FindKeyForward();
2951   CheckOutOfBound();
2952 }
2953 
2954 template <class TBlockIter, typename TValue>
2955 bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
2956     IterateResult* result) {
2957   Next();
2958   bool is_valid = Valid();
2959   if (is_valid) {
2960     result->key = key();
2961     result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
2962   }
2963   return is_valid;
2964 }
2965 
2966 template <class TBlockIter, typename TValue>
2967 void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
2968   if (is_at_first_key_from_index_) {
2969     is_at_first_key_from_index_ = false;
2970 
2971     index_iter_->Prev();
2972     if (!index_iter_->Valid()) {
2973       return;
2974     }
2975 
2976     InitDataBlock();
2977     block_iter_.SeekToLast();
2978   } else {
2979     assert(block_iter_points_to_real_block_);
2980     block_iter_.Prev();
2981   }
2982 
2983   FindKeyBackward();
2984 }
2985 
2986 template <class TBlockIter, typename TValue>
2987 void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
2988   BlockHandle data_block_handle = index_iter_->value().handle;
2989   if (!block_iter_points_to_real_block_ ||
2990       data_block_handle.offset() != prev_block_offset_ ||
2991       // if a previous attempt to read the block missed the cache, try again
2992       block_iter_.status().IsIncomplete()) {
2993     if (block_iter_points_to_real_block_) {
2994       ResetDataIter();
2995     }
2996     auto* rep = table_->get_rep();
2997 
2998     // Prefetch additional data for range scans (iterators). Enabled only for
2999     // user reads.
3000     // Implicit auto readahead:
3001     //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
3002     // Explicit user requested readahead:
3003     //   Enabled from the very first IO when ReadOptions.readahead_size is set.
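    // Illustrative note: for implicit auto readahead, once more than
    // kMinNumFileReadsToStartAutoReadahead sequential reads have been seen,
    // the readahead window doubles on each further prefetch (e.g. an assumed
    // 8KB start would grow 8KB -> 16KB -> 32KB -> ...) until it is capped at
    // BlockBasedTable::kMaxAutoReadaheadSize.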
3004     if (lookup_context_.caller != TableReaderCaller::kCompaction) {
3005       if (read_options_.readahead_size == 0) {
3006         // Implicit auto readahead
3007         num_file_reads_++;
3008         if (num_file_reads_ >
3009             BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
3010           if (!rep->file->use_direct_io() &&
3011               (data_block_handle.offset() +
3012                    static_cast<size_t>(block_size(data_block_handle)) >
3013                readahead_limit_)) {
3014             // Buffered I/O
3015             // Discarding the return status of Prefetch calls intentionally, as
3016             // we can fall back to reading from disk if Prefetch fails.
3017             rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
3018             readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
3019                                                    readahead_size_);
3020             // Keep exponentially increasing readahead size until
3021             // kMaxAutoReadaheadSize.
3022             readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
3023                                        readahead_size_ * 2);
3024           } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
3025             // Direct I/O
3026             // Let FilePrefetchBuffer take care of the readahead.
3027             rep->CreateFilePrefetchBuffer(
3028                 BlockBasedTable::kInitAutoReadaheadSize,
3029                 BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
3030           }
3031         }
3032       } else if (!prefetch_buffer_) {
3033         // Explicit user requested readahead
3034         // The actual condition is:
3035         // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
3036         rep->CreateFilePrefetchBuffer(read_options_.readahead_size,
3037                                       read_options_.readahead_size,
3038                                       &prefetch_buffer_);
3039       }
3040     } else if (!prefetch_buffer_) {
3041       rep->CreateFilePrefetchBuffer(compaction_readahead_size_,
3042                                     compaction_readahead_size_,
3043                                     &prefetch_buffer_);
3044     }
3045 
3046     Status s;
3047     table_->NewDataBlockIterator<TBlockIter>(
3048         read_options_, data_block_handle, &block_iter_, block_type_,
3049         /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(),
3050         /*for_compaction=*/lookup_context_.caller ==
3051             TableReaderCaller::kCompaction);
3052     block_iter_points_to_real_block_ = true;
3053     CheckDataBlockWithinUpperBound();
3054   }
3055 }
3056 
3057 template <class TBlockIter, typename TValue>
3058 bool BlockBasedTableIterator<TBlockIter, TValue>::MaterializeCurrentBlock() {
3059   assert(is_at_first_key_from_index_);
3060   assert(!block_iter_points_to_real_block_);
3061   assert(index_iter_->Valid());
3062 
3063   is_at_first_key_from_index_ = false;
3064   InitDataBlock();
3065   assert(block_iter_points_to_real_block_);
3066   block_iter_.SeekToFirst();
3067 
3068   if (!block_iter_.Valid() ||
3069       icomp_.Compare(block_iter_.key(),
3070                      index_iter_->value().first_internal_key) != 0) {
3071     // Uh oh.
3072     block_iter_.Invalidate(Status::Corruption(
3073         "first key in index doesn't match first key in block"));
3074     return false;
3075   }
3076 
3077   return true;
3078 }
3079 
3080 template <class TBlockIter, typename TValue>
3081 void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
3082   // This method's code is kept short to make it likely to be inlined.
3083 
3084   assert(!is_out_of_bound_);
3085   assert(block_iter_points_to_real_block_);
3086 
3087   if (!block_iter_.Valid()) {
3088     // This is the only call site of FindBlockForward(), but it's extracted into
3089     // a separate method to keep FindKeyForward() short and likely to be
3090     // inlined. When transitioning to a different block, we call
3091     // FindBlockForward(), which is much longer and is probably not inlined.
3092     FindBlockForward();
3093   } else {
3094     // This is the fast path that avoids a function call.
3095   }
3096 }
3097 
3098 template <class TBlockIter, typename TValue>
3099 void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
3100   // TODO: the while loop is inherited from two-level-iterator. We don't know
3101   // whether a block can be empty; if not, the loop could be replaced by an "if".
3102   do {
3103     if (!block_iter_.status().ok()) {
3104       return;
3105     }
3106     // Whether next data block is out of upper bound, if there is one.
3107     const bool next_block_is_out_of_bound =
3108         read_options_.iterate_upper_bound != nullptr &&
3109         block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
3110     assert(!next_block_is_out_of_bound ||
3111            user_comparator_.Compare(*read_options_.iterate_upper_bound,
3112                                     index_iter_->user_key()) <= 0);
3113     ResetDataIter();
3114     index_iter_->Next();
3115     if (next_block_is_out_of_bound) {
3116       // The next block is out of bound. No need to read it.
3117       TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
3118       // We need to make sure this is not the last data block before setting
3119       // is_out_of_bound_, since the index key for the last data block can be
3120       // larger than smallest key of the next file on the same level.
3121       if (index_iter_->Valid()) {
3122         is_out_of_bound_ = true;
3123       }
3124       return;
3125     }
3126 
3127     if (!index_iter_->Valid()) {
3128       return;
3129     }
3130 
3131     IndexValue v = index_iter_->value();
3132 
3133     // TODO(kolmike): Remove the != kBlockCacheTier condition.
3134     if (!v.first_internal_key.empty() &&
3135         read_options_.read_tier != kBlockCacheTier) {
3136       // Index contains the first key of the block. Defer reading the block.
3137       is_at_first_key_from_index_ = true;
3138       return;
3139     }
3140 
3141     InitDataBlock();
3142     block_iter_.SeekToFirst();
3143   } while (!block_iter_.Valid());
3144 }
3145 
3146 template <class TBlockIter, typename TValue>
3147 void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
3148   while (!block_iter_.Valid()) {
3149     if (!block_iter_.status().ok()) {
3150       return;
3151     }
3152 
3153     ResetDataIter();
3154     index_iter_->Prev();
3155 
3156     if (index_iter_->Valid()) {
3157       InitDataBlock();
3158       block_iter_.SeekToLast();
3159     } else {
3160       return;
3161     }
3162   }
3163 
3164   // We could have checked the lower bound here too, but we opt not to for
3165   // code simplicity.
3166 }
3167 
3168 template <class TBlockIter, typename TValue>
3169 void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
3170   if (read_options_.iterate_upper_bound != nullptr && Valid()) {
3171     is_out_of_bound_ = user_comparator_.Compare(
3172                            *read_options_.iterate_upper_bound, user_key()) <= 0;
3173   }
3174 }
3175 
3176 template <class TBlockIter, typename TValue>
3177 void BlockBasedTableIterator<TBlockIter,
3178                              TValue>::CheckDataBlockWithinUpperBound() {
3179   if (read_options_.iterate_upper_bound != nullptr &&
3180       block_iter_points_to_real_block_) {
3181     data_block_within_upper_bound_ =
3182         (user_comparator_.Compare(*read_options_.iterate_upper_bound,
3183                                   index_iter_->user_key()) > 0);
3184   }
3185 }
3186 
3187 InternalIterator* BlockBasedTable::NewIterator(
3188     const ReadOptions& read_options, const SliceTransform* prefix_extractor,
3189     Arena* arena, bool skip_filters, TableReaderCaller caller,
3190     size_t compaction_readahead_size) {
3191   BlockCacheLookupContext lookup_context{caller};
3192   bool need_upper_bound_check =
3193       PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
3194   if (arena == nullptr) {
3195     return new BlockBasedTableIterator<DataBlockIter>(
3196         this, read_options, rep_->internal_comparator,
3197         NewIndexIterator(
3198             read_options,
3199             need_upper_bound_check &&
3200                 rep_->index_type == BlockBasedTableOptions::kHashSearch,
3201             /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
3202         !skip_filters && !read_options.total_order_seek &&
3203             prefix_extractor != nullptr,
3204         need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
3205         compaction_readahead_size);
3206   } else {
3207     auto* mem =
3208         arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
3209     return new (mem) BlockBasedTableIterator<DataBlockIter>(
3210         this, read_options, rep_->internal_comparator,
3211         NewIndexIterator(
3212             read_options,
3213             need_upper_bound_check &&
3214                 rep_->index_type == BlockBasedTableOptions::kHashSearch,
3215             /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
3216         !skip_filters && !read_options.total_order_seek &&
3217             prefix_extractor != nullptr,
3218         need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
3219         compaction_readahead_size);
3220   }
3221 }
3222 
3223 FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
3224     const ReadOptions& read_options) {
3225   if (rep_->fragmented_range_dels == nullptr) {
3226     return nullptr;
3227   }
3228   SequenceNumber snapshot = kMaxSequenceNumber;
3229   if (read_options.snapshot != nullptr) {
3230     snapshot = read_options.snapshot->GetSequenceNumber();
3231   }
3232   return new FragmentedRangeTombstoneIterator(
3233       rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
3234 }
3235 
3236 bool BlockBasedTable::FullFilterKeyMayMatch(
3237     const ReadOptions& read_options, FilterBlockReader* filter,
3238     const Slice& internal_key, const bool no_io,
3239     const SliceTransform* prefix_extractor, GetContext* get_context,
3240     BlockCacheLookupContext* lookup_context) const {
3241   if (filter == nullptr || filter->IsBlockBased()) {
3242     return true;
3243   }
3244   Slice user_key = ExtractUserKey(internal_key);
3245   const Slice* const const_ikey_ptr = &internal_key;
3246   bool may_match = true;
3247   if (rep_->whole_key_filtering) {
3248     size_t ts_sz =
3249         rep_->internal_comparator.user_comparator()->timestamp_size();
3250     Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
3251     may_match =
3252         filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
3253                             no_io, const_ikey_ptr, get_context, lookup_context);
3254   } else if (!read_options.total_order_seek && prefix_extractor &&
3255              rep_->table_properties->prefix_extractor_name.compare(
3256                  prefix_extractor->Name()) == 0 &&
3257              prefix_extractor->InDomain(user_key) &&
3258              !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
3259                                      prefix_extractor, kNotValid, no_io,
3260                                      const_ikey_ptr, get_context,
3261                                      lookup_context)) {
3262     may_match = false;
3263   }
3264   if (may_match) {
3265     RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
3266     PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
3267   }
3268   return may_match;
3269 }
3270 
3271 void BlockBasedTable::FullFilterKeysMayMatch(
3272     const ReadOptions& read_options, FilterBlockReader* filter,
3273     MultiGetRange* range, const bool no_io,
3274     const SliceTransform* prefix_extractor,
3275     BlockCacheLookupContext* lookup_context) const {
3276   if (filter == nullptr || filter->IsBlockBased()) {
3277     return;
3278   }
3279   if (rep_->whole_key_filtering) {
3280     filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
3281                          lookup_context);
3282   } else if (!read_options.total_order_seek && prefix_extractor &&
3283              rep_->table_properties->prefix_extractor_name.compare(
3284                  prefix_extractor->Name()) == 0) {
3285     filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
3286                              lookup_context);
3287   }
3288 }
3289 
3290 Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
3291                             GetContext* get_context,
3292                             const SliceTransform* prefix_extractor,
3293                             bool skip_filters) {
3294   assert(key.size() >= 8);  // key must be internal key
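  // (An internal key is the user key followed by an 8-byte trailer packing the
  // sequence number and value type, hence the size check above.)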
3295   assert(get_context != nullptr);
3296   Status s;
3297   const bool no_io = read_options.read_tier == kBlockCacheTier;
3298 
3299   FilterBlockReader* const filter =
3300       !skip_filters ? rep_->filter.get() : nullptr;
3301 
3302   // First check the full filter.
3303   // If the full filter is not useful, then go into each block.
3304   uint64_t tracing_get_id = get_context->get_tracing_get_id();
3305   BlockCacheLookupContext lookup_context{
3306       TableReaderCaller::kUserGet, tracing_get_id,
3307       /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
3308   if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3309     // Trace the key since it contains both user key and sequence number.
3310     lookup_context.referenced_key = key.ToString();
3311     lookup_context.get_from_user_specified_snapshot =
3312         read_options.snapshot != nullptr;
3313   }
3314   const bool may_match =
3315       FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor,
3316                             get_context, &lookup_context);
3317   if (!may_match) {
3318     RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
3319     PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
3320   } else {
3321     IndexBlockIter iiter_on_stack;
3322     // if prefix_extractor found in block differs from options, disable
3323     // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
3324     bool need_upper_bound_check = false;
3325     if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
3326       need_upper_bound_check = PrefixExtractorChanged(
3327           rep_->table_properties.get(), prefix_extractor);
3328     }
3329     auto iiter =
3330         NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
3331                          get_context, &lookup_context);
3332     std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3333     if (iiter != &iiter_on_stack) {
3334       iiter_unique_ptr.reset(iiter);
3335     }
3336 
3337     size_t ts_sz =
3338         rep_->internal_comparator.user_comparator()->timestamp_size();
3339     bool matched = false;  // if such user key matched a key in SST
3340     bool done = false;
3341     for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
3342       IndexValue v = iiter->value();
3343 
3344       bool not_exist_in_filter =
3345           filter != nullptr && filter->IsBlockBased() == true &&
3346           !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
3347                                prefix_extractor, v.handle.offset(), no_io,
3348                                /*const_ikey_ptr=*/nullptr, get_context,
3349                                &lookup_context);
3350 
3351       if (not_exist_in_filter) {
3352         // Not found
3353         // TODO: think about interaction with Merge. If a user key cannot
3354         // cross one data block, we should be fine.
3355         RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
3356         PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
3357         break;
3358       }
3359 
3360       if (!v.first_internal_key.empty() && !skip_filters &&
3361           UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3362                   .Compare(ExtractUserKey(key),
3363                            ExtractUserKey(v.first_internal_key)) < 0) {
3364         // The requested key falls between highest key in previous block and
3365         // lowest key in current block.
3366         break;
3367       }
3368 
3369       BlockCacheLookupContext lookup_data_block_context{
3370           TableReaderCaller::kUserGet, tracing_get_id,
3371           /*get_from_user_specified_snapshot=*/read_options.snapshot !=
3372               nullptr};
3373       bool does_referenced_key_exist = false;
3374       DataBlockIter biter;
3375       uint64_t referenced_data_size = 0;
3376       NewDataBlockIterator<DataBlockIter>(
3377           read_options, v.handle, &biter, BlockType::kData, get_context,
3378           &lookup_data_block_context,
3379           /*s=*/Status(), /*prefetch_buffer*/ nullptr);
3380 
3381       if (no_io && biter.status().IsIncomplete()) {
3382         // couldn't get block from block_cache
3383         // Update Saver.state to Found because we are only looking for
3384         // whether we can guarantee the key is not there when "no_io" is set
3385         get_context->MarkKeyMayExist();
3386         break;
3387       }
3388       if (!biter.status().ok()) {
3389         s = biter.status();
3390         break;
3391       }
3392 
3393       bool may_exist = biter.SeekForGet(key);
3394       // If user-specified timestamp is supported, we cannot end the search
3395       // just because hash index lookup indicates the key+ts does not exist.
3396       if (!may_exist && ts_sz == 0) {
3397         // HashSeek cannot find the key in this block, and the iter is not at
3398         // the end of the block, i.e. the key cannot be in the following
3399         // blocks either. In this case, the seek_key cannot be found, so we
3400         // break from the top level for-loop.
3401         done = true;
3402       } else {
3403         // Call the *saver function on each entry/block until it returns false
3404         for (; biter.Valid(); biter.Next()) {
3405           ParsedInternalKey parsed_key;
3406           if (!ParseInternalKey(biter.key(), &parsed_key)) {
3407             s = Status::Corruption(Slice());
3408           }
3409 
3410           if (!get_context->SaveValue(
3411                   parsed_key, biter.value(), &matched,
3412                   biter.IsValuePinned() ? &biter : nullptr)) {
3413             if (get_context->State() == GetContext::GetState::kFound) {
3414               does_referenced_key_exist = true;
3415               referenced_data_size = biter.key().size() + biter.value().size();
3416             }
3417             done = true;
3418             break;
3419           }
3420         }
3421         s = biter.status();
3422       }
3423       // Write the block cache access record.
3424       if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3425         // Avoid making copy of block_key, cf_name, and referenced_key when
3426         // constructing the access record.
3427         Slice referenced_key;
3428         if (does_referenced_key_exist) {
3429           referenced_key = biter.key();
3430         } else {
3431           referenced_key = key;
3432         }
3433         BlockCacheTraceRecord access_record(
3434             rep_->ioptions.env->NowMicros(),
3435             /*block_key=*/"", lookup_data_block_context.block_type,
3436             lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
3437             /*cf_name=*/"", rep_->level_for_tracing(),
3438             rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
3439             lookup_data_block_context.is_cache_hit,
3440             lookup_data_block_context.no_insert,
3441             lookup_data_block_context.get_id,
3442             lookup_data_block_context.get_from_user_specified_snapshot,
3443             /*referenced_key=*/"", referenced_data_size,
3444             lookup_data_block_context.num_keys_in_block,
3445             does_referenced_key_exist);
3446         block_cache_tracer_->WriteBlockAccess(
3447             access_record, lookup_data_block_context.block_key,
3448             rep_->cf_name_for_tracing(), referenced_key);
3449       }
3450 
3451       if (done) {
3452         // Avoid the extra Next which is expensive in two-level indexes
3453         break;
3454       }
3455     }
3456     if (matched && filter != nullptr && !filter->IsBlockBased()) {
3457       RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
3458       PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
3459                                 rep_->level);
3460     }
3461     if (s.ok()) {
3462       s = iiter->status();
3463     }
3464   }
3465 
3466   return s;
3467 }
3468 
3469 using MultiGetRange = MultiGetContext::Range;
3470 void BlockBasedTable::MultiGet(const ReadOptions& read_options,
3471                                const MultiGetRange* mget_range,
3472                                const SliceTransform* prefix_extractor,
3473                                bool skip_filters) {
3474   FilterBlockReader* const filter =
3475       !skip_filters ? rep_->filter.get() : nullptr;
3476   MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
3477                                mget_range->end());
3478 
3479   // First check the full filter.
3480   // If the full filter is not useful, then go into each block.
3481   const bool no_io = read_options.read_tier == kBlockCacheTier;
3482   uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
3483   if (!sst_file_range.empty() && sst_file_range.begin()->get_context) {
3484     tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
3485   }
3486   BlockCacheLookupContext lookup_context{
3487       TableReaderCaller::kUserMultiGet, tracing_mget_id,
3488       /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
3489   FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
3490                          prefix_extractor, &lookup_context);
3491 
3492   if (skip_filters || !sst_file_range.empty()) {
3493     IndexBlockIter iiter_on_stack;
3494     // if prefix_extractor found in block differs from options, disable
3495     // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
3496     bool need_upper_bound_check = false;
3497     if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
3498       need_upper_bound_check = PrefixExtractorChanged(
3499           rep_->table_properties.get(), prefix_extractor);
3500     }
3501     auto iiter =
3502         NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
3503                          sst_file_range.begin()->get_context, &lookup_context);
3504     std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3505     if (iiter != &iiter_on_stack) {
3506       iiter_unique_ptr.reset(iiter);
3507     }
3508 
3509     uint64_t offset = std::numeric_limits<uint64_t>::max();
3510     autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
3511     autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results;
3512     autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
3513     char stack_buf[kMultiGetReadStackBufSize];
3514     std::unique_ptr<char[]> block_buf;
3515     {
3516       MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
3517                                      sst_file_range.end());
3518 
3519       CachableEntry<UncompressionDict> uncompression_dict;
3520       Status uncompression_dict_status;
3521       if (rep_->uncompression_dict_reader) {
3522         uncompression_dict_status =
3523             rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
3524                 nullptr /* prefetch_buffer */, no_io,
3525                 sst_file_range.begin()->get_context, &lookup_context,
3526                 &uncompression_dict);
3527       }
3528 
3529       const UncompressionDict& dict = uncompression_dict.GetValue()
3530                                           ? *uncompression_dict.GetValue()
3531                                           : UncompressionDict::GetEmptyDict();
3532 
3533       size_t total_len = 0;
3534       ReadOptions ro = read_options;
3535       ro.read_tier = kBlockCacheTier;
3536 
3537       for (auto miter = data_block_range.begin();
3538            miter != data_block_range.end(); ++miter) {
3539         const Slice& key = miter->ikey;
3540         iiter->Seek(miter->ikey);
3541 
3542         IndexValue v;
3543         if (iiter->Valid()) {
3544           v = iiter->value();
3545         }
3546         if (!iiter->Valid() ||
3547             (!v.first_internal_key.empty() && !skip_filters &&
3548              UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3549                      .Compare(ExtractUserKey(key),
3550                               ExtractUserKey(v.first_internal_key)) < 0)) {
3551           // The requested key falls between highest key in previous block and
3552           // lowest key in current block.
3553           *(miter->s) = iiter->status();
3554           data_block_range.SkipKey(miter);
3555           sst_file_range.SkipKey(miter);
3556           continue;
3557         }
3558 
3559         if (!uncompression_dict_status.ok()) {
3560           *(miter->s) = uncompression_dict_status;
3561           data_block_range.SkipKey(miter);
3562           sst_file_range.SkipKey(miter);
3563           continue;
3564         }
3565 
3566         statuses.emplace_back();
3567         results.emplace_back();
3568         if (v.handle.offset() == offset) {
3569           // We're going to reuse the block for this key later on. No need to
3570           // look it up now. Place a null handle
3571           block_handles.emplace_back(BlockHandle::NullBlockHandle());
3572           continue;
3573         }
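        // Illustrative note: the branch above fires when several consecutive
        // MultiGet keys map to the same data block; only the first key pays
        // for the cache lookup / read, and later keys reuse that block via
        // the null handle in the second pass further down.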
3574         // Lookup the cache for the given data block referenced by an index
3575         // iterator value (i.e. BlockHandle). If it exists in the cache,
3576         // initialize block to the contents of the data block.
3577         offset = v.handle.offset();
3578         BlockHandle handle = v.handle;
3579         BlockCacheLookupContext lookup_data_block_context(
3580             TableReaderCaller::kUserMultiGet);
3581         Status s = RetrieveBlock(
3582             nullptr, ro, handle, dict, &(results.back()), BlockType::kData,
3583             miter->get_context, &lookup_data_block_context,
3584             /* for_compaction */ false, /* use_cache */ true);
3585         if (s.IsIncomplete()) {
3586           s = Status::OK();
3587         }
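        // Incomplete here only means the block was not found in the block
        // cache (ro.read_tier was set to kBlockCacheTier above for this
        // cache-only probe), so it is treated as OK and the block is fetched
        // via the combined read below.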
3588         if (s.ok() && !results.back().IsEmpty()) {
3589           // Found it in the cache. Add NULL handle to indicate there is
3590           // nothing to read from disk
3591           block_handles.emplace_back(BlockHandle::NullBlockHandle());
3592         } else {
3593           block_handles.emplace_back(handle);
3594           total_len += block_size(handle);
3595         }
3596       }
3597 
3598       if (total_len) {
3599         char* scratch = nullptr;
3600         // If the blocks need to be uncompressed and we don't need the
3601         // compressed blocks, then we can use a contiguous block of
3602         // memory to read in all the blocks as it will be temporary
3603         // storage
3604         // 1. If blocks are compressed and compressed block cache is there,
3605         //    alloc heap bufs
3606         // 2. If blocks are uncompressed, alloc heap bufs
3607         // 3. If blocks are compressed and no compressed block cache, use
3608         //    stack buf
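        // Illustrative note: only case 3 reaches the shared-scratch path
        // below; there the combined read uses stack_buf when total_len fits
        // in kMultiGetReadStackBufSize and a heap buffer (block_buf)
        // otherwise.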
3609         if (rep_->table_options.block_cache_compressed == nullptr &&
3610             rep_->blocks_maybe_compressed) {
3611           if (total_len <= kMultiGetReadStackBufSize) {
3612             scratch = stack_buf;
3613           } else {
3614             scratch = new char[total_len];
3615             block_buf.reset(scratch);
3616           }
3617         }
3618         RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
3619                                &statuses, &results, scratch, dict);
3620       }
3621     }
3622 
3623     DataBlockIter first_biter;
3624     DataBlockIter next_biter;
3625     size_t idx_in_batch = 0;
3626     for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
3627          ++miter) {
3628       Status s;
3629       GetContext* get_context = miter->get_context;
3630       const Slice& key = miter->ikey;
3631       bool matched = false;  // if such user key matched a key in SST
3632       bool done = false;
3633       bool first_block = true;
3634       do {
3635         DataBlockIter* biter = nullptr;
3636         bool reusing_block = true;
3637         uint64_t referenced_data_size = 0;
3638         bool does_referenced_key_exist = false;
3639         BlockCacheLookupContext lookup_data_block_context(
3640             TableReaderCaller::kUserMultiGet, tracing_mget_id,
3641             /*get_from_user_specified_snapshot=*/read_options.snapshot !=
3642                 nullptr);
3643         if (first_block) {
3644           if (!block_handles[idx_in_batch].IsNull() ||
3645               !results[idx_in_batch].IsEmpty()) {
3646             first_biter.Invalidate(Status::OK());
3647             NewDataBlockIterator<DataBlockIter>(
3648                 read_options, results[idx_in_batch], &first_biter,
3649                 statuses[idx_in_batch]);
3650             reusing_block = false;
3651           }
3652           biter = &first_biter;
3653           idx_in_batch++;
3654         } else {
3655           IndexValue v = iiter->value();
3656           if (!v.first_internal_key.empty() && !skip_filters &&
3657               UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3658                       .Compare(ExtractUserKey(key),
3659                                ExtractUserKey(v.first_internal_key)) < 0) {
3660             // The requested key falls between the highest key in the previous
3661             // block and the lowest key in the current block.
3662             break;
3663           }
3664 
3665           next_biter.Invalidate(Status::OK());
3666           NewDataBlockIterator<DataBlockIter>(
3667               read_options, iiter->value().handle, &next_biter,
3668               BlockType::kData, get_context, &lookup_data_block_context,
3669               Status(), nullptr);
3670           biter = &next_biter;
3671           reusing_block = false;
3672         }
3673 
3674         if (read_options.read_tier == kBlockCacheTier &&
3675             biter->status().IsIncomplete()) {
3676           // couldn't get block from block_cache
3677           // Update Saver.state to Found because we are only looking for
3678           // whether we can guarantee the key is not there when "no_io" is set
3679           get_context->MarkKeyMayExist();
3680           break;
3681         }
3682         if (!biter->status().ok()) {
3683           s = biter->status();
3684           break;
3685         }
3686 
3687         bool may_exist = biter->SeekForGet(key);
3688         if (!may_exist) {
3689           // HashSeek cannot find the key in this block, and the iter is not at
3690           // the end of the block, i.e. the key cannot be in the following blocks
3691           // either. In this case, the seek_key cannot be found, so we break
3692           // from the top level for-loop.
3693           break;
3694         }
3695 
3696         // Call the *saver function on each entry/block until it returns false
3697         for (; biter->Valid(); biter->Next()) {
3698           ParsedInternalKey parsed_key;
3699           Cleanable dummy;
3700           Cleanable* value_pinner = nullptr;
3701           if (!ParseInternalKey(biter->key(), &parsed_key)) {
3702             s = Status::Corruption(Slice());
3703           }
3704           if (biter->IsValuePinned()) {
3705             if (reusing_block) {
3706               Cache* block_cache = rep_->table_options.block_cache.get();
3707               assert(biter->cache_handle() != nullptr);
3708               block_cache->Ref(biter->cache_handle());
3709               dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
3710                                     biter->cache_handle());
3711               value_pinner = &dummy;
3712             } else {
3713               value_pinner = biter;
3714             }
3715           }
3716           if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
3717                                       value_pinner)) {
3718             if (get_context->State() == GetContext::GetState::kFound) {
3719               does_referenced_key_exist = true;
3720               referenced_data_size =
3721                   biter->key().size() + biter->value().size();
3722             }
3723             done = true;
3724             break;
3725           }
3726           s = biter->status();
3727         }
3728         // Write the block cache access.
3729         if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3730           // Avoid making copy of block_key, cf_name, and referenced_key when
3731           // constructing the access record.
3732           Slice referenced_key;
3733           if (does_referenced_key_exist) {
3734             referenced_key = biter->key();
3735           } else {
3736             referenced_key = key;
3737           }
3738           BlockCacheTraceRecord access_record(
3739               rep_->ioptions.env->NowMicros(),
3740               /*block_key=*/"", lookup_data_block_context.block_type,
3741               lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
3742               /*cf_name=*/"", rep_->level_for_tracing(),
3743               rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
3744               lookup_data_block_context.is_cache_hit,
3745               lookup_data_block_context.no_insert,
3746               lookup_data_block_context.get_id,
3747               lookup_data_block_context.get_from_user_specified_snapshot,
3748               /*referenced_key=*/"", referenced_data_size,
3749               lookup_data_block_context.num_keys_in_block,
3750               does_referenced_key_exist);
3751           block_cache_tracer_->WriteBlockAccess(
3752               access_record, lookup_data_block_context.block_key,
3753               rep_->cf_name_for_tracing(), referenced_key);
3754         }
3755         s = biter->status();
3756         if (done) {
3757           // Avoid the extra Next which is expensive in two-level indexes
3758           break;
3759         }
3760         if (first_block) {
3761           iiter->Seek(key);
3762         }
3763         first_block = false;
3764         iiter->Next();
3765       } while (iiter->Valid());
3766 
3767       if (matched && filter != nullptr && !filter->IsBlockBased()) {
3768         RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
3769         PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
3770                                   rep_->level);
3771       }
3772       if (s.ok()) {
3773         s = iiter->status();
3774       }
3775       *(miter->s) = s;
3776     }
3777   }
3778 }
3779 
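// Loads into the block cache every data block whose keys may fall in the
// range [begin, end]: scans the index and creates a (discarded) data block
// iterator for each referenced block, which brings the block into the cache.
// The block that straddles the end boundary is loaded as well.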
3780 Status BlockBasedTable::Prefetch(const Slice* const begin,
3781                                  const Slice* const end) {
3782   auto& comparator = rep_->internal_comparator;
3783   UserComparatorWrapper user_comparator(comparator.user_comparator());
3784   // pre-condition
3785   if (begin && end && comparator.Compare(*begin, *end) > 0) {
3786     return Status::InvalidArgument(*begin, *end);
3787   }
3788   BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
3789   IndexBlockIter iiter_on_stack;
3790   auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
3791                                 &iiter_on_stack, /*get_context=*/nullptr,
3792                                 &lookup_context);
3793   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3794   if (iiter != &iiter_on_stack) {
3795     iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
3796   }
3797 
3798   if (!iiter->status().ok()) {
3799     // error opening index iterator
3800     return iiter->status();
3801   }
3802 
3803   // Indicates whether we are on the last page that needs to be pre-fetched.
3804   bool prefetching_boundary_page = false;
3805 
3806   for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
3807        iiter->Next()) {
3808     BlockHandle block_handle = iiter->value().handle;
3809     const bool is_user_key = !rep_->index_key_includes_seq;
3810     if (end &&
3811         ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
3812          (is_user_key &&
3813           user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
3814       if (prefetching_boundary_page) {
3815         break;
3816       }
3817 
3818       // The index entry represents the last key in the data block.
3819       // We should load this page into memory as well, but no more
3820       prefetching_boundary_page = true;
3821     }
3822 
3823     // Load the block specified by the block_handle into the block cache
3824     DataBlockIter biter;
3825 
3826     NewDataBlockIterator<DataBlockIter>(
3827         ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
3828         /*get_context=*/nullptr, &lookup_context, Status(),
3829         /*prefetch_buffer=*/nullptr);
3830 
3831     if (!biter.status().ok()) {
3832       // there was an unexpected error while pre-fetching
3833       return biter.status();
3834     }
3835   }
3836 
3837   return Status::OK();
3838 }
3839 
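// Verifies the table by reading every metaindex/meta block and then every
// data block referenced by the index; the block reads themselves validate the
// per-block checksums, and the first failure is returned.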
3840 Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
3841                                        TableReaderCaller caller) {
3842   Status s;
3843   // Check Meta blocks
3844   std::unique_ptr<Block> metaindex;
3845   std::unique_ptr<InternalIterator> metaindex_iter;
3846   s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex,
3847                          &metaindex_iter);
3848   if (s.ok()) {
3849     s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
3850     if (!s.ok()) {
3851       return s;
3852     }
3853   } else {
3854     return s;
3855   }
3856   // Check Data blocks
3857   IndexBlockIter iiter_on_stack;
3858   BlockCacheLookupContext context{caller};
3859   InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
3860       read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
3861       /*get_context=*/nullptr, &context);
3862   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3863   if (iiter != &iiter_on_stack) {
3864     iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
3865   }
3866   if (!iiter->status().ok()) {
3867     // error opening index iterator
3868     return iiter->status();
3869   }
3870   s = VerifyChecksumInBlocks(read_options, iiter);
3871   return s;
3872 }
3873 
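// Reads every data block referenced by the given index iterator through a
// FilePrefetchBuffer with a fixed readahead size (no exponential ramp-up,
// since the whole file is scanned); each BlockFetcher read checks the block's
// checksum, so a corrupt block surfaces as a non-OK status.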
3874 Status BlockBasedTable::VerifyChecksumInBlocks(
3875     const ReadOptions& read_options,
3876     InternalIteratorBase<IndexValue>* index_iter) {
3877   Status s;
3878   // We are scanning the whole file, so no need to do exponential
3879   // increasing of the buffer size.
3880   size_t readahead_size = (read_options.readahead_size != 0)
3881                               ? read_options.readahead_size
3882                               : kMaxAutoReadaheadSize;
3883   // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
3884   // needed there.
3885   FilePrefetchBuffer prefetch_buffer(
3886       rep_->file.get(), readahead_size /* readahead_size */,
3887       readahead_size /* max_readahead_size */,
3888       !rep_->ioptions.allow_mmap_reads /* enable */);
3889 
3890   for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
3891     s = index_iter->status();
3892     if (!s.ok()) {
3893       break;
3894     }
3895     BlockHandle handle = index_iter->value().handle;
3896     BlockContents contents;
3897     BlockFetcher block_fetcher(
3898         rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
3899         &contents, rep_->ioptions, false /* decompress */,
3900         false /*maybe_compressed*/, BlockType::kData,
3901         UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3902     s = block_fetcher.ReadBlockContents();
3903     if (!s.ok()) {
3904       break;
3905     }
3906   }
3907   return s;
3908 }
3909 
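// Maps a metaindex key (meta block name) to the corresponding BlockType so
// that meta block reads are attributed to the right block type. Unknown names
// are unexpected and reported as BlockType::kInvalid.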
3910 BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
3911     const Slice& meta_block_name) {
3912   if (meta_block_name.starts_with(kFilterBlockPrefix) ||
3913       meta_block_name.starts_with(kFullFilterBlockPrefix) ||
3914       meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
3915     return BlockType::kFilter;
3916   }
3917 
3918   if (meta_block_name == kPropertiesBlock) {
3919     return BlockType::kProperties;
3920   }
3921 
3922   if (meta_block_name == kCompressionDictBlock) {
3923     return BlockType::kCompressionDictionary;
3924   }
3925 
3926   if (meta_block_name == kRangeDelBlock) {
3927     return BlockType::kRangeDeletion;
3928   }
3929 
3930   if (meta_block_name == kHashIndexPrefixesBlock) {
3931     return BlockType::kHashIndexPrefixes;
3932   }
3933 
3934   if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
3935     return BlockType::kHashIndexMetadata;
3936   }
3937 
3938   assert(false);
3939   return BlockType::kInvalid;
3940 }
3941 
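// Walks the metaindex and reads each referenced meta block to verify its
// checksum. A corrupt properties block gets a second chance through
// TryReadPropertiesWithGlobalSeqno, which accounts for properties blocks
// whose global sequence number field may have been rewritten in place.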
3942 Status BlockBasedTable::VerifyChecksumInMetaBlocks(
3943     InternalIteratorBase<Slice>* index_iter) {
3944   Status s;
3945   for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
3946     s = index_iter->status();
3947     if (!s.ok()) {
3948       break;
3949     }
3950     BlockHandle handle;
3951     Slice input = index_iter->value();
3952     s = handle.DecodeFrom(&input);
3953     BlockContents contents;
3954     const Slice meta_block_name = index_iter->key();
3955     BlockFetcher block_fetcher(
3956         rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
3957         ReadOptions(), handle, &contents, rep_->ioptions,
3958         false /* decompress */, false /*maybe_compressed*/,
3959         GetBlockTypeForMetaBlockByName(meta_block_name),
3960         UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3961     s = block_fetcher.ReadBlockContents();
3962     if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
3963       TableProperties* table_properties;
3964       s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
3965                                            index_iter->value(),
3966                                            &table_properties);
3967       delete table_properties;
3968     }
3969     if (!s.ok()) {
3970       break;
3971     }
3972   }
3973   return s;
3974 }
3975 
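// Test-only helper: returns true iff the (uncompressed) block cache currently
// holds an entry for the given block handle.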
3976 bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
3977   assert(rep_ != nullptr);
3978 
3979   Cache* const cache = rep_->table_options.block_cache.get();
3980   if (cache == nullptr) {
3981     return false;
3982   }
3983 
3984   char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
3985   Slice cache_key =
3986       GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
3987                   cache_key_storage);
3988 
3989   Cache::Handle* const cache_handle = cache->Lookup(cache_key);
3990   if (cache_handle == nullptr) {
3991     return false;
3992   }
3993 
3994   cache->Release(cache_handle);
3995 
3996   return true;
3997 }
3998 
3999 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
4000                                       const Slice& key) {
4001   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
4002       options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
4003       /*get_context=*/nullptr, /*lookup_context=*/nullptr));
4004   iiter->Seek(key);
4005   assert(iiter->Valid());
4006 
4007   return TEST_BlockInCache(iiter->value().handle);
4008 }
4009 
4010 // REQUIRES: The following fields of rep_ should have already been populated:
4011 //  1. file
4012 //  2. index_handle,
4013 //  3. options
4014 //  4. internal_comparator
4015 //  5. index_type
4016 Status BlockBasedTable::CreateIndexReader(
4017     FilePrefetchBuffer* prefetch_buffer,
4018     InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
4019     bool pin, BlockCacheLookupContext* lookup_context,
4020     std::unique_ptr<IndexReader>* index_reader) {
4021   // kHashSearch requires a non-empty prefix_extractor, but we bypass checking
4022   // prefix_extractor here since we have no access to MutableCFOptions.
4023   // Instead, the need_upper_bound_check flag in BlockBasedTable::NewIndexIterator
4024   // handles this: if prefix_extractor does not match prefix_extractor_name from
4025   // table properties, Hash Index is turned off by setting total_order_seek to true.
4026 
4027   switch (rep_->index_type) {
4028     case BlockBasedTableOptions::kTwoLevelIndexSearch: {
4029       return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
4030                                           prefetch, pin, lookup_context,
4031                                           index_reader);
4032     }
4033     case BlockBasedTableOptions::kBinarySearch:
4034     case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
4035       return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
4036                                              prefetch, pin, lookup_context,
4037                                              index_reader);
4038     }
4039     case BlockBasedTableOptions::kHashSearch: {
4040       std::unique_ptr<Block> metaindex_guard;
4041       std::unique_ptr<InternalIterator> metaindex_iter_guard;
4042       auto meta_index_iter = preloaded_meta_index_iter;
4043       if (meta_index_iter == nullptr) {
4044         auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
4045                                     &metaindex_iter_guard);
4046         if (!s.ok()) {
4047           // we simply fall back to binary search in case there is any
4048           // problem with prefix hash index loading.
4049           ROCKS_LOG_WARN(rep_->ioptions.info_log,
4050                          "Unable to read the metaindex block."
4051                          " Fall back to binary search index.");
4052           return BinarySearchIndexReader::Create(this, prefetch_buffer,
4053                                                  use_cache, prefetch, pin,
4054                                                  lookup_context, index_reader);
4055         }
4056         meta_index_iter = metaindex_iter_guard.get();
4057       }
4058 
4059       return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
4060                                      use_cache, prefetch, pin, lookup_context,
4061                                      index_reader);
4062     }
4063     default: {
4064       std::string error_message =
4065           "Unrecognized index type: " + ToString(rep_->index_type);
4066       return Status::InvalidArgument(error_message.c_str());
4067     }
4068   }
4069 }
4070 
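// Returns the approximate file offset the given index iterator points to: the
// referenced block's offset if the iterator is valid, otherwise an offset near
// the end of the data (the data size from table properties, or the metaindex
// handle's offset as a fallback).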
4071 uint64_t BlockBasedTable::ApproximateOffsetOf(
4072     const InternalIteratorBase<IndexValue>& index_iter) const {
4073   uint64_t result = 0;
4074   if (index_iter.Valid()) {
4075     BlockHandle handle = index_iter.value().handle;
4076     result = handle.offset();
4077   } else {
4078     // The iterator is past the last key in the file. If table_properties is not
4079     // available, approximate the offset by returning the offset of the
4080     // metaindex block (which is right near the end of the file).
4081     if (rep_->table_properties) {
4082       result = rep_->table_properties->data_size;
4083     }
4084     // table_properties is not present in the table.
4085     if (result == 0) {
4086       result = rep_->footer.metaindex_handle().offset();
4087     }
4088   }
4089 
4090   return result;
4091 }
4092 
4093 uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
4094                                               TableReaderCaller caller) {
4095   BlockCacheLookupContext context(caller);
4096   IndexBlockIter iiter_on_stack;
4097   ReadOptions ro;
4098   ro.total_order_seek = true;
4099   auto index_iter =
4100       NewIndexIterator(ro, /*disable_prefix_seek=*/true,
4101                        /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
4102                        /*lookup_context=*/&context);
4103   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
4104   if (index_iter != &iiter_on_stack) {
4105     iiter_unique_ptr.reset(index_iter);
4106   }
4107 
4108   index_iter->Seek(key);
4109   return ApproximateOffsetOf(*index_iter);
4110 }
4111 
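// Approximates the number of file bytes spanned by the key range [start, end]
// as the difference between the approximate offsets of the two boundary keys.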
4112 uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
4113                                           TableReaderCaller caller) {
4114   assert(rep_->internal_comparator.Compare(start, end) <= 0);
4115 
4116   BlockCacheLookupContext context(caller);
4117   IndexBlockIter iiter_on_stack;
4118   ReadOptions ro;
4119   ro.total_order_seek = true;
4120   auto index_iter =
4121       NewIndexIterator(ro, /*disable_prefix_seek=*/true,
4122                        /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
4123                        /*lookup_context=*/&context);
4124   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
4125   if (index_iter != &iiter_on_stack) {
4126     iiter_unique_ptr.reset(index_iter);
4127   }
4128 
4129   index_iter->Seek(start);
4130   uint64_t start_offset = ApproximateOffsetOf(*index_iter);
4131   index_iter->Seek(end);
4132   uint64_t end_offset = ApproximateOffsetOf(*index_iter);
4133 
4134   assert(end_offset >= start_offset);
4135   return end_offset - start_offset;
4136 }
4137 
4138 bool BlockBasedTable::TEST_FilterBlockInCache() const {
4139   assert(rep_ != nullptr);
4140   return TEST_BlockInCache(rep_->filter_handle);
4141 }
4142 
4143 bool BlockBasedTable::TEST_IndexBlockInCache() const {
4144   assert(rep_ != nullptr);
4145 
4146   return TEST_BlockInCache(rep_->footer.index_handle());
4147 }
4148 
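// Materializes every key/value pair of every data block into kv_pair_blocks,
// one KVPairBlock per data block, copying keys and values. Blocks that fail
// to read are skipped rather than failing the whole call.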
4149 Status BlockBasedTable::GetKVPairsFromDataBlocks(
4150     std::vector<KVPairBlock>* kv_pair_blocks) {
4151   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
4152       NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
4153                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
4154                        /*lookup_contex=*/nullptr));
4155 
4156   Status s = blockhandles_iter->status();
4157   if (!s.ok()) {
4158     // Cannot read Index Block
4159     return s;
4160   }
4161 
4162   for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
4163        blockhandles_iter->Next()) {
4164     s = blockhandles_iter->status();
4165 
4166     if (!s.ok()) {
4167       break;
4168     }
4169 
4170     std::unique_ptr<InternalIterator> datablock_iter;
4171     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
4172         ReadOptions(), blockhandles_iter->value().handle,
4173         /*input_iter=*/nullptr, /*type=*/BlockType::kData,
4174         /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
4175         /*prefetch_buffer=*/nullptr));
4176     s = datablock_iter->status();
4177 
4178     if (!s.ok()) {
4179       // Error reading the block - Skipped
4180       continue;
4181     }
4182 
4183     KVPairBlock kv_pair_block;
4184     for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
4185          datablock_iter->Next()) {
4186       s = datablock_iter->status();
4187       if (!s.ok()) {
4188         // Error reading the block - Skipped
4189         break;
4190       }
4191       const Slice& key = datablock_iter->key();
4192       const Slice& value = datablock_iter->value();
4193       std::string key_copy = std::string(key.data(), key.size());
4194       std::string value_copy = std::string(value.data(), value.size());
4195 
4196       kv_pair_block.push_back(
4197           std::make_pair(std::move(key_copy), std::move(value_copy)));
4198     }
4199     kv_pair_blocks->push_back(std::move(kv_pair_block));
4200   }
4201   return Status::OK();
4202 }
4203 
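// Writes a human-readable dump of the table to out_file: footer, metaindex
// entries, table properties, filter details, index block, compression
// dictionary, range deletions, and finally all data blocks.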
4204 Status BlockBasedTable::DumpTable(WritableFile* out_file) {
4205   // Output Footer
4206   out_file->Append(
4207       "Footer Details:\n"
4208       "--------------------------------------\n"
4209       "  ");
4210   out_file->Append(rep_->footer.ToString().c_str());
4211   out_file->Append("\n");
4212 
4213   // Output MetaIndex
4214   out_file->Append(
4215       "Metaindex Details:\n"
4216       "--------------------------------------\n");
4217   std::unique_ptr<Block> metaindex;
4218   std::unique_ptr<InternalIterator> metaindex_iter;
4219   Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex,
4220                                 &metaindex_iter);
4221   if (s.ok()) {
4222     for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
4223          metaindex_iter->Next()) {
4224       s = metaindex_iter->status();
4225       if (!s.ok()) {
4226         return s;
4227       }
4228       if (metaindex_iter->key() == rocksdb::kPropertiesBlock) {
4229         out_file->Append("  Properties block handle: ");
4230         out_file->Append(metaindex_iter->value().ToString(true).c_str());
4231         out_file->Append("\n");
4232       } else if (metaindex_iter->key() == rocksdb::kCompressionDictBlock) {
4233         out_file->Append("  Compression dictionary block handle: ");
4234         out_file->Append(metaindex_iter->value().ToString(true).c_str());
4235         out_file->Append("\n");
4236       } else if (strstr(metaindex_iter->key().ToString().c_str(),
4237                         "filter.rocksdb.") != nullptr) {
4238         out_file->Append("  Filter block handle: ");
4239         out_file->Append(metaindex_iter->value().ToString(true).c_str());
4240         out_file->Append("\n");
4241       } else if (metaindex_iter->key() == rocksdb::kRangeDelBlock) {
4242         out_file->Append("  Range deletion block handle: ");
4243         out_file->Append(metaindex_iter->value().ToString(true).c_str());
4244         out_file->Append("\n");
4245       }
4246     }
4247     out_file->Append("\n");
4248   } else {
4249     return s;
4250   }
4251 
4252   // Output TableProperties
4253   const rocksdb::TableProperties* table_properties;
4254   table_properties = rep_->table_properties.get();
4255 
4256   if (table_properties != nullptr) {
4257     out_file->Append(
4258         "Table Properties:\n"
4259         "--------------------------------------\n"
4260         "  ");
4261     out_file->Append(table_properties->ToString("\n  ", ": ").c_str());
4262     out_file->Append("\n");
4263   }
4264 
4265   if (rep_->filter) {
4266     out_file->Append(
4267         "Filter Details:\n"
4268         "--------------------------------------\n"
4269         "  ");
4270     out_file->Append(rep_->filter->ToString().c_str());
4271     out_file->Append("\n");
4272   }
4273 
4274   // Output Index block
4275   s = DumpIndexBlock(out_file);
4276   if (!s.ok()) {
4277     return s;
4278   }
4279 
4280   // Output compression dictionary
4281   if (rep_->uncompression_dict_reader) {
4282     CachableEntry<UncompressionDict> uncompression_dict;
4283     s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
4284         nullptr /* prefetch_buffer */, false /* no_io */,
4285         nullptr /* get_context */, nullptr /* lookup_context */,
4286         &uncompression_dict);
4287     if (!s.ok()) {
4288       return s;
4289     }
4290 
4291     assert(uncompression_dict.GetValue());
4292 
4293     const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
4294     out_file->Append(
4295         "Compression Dictionary:\n"
4296         "--------------------------------------\n");
4297     out_file->Append("  size (bytes): ");
4298     out_file->Append(rocksdb::ToString(raw_dict.size()));
4299     out_file->Append("\n\n");
4300     out_file->Append("  HEX    ");
4301     out_file->Append(raw_dict.ToString(true).c_str());
4302     out_file->Append("\n\n");
4303   }
4304 
4305   // Output range deletions block
4306   auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
4307   if (range_del_iter != nullptr) {
4308     range_del_iter->SeekToFirst();
4309     if (range_del_iter->Valid()) {
4310       out_file->Append(
4311           "Range deletions:\n"
4312           "--------------------------------------\n"
4313           "  ");
4314       for (; range_del_iter->Valid(); range_del_iter->Next()) {
4315         DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
4316       }
4317       out_file->Append("\n");
4318     }
4319     delete range_del_iter;
4320   }
4321   // Output Data blocks
4322   s = DumpDataBlocks(out_file);
4323 
4324   return s;
4325 }
4326 
4327 Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
4328   out_file->Append(
4329       "Index Details:\n"
4330       "--------------------------------------\n");
4331   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
4332       NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
4333                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
4334                        /*lookup_contex=*/nullptr));
4335   Status s = blockhandles_iter->status();
4336   if (!s.ok()) {
4337     out_file->Append("Can not read Index Block \n\n");
4338     return s;
4339   }
4340 
4341   out_file->Append("  Block key hex dump: Data block handle\n");
4342   out_file->Append("  Block key ascii\n\n");
4343   for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
4344        blockhandles_iter->Next()) {
4345     s = blockhandles_iter->status();
4346     if (!s.ok()) {
4347       break;
4348     }
4349     Slice key = blockhandles_iter->key();
4350     Slice user_key;
4351     InternalKey ikey;
4352     if (!rep_->index_key_includes_seq) {
4353       user_key = key;
4354     } else {
4355       ikey.DecodeFrom(key);
4356       user_key = ikey.user_key();
4357     }
4358 
4359     out_file->Append("  HEX    ");
4360     out_file->Append(user_key.ToString(true).c_str());
4361     out_file->Append(": ");
4362     out_file->Append(blockhandles_iter->value()
4363                          .ToString(true, rep_->index_has_first_key)
4364                          .c_str());
4365     out_file->Append("\n");
4366 
4367     std::string str_key = user_key.ToString();
4368     std::string res_key("");
4369     char cspace = ' ';
4370     for (size_t i = 0; i < str_key.size(); i++) {
4371       res_key.append(&str_key[i], 1);
4372       res_key.append(1, cspace);
4373     }
4374     out_file->Append("  ASCII  ");
4375     out_file->Append(res_key.c_str());
4376     out_file->Append("\n  ------\n");
4377   }
4378   out_file->Append("\n");
4379   return Status::OK();
4380 }
4381 
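// Dumps every data block's key/value pairs and, at the end, summary
// statistics (count, min/max/avg block size) across all data blocks.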
4382 Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
4383   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
4384       NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
4385                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
4386                        /*lookup_contex=*/nullptr));
4387   Status s = blockhandles_iter->status();
4388   if (!s.ok()) {
4389     out_file->Append("Can not read Index Block \n\n");
4390     return s;
4391   }
4392 
4393   uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
4394   uint64_t datablock_size_max = 0;
4395   uint64_t datablock_size_sum = 0;
4396 
4397   size_t block_id = 1;
4398   for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
4399        block_id++, blockhandles_iter->Next()) {
4400     s = blockhandles_iter->status();
4401     if (!s.ok()) {
4402       break;
4403     }
4404 
4405     BlockHandle bh = blockhandles_iter->value().handle;
4406     uint64_t datablock_size = bh.size();
4407     datablock_size_min = std::min(datablock_size_min, datablock_size);
4408     datablock_size_max = std::max(datablock_size_max, datablock_size);
4409     datablock_size_sum += datablock_size;
4410 
4411     out_file->Append("Data Block # ");
4412     out_file->Append(rocksdb::ToString(block_id));
4413     out_file->Append(" @ ");
4414     out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str());
4415     out_file->Append("\n");
4416     out_file->Append("--------------------------------------\n");
4417 
4418     std::unique_ptr<InternalIterator> datablock_iter;
4419     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
4420         ReadOptions(), blockhandles_iter->value().handle,
4421         /*input_iter=*/nullptr, /*type=*/BlockType::kData,
4422         /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
4423         /*prefetch_buffer=*/nullptr));
4424     s = datablock_iter->status();
4425 
4426     if (!s.ok()) {
4427       out_file->Append("Error reading the block - Skipped \n\n");
4428       continue;
4429     }
4430 
4431     for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
4432          datablock_iter->Next()) {
4433       s = datablock_iter->status();
4434       if (!s.ok()) {
4435         out_file->Append("Error reading the block - Skipped \n");
4436         break;
4437       }
4438       DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
4439     }
4440     out_file->Append("\n");
4441   }
4442 
4443   uint64_t num_datablocks = block_id - 1;
4444   if (num_datablocks) {
4445     double datablock_size_avg =
4446         static_cast<double>(datablock_size_sum) / num_datablocks;
4447     out_file->Append("Data Block Summary:\n");
4448     out_file->Append("--------------------------------------");
4449     out_file->Append("\n  # data blocks: ");
4450     out_file->Append(rocksdb::ToString(num_datablocks));
4451     out_file->Append("\n  min data block size: ");
4452     out_file->Append(rocksdb::ToString(datablock_size_min));
4453     out_file->Append("\n  max data block size: ");
4454     out_file->Append(rocksdb::ToString(datablock_size_max));
4455     out_file->Append("\n  avg data block size: ");
4456     out_file->Append(rocksdb::ToString(datablock_size_avg));
4457     out_file->Append("\n");
4458   }
4459 
4460   return Status::OK();
4461 }
4462 
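// Prints a single key/value pair: first as hex (user key only for the key),
// then as space-separated ASCII with embedded '\0' bytes escaped.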
4463 void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
4464                                    WritableFile* out_file) {
4465   InternalKey ikey;
4466   ikey.DecodeFrom(key);
4467 
4468   out_file->Append("  HEX    ");
4469   out_file->Append(ikey.user_key().ToString(true).c_str());
4470   out_file->Append(": ");
4471   out_file->Append(value.ToString(true).c_str());
4472   out_file->Append("\n");
4473 
4474   std::string str_key = ikey.user_key().ToString();
4475   std::string str_value = value.ToString();
4476   std::string res_key(""), res_value("");
4477   char cspace = ' ';
4478   for (size_t i = 0; i < str_key.size(); i++) {
4479     if (str_key[i] == '\0') {
4480       res_key.append("\\0", 2);
4481     } else {
4482       res_key.append(&str_key[i], 1);
4483     }
4484     res_key.append(1, cspace);
4485   }
4486   for (size_t i = 0; i < str_value.size(); i++) {
4487     if (str_value[i] == '\0') {
4488       res_value.append("\\0", 2);
4489     } else {
4490       res_value.append(&str_value[i], 1);
4491     }
4492     res_value.append(1, cspace);
4493   }
4494 
4495   out_file->Append("  ASCII  ");
4496   out_file->Append(res_key.c_str());
4497   out_file->Append(": ");
4498   out_file->Append(res_value.c_str());
4499   out_file->Append("\n  ------\n");
4500 }
4501 
4502 }  // namespace rocksdb
4503