// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/block_based_table_reader.h"
#include <algorithm>
#include <array>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"

#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"

#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"

#include "monitoring/perf_context_imp.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/util.h"
#include "util/xxhash.h"
namespace ROCKSDB_NAMESPACE {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

typedef BlockBasedTable::IndexReader IndexReader;

// Based on experiments, a 256 KB readahead size provides the best performance
// for auto readahead. Experiment data is in PR #3282.
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;

BlockBasedTable::~BlockBasedTable() {
  delete rep_;
}

std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);

template <typename TBlocklike>
class BlocklikeTraits;

template <>
class BlocklikeTraits<BlockContents> {
 public:
  static BlockContents* Create(BlockContents&& contents,
                               SequenceNumber /* global_seqno */,
                               size_t /* read_amp_bytes_per_bit */,
                               Statistics* /* statistics */,
                               bool /* using_zstd */,
                               const FilterPolicy* /* filter_policy */) {
    return new BlockContents(std::move(contents));
  }

  static uint32_t GetNumRestarts(const BlockContents& /* contents */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<ParsedFullFilterBlock> {
 public:
  static ParsedFullFilterBlock* Create(BlockContents&& contents,
                                       SequenceNumber /* global_seqno */,
                                       size_t /* read_amp_bytes_per_bit */,
                                       Statistics* /* statistics */,
                                       bool /* using_zstd */,
                                       const FilterPolicy* filter_policy) {
    return new ParsedFullFilterBlock(filter_policy, std::move(contents));
  }

  static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<Block> {
 public:
  static Block* Create(BlockContents&& contents, SequenceNumber global_seqno,
                       size_t read_amp_bytes_per_bit, Statistics* statistics,
                       bool /* using_zstd */,
                       const FilterPolicy* /* filter_policy */) {
    return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
                     statistics);
  }

  static uint32_t GetNumRestarts(const Block& block) {
    return block.NumRestarts();
  }
};

template <>
class BlocklikeTraits<UncompressionDict> {
 public:
  static UncompressionDict* Create(BlockContents&& contents,
                                   SequenceNumber /* global_seqno */,
                                   size_t /* read_amp_bytes_per_bit */,
                                   Statistics* /* statistics */,
                                   bool using_zstd,
                                   const FilterPolicy* /* filter_policy */) {
    return new UncompressionDict(contents.data, std::move(contents.allocation),
                                 using_zstd);
  }

  static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) {
    return 0;
  }
};

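// Illustrative sketch (not part of the build): the BlocklikeTraits
// specializations above let a single template such as ReadBlockFromFile()
// construct the right in-memory object for whatever block type it is
// instantiated with, e.g.:
//
//   std::unique_ptr<Block> data_block;
//   // ... fetch BlockContents into `contents`, then:
//   data_block.reset(BlocklikeTraits<Block>::Create(
//       std::move(contents), global_seqno, read_amp_bytes_per_bit,
//       statistics, /*using_zstd=*/false, /*filter_policy=*/nullptr));
//
// Parameters a given specialization does not need (e.g. filter_policy for
// Block) are simply ignored by that specialization.
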
namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success, fill *result and return OK; the caller owns *result.
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
template <typename TBlocklike>
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed, BlockType block_type,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
    bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) {
  assert(result);

  BlockContents contents;
  BlockFetcher block_fetcher(
      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
      do_uncompress, maybe_compressed, block_type, uncompression_dict,
      cache_options, memory_allocator, nullptr, for_compaction);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    result->reset(BlocklikeTraits<TBlocklike>::Create(
        std::move(contents), global_seqno, read_amp_bytes_per_bit,
        ioptions.statistics, using_zstd, filter_policy));
  }

  return s;
}

inline MemoryAllocator* GetMemoryAllocator(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache.get()
             ? table_options.block_cache->memory_allocator()
             : nullptr;
}

inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache_compressed.get()
             ? table_options.block_cache_compressed->memory_allocator()
             : nullptr;
}

// Delete the entry residing in the cache.
template <class Entry>
void DeleteCachedEntry(const Slice& /*key*/, void* value) {
  auto entry = reinterpret_cast<Entry*>(value);
  delete entry;
}

// Release the cached entry and decrement its ref count.
// Forces the entry to be erased from the cache once no longer referenced.
void ForceReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, true /* force_erase */);
}

// Release the cached entry and decrement its ref count.
// Do not force erase
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, false /* force_erase */);
}
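
// Illustrative sketch (not part of the build): these callbacks are meant to
// be passed as function pointers rather than called directly. A typical
// pairing, under the Cache API used in this file, looks like:
//
//   // On insert, register the deleter that frees the parsed block:
//   cache->Insert(key, block, charge, &DeleteCachedEntry<Block>, &handle);
//   // When an iterator hands out data backed by the cached block, release
//   // the handle once the iterator is destroyed:
//   iter->RegisterCleanup(&ReleaseCachedEntry, cache, handle);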

// For hash based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator
bool PrefixExtractorChanged(const TableProperties* table_properties,
                            const SliceTransform* prefix_extractor) {
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off hash index if prefix_extractor is not set; if prefix_extractor
  // is set but prefix_extractor_block is not set, also disable hash index
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
    return true;
  }

  // prefix_extractor and prefix_extractor_block are both non-empty
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
    return true;
  } else {
    return false;
  }
}
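
// Illustrative sketch (not part of the build): if a table was written with
// prefix_extractor_name == "rocksdb.FixedPrefix.8" but is later opened with
// a different prefix extractor (or none at all), PrefixExtractorChanged()
// returns true and reads fall back to total-order seek instead of trusting
// the hash index, e.g.:
//
//   options.prefix_extractor.reset(NewFixedPrefixTransform(4));
//   // table_properties->prefix_extractor_name is "rocksdb.FixedPrefix.8"
//   // -> PrefixExtractorChanged(...) == true -> hash index is bypassed.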

CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
  CacheAllocationPtr heap_buf;
  heap_buf = AllocateBlock(buf.size(), allocator);
  memcpy(heap_buf.get(), buf.data(), buf.size());
  return heap_buf;
}

}  // namespace

// Encapsulates common functionality for the various index reader
// implementations. Provides access to the index block regardless of whether
// it is owned by the reader or stored in the cache, or whether it is pinned
// in the cache or not.
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
 public:
  IndexReaderCommon(const BlockBasedTable* t,
                    CachableEntry<Block>&& index_block)
      : table_(t), index_block_(std::move(index_block)) {
    assert(table_ != nullptr);
  }

 protected:
  static Status ReadIndexBlock(const BlockBasedTable* table,
                               FilePrefetchBuffer* prefetch_buffer,
                               const ReadOptions& read_options, bool use_cache,
                               GetContext* get_context,
                               BlockCacheLookupContext* lookup_context,
                               CachableEntry<Block>* index_block);

  const BlockBasedTable* table() const { return table_; }

  const InternalKeyComparator* internal_comparator() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    return &table_->get_rep()->internal_comparator;
  }

  bool index_has_first_key() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_has_first_key;
  }

  bool index_key_includes_seq() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_key_includes_seq;
  }

  bool index_value_is_full() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_value_is_full;
  }

  bool cache_index_blocks() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->table_options.cache_index_and_filter_blocks;
  }

  Status GetOrReadIndexBlock(bool no_io, GetContext* get_context,
                             BlockCacheLookupContext* lookup_context,
                             CachableEntry<Block>* index_block) const;

  size_t ApproximateIndexBlockMemoryUsage() const {
    assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
    return index_block_.GetOwnValue()
               ? index_block_.GetValue()->ApproximateMemoryUsage()
               : 0;
  }

 private:
  const BlockBasedTable* table_;
  CachableEntry<Block> index_block_;
};

Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) {
  PERF_TIMER_GUARD(read_index_block_nanos);

  assert(table != nullptr);
  assert(index_block != nullptr);
  assert(index_block->IsEmpty());

  const Rep* const rep = table->get_rep();
  assert(rep != nullptr);

  const Status s = table->RetrieveBlock(
      prefetch_buffer, read_options, rep->footer.index_handle(),
      UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
      get_context, lookup_context, /* for_compaction */ false, use_cache);

  return s;
}

Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) const {
  assert(index_block != nullptr);

  if (!index_block_.IsEmpty()) {
    index_block->SetUnownedValue(index_block_.GetValue());
    return Status::OK();
  }

  ReadOptions read_options;
  if (no_io) {
    read_options.read_tier = kBlockCacheTier;
  }

  return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
                        cache_index_blocks(), get_context, lookup_context,
                        index_block);
}

// Index that allows binary search lookup in a two-level index structure.
class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read the partition index from the file and create an instance for
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin,
                       BlockCacheLookupContext* lookup_context,
                       std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    index_reader->reset(
        new PartitionIndexReader(table, std::move(index_block)));

    return Status::OK();
  }

  // Return a two-level iterator: the first level is over the partition index.
  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    InternalIteratorBase<IndexValue>* it = nullptr;

    Statistics* kNullStats = nullptr;
    // Filters are already checked before seeking the index
    if (!partition_map_.empty()) {
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = NewTwoLevelIterator(
          new BlockBasedTable::PartitionedIndexIteratorState(table(),
                                                             &partition_map_),
          index_block.GetValue()->NewIndexIterator(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_has_first_key(),
              index_key_includes_seq(), index_value_is_full()));
    } else {
      ReadOptions ro;
      ro.fill_cache = read_options.fill_cache;
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = new BlockBasedTableIterator<IndexBlockIter, IndexValue>(
          table(), ro, *internal_comparator(),
          index_block.GetValue()->NewIndexIterator(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_has_first_key(),
              index_key_includes_seq(), index_value_is_full()),
          false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
          lookup_context ? lookup_context->caller
                         : TableReaderCaller::kUncategorized);
    }

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;

    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
    // on-stack BlockIter while the state is on heap. Currently it assumes
    // the first level iter is always on heap and will attempt to delete it
    // in its destructor.
  }

  void CacheDependencies(bool pin) override {
    // Before reading the partitions, prefetch them to avoid lots of IOs
    BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
    const BlockBasedTable::Rep* rep = table()->rep_;
    IndexBlockIter biter;
    BlockHandle handle;
    Statistics* kNullStats = nullptr;

    CachableEntry<Block> index_block;
    Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
                                   &lookup_context, &index_block);
    if (!s.ok()) {
      ROCKS_LOG_WARN(rep->ioptions.info_log,
                     "Error retrieving top-level index block while trying to "
                     "cache index partitions: %s",
                     s.ToString().c_str());
      return;
    }

    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), &biter,
        kNullStats, true, index_has_first_key(), index_key_includes_seq(),
        index_value_is_full());
    // Index partitions are assumed to be consecutive. Prefetch them all.
    // Read the first block offset
    biter.SeekToFirst();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value().handle;
    uint64_t prefetch_off = handle.offset();

    // Read the last block's offset
    biter.SeekToLast();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value().handle;
    uint64_t last_off = handle.offset() + block_size(handle);
    uint64_t prefetch_len = last_off - prefetch_off;
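    // Worked example (illustrative numbers only): if the first partition
    // starts at offset 4096 and the last partition's handle is
    // {offset=65536, size=2048}, then with a 5-byte block trailer
    //   prefetch_off = 4096
    //   last_off     = 65536 + 2048 + 5 = 67589
    //   prefetch_len = 67589 - 4096    = 63493
    // i.e. a single contiguous read covers every partition.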
    std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
    rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer);
    s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off,
                                  static_cast<size_t>(prefetch_len));

    // After prefetch, read the partitions one by one
    biter.SeekToFirst();
    auto ro = ReadOptions();
    for (; biter.Valid(); biter.Next()) {
      handle = biter.value().handle;
      CachableEntry<Block> block;
      // TODO: Support counter batch update for partitioned index and
      // filter blocks
      s = table()->MaybeReadBlockAndLoadToCache(
          prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
          &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context,
          /*contents=*/nullptr);

      assert(s.ok() || block.GetValue() == nullptr);
      if (s.ok() && block.GetValue() != nullptr) {
        if (block.IsCached()) {
          if (pin) {
            partition_map_[handle.offset()] = std::move(block);
          }
        }
      }
    }
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
    return usage;
  }

 private:
  PartitionIndexReader(const BlockBasedTable* t,
                       CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};

// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read index from the file and create an instance for
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin,
                       BlockCacheLookupContext* lookup_context,
                       std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    index_reader->reset(
        new BinarySearchIndexReader(table, std::move(index_block)));

    return Status::OK();
  }

  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, true, index_has_first_key(), index_key_includes_seq(),
        index_value_is_full());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  BinarySearchIndexReader(const BlockBasedTable* t,
                          CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}
};

// Index that leverages an internal hash table to quicken the lookup for a
// given key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer,
                       InternalIterator* meta_index_iter, bool use_cache,
                       bool prefetch, bool pin,
                       BlockCacheLookupContext* lookup_context,
                       std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(index_reader != nullptr);
    assert(!pin || prefetch);

    const BlockBasedTable::Rep* rep = table->get_rep();
    assert(rep != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    // Note, failure to create prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.

    index_reader->reset(new HashIndexReader(table, std::move(index_block)));

    // Get prefixes block
    BlockHandle prefixes_handle;
    Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
                             &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
                      &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    RandomAccessFileReader* const file = rep->file.get();
    const Footer& footer = rep->footer;
    const ImmutableCFOptions& ioptions = rep->ioptions;
    const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
    MemoryAllocator* const memory_allocator =
        GetMemoryAllocator(rep->table_options);

    // Read contents for the blocks
    BlockContents prefixes_contents;
    BlockFetcher prefixes_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
        &prefixes_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexPrefixes,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      return s;
    }
    BlockContents prefixes_meta_contents;
    BlockFetcher prefixes_meta_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
        &prefixes_meta_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_meta_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    BlockPrefixIndex* prefix_index = nullptr;
    assert(rep->internal_prefix_transform.get() != nullptr);
    s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
                                 prefixes_contents.data,
                                 prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      HashIndexReader* const hash_index_reader =
          static_cast<HashIndexReader*>(index_reader->get());
      hash_index_reader->prefix_index_.reset(prefix_index);
    }

    return Status::OK();
  }

  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool disable_prefix_seek,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    Statistics* kNullStats = nullptr;
    const bool total_order_seek =
        read_options.total_order_seek || disable_prefix_seek;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, total_order_seek, index_has_first_key(),
        index_key_includes_seq(), index_value_is_full(),
        false /* block_contents_pinned */, prefix_index_.get());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<HashIndexReader*>(this));
#else
    if (prefix_index_) {
      usage += prefix_index_->ApproximateMemoryUsage();
    }
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  std::unique_ptr<BlockPrefixIndex> prefix_index_;
};
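
// Illustrative sketch (not part of the build): the hash index is only useful
// when the table is opened with a prefix extractor matching the one it was
// written with, e.g.:
//
//   BlockBasedTableOptions table_options;
//   table_options.index_type = BlockBasedTableOptions::kHashSearch;
//   Options options;
//   options.prefix_extractor.reset(NewFixedPrefixTransform(3));
//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
//
// If the extractors mismatch, or if building the prefix hash index fails,
// the reader falls back to plain binary search over the index block (see the
// note in Create() above).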

void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
                                            GetContext* get_context,
                                            size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  PERF_COUNTER_ADD(block_cache_hit_count, 1);
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_hit;
    get_context->get_context_stats_.num_cache_bytes_read += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_HIT);
    RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
      }
      break;

    case BlockType::kCompressionDictionary:
      // TODO: introduce perf counter for compression dictionary hit count
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
      }
      break;

    case BlockType::kIndex:
      PERF_COUNTER_ADD(block_cache_index_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
                                             GetContext* get_context) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce aggregate (not per-level) block cache miss count
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_miss;
  } else {
    RecordTick(statistics, BLOCK_CACHE_MISS);
  }

  // TODO: introduce perf counters for misses per block type
  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
                                                  GetContext* get_context,
                                                  size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce perf counters for block cache insertions
  if (get_context) {
    ++get_context->get_context_stats_.num_cache_add;
    get_context->get_context_stats_.num_cache_bytes_write += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_ADD);
    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_add;
        get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_add;
        get_context->get_context_stats_
            .num_cache_compression_dict_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
                   usage);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_add;
        get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_add;
        get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
      }
      break;
  }
}

Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, BlockType block_type,
    GetContext* get_context) const {
  auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);

  if (cache_handle != nullptr) {
    UpdateCacheHitMetrics(block_type, get_context,
                          block_cache->GetUsage(cache_handle));
  } else {
    UpdateCacheMissMetrics(block_type, get_context);
  }

  return cache_handle;
}

// Helper function to setup the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
  }
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                        &rep->compressed_cache_key_prefix_size);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}
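
// Illustrative sketch (not part of the build): a cache key prefix is either
// the file's own unique id or, failing that, a varint-encoded id drawn from
// the cache, e.g.:
//
//   char prefix[kMaxCacheKeyPrefixSize];
//   size_t prefix_size = 0;
//   GenerateCachePrefix(block_cache.get(), file, prefix, &prefix_size);
//   // If file->GetUniqueId() returned 0 bytes, the prefix now holds the
//   // varint encoding of block_cache->NewId() instead.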

namespace {
// Return true if `user_prop_name` in table_properties has a `true` value,
// or if the property is missing (for backward compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older versions don't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
                     user_prop_name.c_str(), pos->second.c_str());
    }
  }
  return true;
}

// Caller has to ensure seqno is not nullptr.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
  const auto& props = table_properties.user_collected_properties;
  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);

  *seqno = kDisableGlobalSequenceNumber;
  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
      std::array<char, 200> msg_buf;
      // This is not an external sst file, global_seqno is not supported.
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "A non-external sst file has a global seqno property with value %s",
          seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
      std::array<char, 200> msg_buf;
      // This is a v1 external sst file, global_seqno is not supported.
      snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u has a global seqno "
               "property with value %s",
               version, seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  // Since we have a plan to deprecate global_seqno, we do not return failure
  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  // SST is external.
  SequenceNumber global_seqno(0);
  if (seqno_pos != props.end()) {
    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  }
  // SstTableReader opens the table reader with kMaxSequenceNumber as
  // largest_seqno to denote that it is unknown.
  if (largest_seqno < kMaxSequenceNumber) {
    if (global_seqno == 0) {
      global_seqno = largest_seqno;
    }
    if (global_seqno != largest_seqno) {
      std::array<char, 200> msg_buf;
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u has a global seqno property "
          "with value %s, while the largest seqno in the file is %llu",
          version, seqno_pos->second.c_str(),
          static_cast<unsigned long long>(largest_seqno));
      return Status::Corruption(msg_buf.data());
    }
  }
  *seqno = global_seqno;

  if (global_seqno > kMaxSequenceNumber) {
    std::array<char, 200> msg_buf;
    snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u has a global seqno "
             "property with value %llu, which is greater than "
             "kMaxSequenceNumber",
             version, static_cast<unsigned long long>(global_seqno));
    return Status::Corruption(msg_buf.data());
  }

  return Status::OK();
}
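
// Illustrative walkthrough (not part of the build) of the cases above:
//   - No version property, no seqno property  -> OK, seqno disabled.
//   - No version property, seqno property set -> Corruption (not external).
//   - version == 1                            -> Corruption if seqno is set.
//   - version >= 2, seqno property == 42,
//     largest_seqno == 42                     -> OK, *seqno = 42.
//   - version >= 2, seqno property == 42,
//     largest_seqno == 7                      -> Corruption (mismatch).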
}  // namespace

Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}
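
// Illustrative sketch (not part of the build): callers stack-allocate a
// buffer big enough for the prefix plus a varint-encoded offset, e.g.:
//
//   char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
//   Slice key = GetCacheKey(rep_->cache_key_prefix,
//                           rep_->cache_key_prefix_size, handle, cache_key);
//   Cache::Handle* handle_in_cache =
//       block_cache->Lookup(key, rep_->ioptions.statistics);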

Status BlockBasedTable::Open(
    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
    const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    const SliceTransform* prefix_extractor,
    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
    const int level, const bool immortal_table,
    const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
    BlockCacheTracer* const block_cache_tracer) {
  table_reader->reset();

  Status s;
  Footer footer;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;

  // prefetch both index and filters, down to all partitions
  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  const bool preload_all = !table_options.cache_index_and_filter_blocks;

  if (!ioptions.allow_mmap_reads) {
    s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
                     preload_all, &prefetch_buffer);
  } else {
    // Should not prefetch for mmap mode.
    prefetch_buffer.reset(new FilePrefetchBuffer(
        nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
  }

  // Read in the following order:
  //    1. Footer
  //    2. [metaindex block]
  //    3. [meta block: properties]
  //    4. [meta block: range deletion tombstone]
  //    5. [meta block: compression dictionary]
  //    6. [meta block: index]
  //    7. [meta block: filter]
  s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
                         kBlockBasedTableMagicNumber);
  if (!s.ok()) {
    return s;
  }
  if (!BlockBasedTableSupportedVersion(footer.version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with newer "
        "version of RocksDB?");
  }

  // We've successfully read the footer. We are ready to serve requests.
  // Better not mutate rep_ after the creation, e.g. the raw pointer of
  // internal_prefix_transform will be used to create HashIndexReader, whose
  // reset may access a dangling pointer.
  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters, level,
                                      immortal_table);
  rep->file = std::move(file);
  rep->footer = footer;
  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  // We need to wrap data with internal_prefix_transform to make sure it can
  // handle prefixes correctly.
  if (prefix_extractor != nullptr) {
    rep->internal_prefix_transform.reset(
        new InternalKeySliceTransform(prefix_extractor));
  }
  SetupCacheKeyPrefix(rep);
  std::unique_ptr<BlockBasedTable> new_table(
      new BlockBasedTable(rep, block_cache_tracer));

  // page cache options
  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             std::string(rep->persistent_cache_key_prefix,
                                         rep->persistent_cache_key_prefix_size),
                             rep->ioptions.statistics);

  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  // handle to null, otherwise it may be seen as uninitialized during the below
  // meta-block reads.
  rep->compression_dict_handle = BlockHandle::NullBlockHandle();

  // Read metaindex
  std::unique_ptr<Block> metaindex;
  std::unique_ptr<InternalIterator> metaindex_iter;
  s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex,
                                    &metaindex_iter);
  if (!s.ok()) {
    return s;
  }

  // Populates table_properties and some fields that depend on it,
  // such as index_type.
  s = new_table->ReadPropertiesBlock(prefetch_buffer.get(),
                                     metaindex_iter.get(), largest_seqno);
  if (!s.ok()) {
    return s;
  }
  s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(),
                                   internal_comparator, &lookup_context);
  if (!s.ok()) {
    return s;
  }
  s = new_table->PrefetchIndexAndFilterBlocks(
      prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
      prefetch_all, table_options, level, &lookup_context);

  if (s.ok()) {
    // Update tail prefetch stats
    assert(prefetch_buffer.get() != nullptr);
    if (tail_prefetch_stats != nullptr) {
      assert(prefetch_buffer->min_offset_read() < file_size);
      tail_prefetch_stats->RecordEffectiveSize(
          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
    }

    *table_reader = std::move(new_table);
  }

  return s;
}

Status BlockBasedTable::PrefetchTail(
    RandomAccessFileReader* file, uint64_t file_size,
    TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
    const bool preload_all,
    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
  size_t tail_prefetch_size = 0;
  if (tail_prefetch_stats != nullptr) {
    // Multiple threads may get a 0 (no history) when running in parallel,
    // but it will get cleared after the first of them finishes.
    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
  }
  if (tail_prefetch_size == 0) {
    // Before reading the footer, readahead backwards to prefetch data. Do
    // more readahead if we're going to read index/filter.
    // TODO: This may incorrectly select small readahead in case partitioned
    // index/filter is enabled and top-level partition pinning is enabled.
    // That's because we need to issue readahead before we read the properties,
    // at which point we don't yet know the index type.
    tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
  }
  size_t prefetch_off;
  size_t prefetch_len;
  if (file_size < tail_prefetch_size) {
    prefetch_off = 0;
    prefetch_len = static_cast<size_t>(file_size);
  } else {
    prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
    prefetch_len = tail_prefetch_size;
  }
  TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
                           &tail_prefetch_size);
  Status s;
  // TODO should not have this special logic in the future.
  if (!file->use_direct_io()) {
    prefetch_buffer->reset(new FilePrefetchBuffer(
        nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
    s = file->Prefetch(prefetch_off, prefetch_len);
  } else {
    prefetch_buffer->reset(new FilePrefetchBuffer(
        nullptr, 0, 0, true /* enable */, true /* track_min_offset */));
    s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
  }
  return s;
}
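
// Worked example (illustrative numbers only): for a 10 MiB file with no
// prefetch history and prefetch_all == true, tail_prefetch_size is 512 KiB,
// so the tail read covers
//   prefetch_off = 10 * 1024 * 1024 - 512 * 1024 = 9961472
//   prefetch_len = 524288
// For a 100 KiB file the whole file is prefetched (prefetch_off = 0).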

Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
                      uint32_t expected) {
  Status s;
  uint32_t actual = 0;
  switch (type) {
    case kNoChecksum:
      break;
    case kCRC32c:
      expected = crc32c::Unmask(expected);
      actual = crc32c::Value(buf, len);
      break;
    case kxxHash:
      actual = XXH32(buf, static_cast<int>(len), 0);
      break;
    case kxxHash64:
      actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
                                     uint64_t{0xffffffff});
      break;
    default:
      s = Status::Corruption("unknown checksum type");
  }
  if (s.ok() && actual != expected) {
    s = Status::Corruption("properties block checksum mismatched");
  }
  return s;
}
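
// Illustrative sketch (not part of the build): a block of `block_size` bytes
// is followed by a 5-byte trailer (1 byte compression type + 4 byte
// checksum), and the checksum covers the block plus the type byte, so a
// caller verifies it roughly as:
//
//   uint32_t stored = DecodeFixed32(data + block_size + 1);
//   Status s = VerifyChecksum(footer.checksum(), data, block_size + 1, stored);
//
// This mirrors the call in TryReadPropertiesWithGlobalSeqno() below.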

Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
    FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
    TableProperties** table_properties) {
  assert(table_properties != nullptr);
  // If this is an external SST file ingested with write_global_seqno set to
  // true, then we expect a checksum mismatch because the checksum was written
  // by SstFileWriter, but the global seqno in the properties block may have
  // been changed during ingestion. In this case, we read the properties
  // block, copy it to a memory buffer, change the global seqno to its
  // original value, i.e. 0, and verify the checksum again.
  BlockHandle props_block_handle;
  CacheAllocationPtr tmp_buf;
  Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
                            rep_->footer, rep_->ioptions, table_properties,
                            false /* verify_checksum */, &props_block_handle,
                            &tmp_buf, false /* compression_type_missing */,
                            nullptr /* memory_allocator */);
  if (s.ok() && tmp_buf) {
    const auto seqno_pos_iter =
        (*table_properties)
            ->properties_offsets.find(
                ExternalSstFilePropertyNames::kGlobalSeqno);
    size_t block_size = static_cast<size_t>(props_block_handle.size());
    if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
      uint64_t global_seqno_offset = seqno_pos_iter->second;
      EncodeFixed64(
          tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
    }
    uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
    s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(),
                                          tmp_buf.get(), block_size + 1, value);
  }
  return s;
}
1358
ReadPropertiesBlock(FilePrefetchBuffer * prefetch_buffer,InternalIterator * meta_iter,const SequenceNumber largest_seqno)1359 Status BlockBasedTable::ReadPropertiesBlock(
1360 FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1361 const SequenceNumber largest_seqno) {
1362 bool found_properties_block = true;
1363 Status s;
1364 s = SeekToPropertiesBlock(meta_iter, &found_properties_block);
1365
1366 if (!s.ok()) {
1367 ROCKS_LOG_WARN(rep_->ioptions.info_log,
1368 "Error when seeking to properties block from file: %s",
1369 s.ToString().c_str());
1370 } else if (found_properties_block) {
1371 s = meta_iter->status();
1372 TableProperties* table_properties = nullptr;
1373 if (s.ok()) {
1374 s = ReadProperties(
1375 meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
1376 rep_->ioptions, &table_properties, true /* verify_checksum */,
1377 nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
1378 false /* compression_type_missing */, nullptr /* memory_allocator */);
1379 }
1380
1381 if (s.IsCorruption()) {
1382 s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
1383 &table_properties);
1384 }
1385 std::unique_ptr<TableProperties> props_guard;
1386 if (table_properties != nullptr) {
1387 props_guard.reset(table_properties);
1388 }
1389
1390 if (!s.ok()) {
1391 ROCKS_LOG_WARN(rep_->ioptions.info_log,
1392 "Encountered error while reading data from properties "
1393 "block %s",
1394 s.ToString().c_str());
1395 } else {
1396 assert(table_properties != nullptr);
1397 rep_->table_properties.reset(props_guard.release());
1398 rep_->blocks_maybe_compressed =
1399 rep_->table_properties->compression_name !=
1400 CompressionTypeToString(kNoCompression);
1401 rep_->blocks_definitely_zstd_compressed =
1402 (rep_->table_properties->compression_name ==
1403 CompressionTypeToString(kZSTD) ||
1404 rep_->table_properties->compression_name ==
1405 CompressionTypeToString(kZSTDNotFinalCompression));
1406 }
1407 } else {
1408 ROCKS_LOG_ERROR(rep_->ioptions.info_log,
1409 "Cannot find Properties block from file.");
1410 }
1411 #ifndef ROCKSDB_LITE
1412 if (rep_->table_properties) {
1413 ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
1414 &(rep_->table_prefix_extractor));
1415 }
1416 #endif // ROCKSDB_LITE
1417
1418 // Read the table properties, if provided.
1419 if (rep_->table_properties) {
1420 rep_->whole_key_filtering &=
1421 IsFeatureSupported(*(rep_->table_properties),
1422 BlockBasedTablePropertyNames::kWholeKeyFiltering,
1423 rep_->ioptions.info_log);
1424 rep_->prefix_filtering &=
1425 IsFeatureSupported(*(rep_->table_properties),
1426 BlockBasedTablePropertyNames::kPrefixFiltering,
1427 rep_->ioptions.info_log);
1428
1429 rep_->index_key_includes_seq =
1430 rep_->table_properties->index_key_is_user_key == 0;
1431 rep_->index_value_is_full =
1432 rep_->table_properties->index_value_is_delta_encoded == 0;
1433
1434     // Update index_type with the true type.
1435     // If the table properties don't contain the index type, we assume the
1436     // table is in a very old format and uses the kBinarySearch index type.
1437 auto& props = rep_->table_properties->user_collected_properties;
1438 auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
1439 if (pos != props.end()) {
1440 rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
1441 DecodeFixed32(pos->second.c_str()));
1442 }
1443
1444 rep_->index_has_first_key =
1445 rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
1446
1447 s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
1448 &(rep_->global_seqno));
1449 if (!s.ok()) {
1450 ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
1451 }
1452 }
1453 return s;
1454 }
1455
1456 Status BlockBasedTable::ReadRangeDelBlock(
1457 FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1458 const InternalKeyComparator& internal_comparator,
1459 BlockCacheLookupContext* lookup_context) {
1460 Status s;
1461 bool found_range_del_block;
1462 BlockHandle range_del_handle;
1463 s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
1464 if (!s.ok()) {
1465 ROCKS_LOG_WARN(
1466 rep_->ioptions.info_log,
1467 "Error when seeking to range delete tombstones block from file: %s",
1468 s.ToString().c_str());
1469 } else if (found_range_del_block && !range_del_handle.IsNull()) {
1470 ReadOptions read_options;
1471 std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
1472 read_options, range_del_handle,
1473 /*input_iter=*/nullptr, BlockType::kRangeDeletion,
1474 /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
1475 assert(iter != nullptr);
1476 s = iter->status();
1477 if (!s.ok()) {
1478 ROCKS_LOG_WARN(
1479 rep_->ioptions.info_log,
1480 "Encountered error while reading data from range del block %s",
1481 s.ToString().c_str());
1482 } else {
1483 rep_->fragmented_range_dels =
1484 std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
1485 internal_comparator);
1486 }
1487 }
1488 return s;
1489 }
1490
1491 Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
1492 FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1493 BlockBasedTable* new_table, bool prefetch_all,
1494 const BlockBasedTableOptions& table_options, const int level,
1495 BlockCacheLookupContext* lookup_context) {
1496 Status s;
1497
1498 // Find filter handle and filter type
1499 if (rep_->filter_policy) {
1500 for (auto filter_type :
1501 {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
1502 Rep::FilterType::kBlockFilter}) {
1503 std::string prefix;
1504 switch (filter_type) {
1505 case Rep::FilterType::kFullFilter:
1506 prefix = kFullFilterBlockPrefix;
1507 break;
1508 case Rep::FilterType::kPartitionedFilter:
1509 prefix = kPartitionedFilterBlockPrefix;
1510 break;
1511 case Rep::FilterType::kBlockFilter:
1512 prefix = kFilterBlockPrefix;
1513 break;
1514 default:
1515 assert(0);
1516 }
1517 std::string filter_block_key = prefix;
1518 filter_block_key.append(rep_->filter_policy->Name());
1519 if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
1520 .ok()) {
1521 rep_->filter_type = filter_type;
1522 break;
1523 }
1524 }
1525 }
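  // For example, with the built-in bloom filter policy, the meta-block key
  // probed above would look like "fullfilter.rocksdb.BuiltinBloomFilter"
  // (prefix plus filter policy name); the exact key depends on the policy.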
1526
1527 // Find compression dictionary handle
1528 bool found_compression_dict = false;
1529 s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
1530 &rep_->compression_dict_handle);
1531 if (!s.ok()) {
1532 return s;
1533 }
1534
1535 BlockBasedTableOptions::IndexType index_type = rep_->index_type;
1536
1537 const bool use_cache = table_options.cache_index_and_filter_blocks;
1538
1539 // pin both index and filters, down to all partitions
1540 const bool pin_all =
1541 rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
1542
1543 // prefetch the first level of index
1544 const bool prefetch_index =
1545 prefetch_all ||
1546 (table_options.pin_top_level_index_and_filter &&
1547 index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
1548 // pin the first level of index
1549 const bool pin_index =
1550 pin_all || (table_options.pin_top_level_index_and_filter &&
1551 index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
1552
1553 std::unique_ptr<IndexReader> index_reader;
1554 s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
1555 prefetch_index, pin_index, lookup_context,
1556 &index_reader);
1557 if (!s.ok()) {
1558 return s;
1559 }
1560
1561 rep_->index_reader = std::move(index_reader);
1562
1563   // The partitions of a partitioned index are always stored in the block
1564   // cache. Hence, they follow the pin and prefetch configuration regardless
1565   // of the value of cache_index_and_filter_blocks.
1566 if (prefetch_all) {
1567 rep_->index_reader->CacheDependencies(pin_all);
1568 }
1569
1570 // prefetch the first level of filter
1571 const bool prefetch_filter =
1572 prefetch_all ||
1573 (table_options.pin_top_level_index_and_filter &&
1574 rep_->filter_type == Rep::FilterType::kPartitionedFilter);
1575   // Partitioned filters cannot be enabled without a partitioned index
1576 assert(!prefetch_filter || prefetch_index);
1577 // pin the first level of filter
1578 const bool pin_filter =
1579 pin_all || (table_options.pin_top_level_index_and_filter &&
1580 rep_->filter_type == Rep::FilterType::kPartitionedFilter);
1581
1582 if (rep_->filter_policy) {
1583 auto filter = new_table->CreateFilterBlockReader(
1584 prefetch_buffer, use_cache, prefetch_filter, pin_filter,
1585 lookup_context);
1586 if (filter) {
1587       // Refer to the comment above about partitioned indexes always being cached
1588 if (prefetch_all) {
1589 filter->CacheDependencies(pin_all);
1590 }
1591
1592 rep_->filter = std::move(filter);
1593 }
1594 }
1595
1596 if (!rep_->compression_dict_handle.IsNull()) {
1597 std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
1598 s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
1599 prefetch_all, pin_all, lookup_context,
1600 &uncompression_dict_reader);
1601 if (!s.ok()) {
1602 return s;
1603 }
1604
1605 rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
1606 }
1607
1608 assert(s.ok());
1609 return s;
1610 }
1611
1612 void BlockBasedTable::SetupForCompaction() {
1613 switch (rep_->ioptions.access_hint_on_compaction_start) {
1614 case Options::NONE:
1615 break;
1616 case Options::NORMAL:
1617 rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
1618 break;
1619 case Options::SEQUENTIAL:
1620 rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
1621 break;
1622 case Options::WILLNEED:
1623 rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
1624 break;
1625 default:
1626 assert(false);
1627 }
1628 }
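// Illustrative usage (assuming the usual Options wiring): setting
//   options.access_hint_on_compaction_start = Options::SEQUENTIAL;
// causes files read during compaction to issue the kSequential hint above.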
1629
1630 std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
1631 const {
1632 return rep_->table_properties;
1633 }
1634
1635 size_t BlockBasedTable::ApproximateMemoryUsage() const {
1636 size_t usage = 0;
1637 if (rep_->filter) {
1638 usage += rep_->filter->ApproximateMemoryUsage();
1639 }
1640 if (rep_->index_reader) {
1641 usage += rep_->index_reader->ApproximateMemoryUsage();
1642 }
1643 if (rep_->uncompression_dict_reader) {
1644 usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
1645 }
1646 return usage;
1647 }
1648
1649 // Load the metaindex block from the file.
1650 // On success, return the loaded metaindex block
1651 // and its iterator.
1652 Status BlockBasedTable::ReadMetaIndexBlock(
1653 FilePrefetchBuffer* prefetch_buffer,
1654 std::unique_ptr<Block>* metaindex_block,
1655 std::unique_ptr<InternalIterator>* iter) {
1656 // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
1657 // it is an empty block.
1658 std::unique_ptr<Block> metaindex;
1659 Status s = ReadBlockFromFile(
1660 rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
1661 rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
1662 true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
1663 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
1664 kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
1665 GetMemoryAllocator(rep_->table_options), false /* for_compaction */,
1666 rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */);
1667
1668 if (!s.ok()) {
1669 ROCKS_LOG_ERROR(rep_->ioptions.info_log,
1670                     "Encountered error while reading data from the"
1671                     " metaindex block %s",
1672 s.ToString().c_str());
1673 return s;
1674 }
1675
1676 *metaindex_block = std::move(metaindex);
1677 // meta block uses bytewise comparator.
1678 iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(),
1679 BytewiseComparator()));
1680 return Status::OK();
1681 }
1682
1683 template <typename TBlocklike>
1684 Status BlockBasedTable::GetDataBlockFromCache(
1685 const Slice& block_cache_key, const Slice& compressed_block_cache_key,
1686 Cache* block_cache, Cache* block_cache_compressed,
1687 const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
1688 const UncompressionDict& uncompression_dict, BlockType block_type,
1689 GetContext* get_context) const {
1690 const size_t read_amp_bytes_per_bit =
1691 block_type == BlockType::kData
1692 ? rep_->table_options.read_amp_bytes_per_bit
1693 : 0;
1694 assert(block);
1695 assert(block->IsEmpty());
1696
1697 Status s;
1698 BlockContents* compressed_block = nullptr;
1699 Cache::Handle* block_cache_compressed_handle = nullptr;
1700
1701   // Look up the uncompressed block cache first
1702 if (block_cache != nullptr) {
1703 auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
1704 block_type, get_context);
1705 if (cache_handle != nullptr) {
1706 block->SetCachedValue(
1707 reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
1708 block_cache, cache_handle);
1709 return s;
1710 }
1711 }
1712
1713   // If not found, search the compressed block cache.
1714 assert(block->IsEmpty());
1715
1716 if (block_cache_compressed == nullptr) {
1717 return s;
1718 }
1719
1720 assert(!compressed_block_cache_key.empty());
1721 block_cache_compressed_handle =
1722 block_cache_compressed->Lookup(compressed_block_cache_key);
1723
1724 Statistics* statistics = rep_->ioptions.statistics;
1725
1726   // If the block is found in the compressed cache, uncompress it and insert
1727   // it into the uncompressed cache.
1728 if (block_cache_compressed_handle == nullptr) {
1729 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
1730 return s;
1731 }
1732
1733 // found compressed block
1734 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
1735 compressed_block = reinterpret_cast<BlockContents*>(
1736 block_cache_compressed->Value(block_cache_compressed_handle));
1737 CompressionType compression_type = compressed_block->get_compression_type();
1738 assert(compression_type != kNoCompression);
1739
1740 // Retrieve the uncompressed contents into a new buffer
1741 BlockContents contents;
1742 UncompressionContext context(compression_type);
1743 UncompressionInfo info(context, uncompression_dict, compression_type);
1744 s = UncompressBlockContents(
1745 info, compressed_block->data.data(), compressed_block->data.size(),
1746 &contents, rep_->table_options.format_version, rep_->ioptions,
1747 GetMemoryAllocator(rep_->table_options));
1748
1749 // Insert uncompressed block into block cache
1750 if (s.ok()) {
1751 std::unique_ptr<TBlocklike> block_holder(
1752 BlocklikeTraits<TBlocklike>::Create(
1753 std::move(contents), rep_->get_global_seqno(block_type),
1754 read_amp_bytes_per_bit, statistics,
1755 rep_->blocks_definitely_zstd_compressed,
1756 rep_->table_options.filter_policy.get())); // uncompressed block
1757
1758 if (block_cache != nullptr && block_holder->own_bytes() &&
1759 read_options.fill_cache) {
1760 size_t charge = block_holder->ApproximateMemoryUsage();
1761 Cache::Handle* cache_handle = nullptr;
1762 s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1763 &DeleteCachedEntry<TBlocklike>, &cache_handle);
1764 if (s.ok()) {
1765 assert(cache_handle != nullptr);
1766 block->SetCachedValue(block_holder.release(), block_cache,
1767 cache_handle);
1768
1769 UpdateCacheInsertionMetrics(block_type, get_context, charge);
1770 } else {
1771 RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1772 }
1773 } else {
1774 block->SetOwnedValue(block_holder.release());
1775 }
1776 }
1777
1778 // Release hold on compressed cache entry
1779 block_cache_compressed->Release(block_cache_compressed_handle);
1780 return s;
1781 }
1782
1783 template <typename TBlocklike>
1784 Status BlockBasedTable::PutDataBlockToCache(
1785 const Slice& block_cache_key, const Slice& compressed_block_cache_key,
1786 Cache* block_cache, Cache* block_cache_compressed,
1787 CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents,
1788 CompressionType raw_block_comp_type,
1789 const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
1790 MemoryAllocator* memory_allocator, BlockType block_type,
1791 GetContext* get_context) const {
1792 const ImmutableCFOptions& ioptions = rep_->ioptions;
1793 const uint32_t format_version = rep_->table_options.format_version;
1794 const size_t read_amp_bytes_per_bit =
1795 block_type == BlockType::kData
1796 ? rep_->table_options.read_amp_bytes_per_bit
1797 : 0;
1798 const Cache::Priority priority =
1799 rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
1800 (block_type == BlockType::kFilter ||
1801 block_type == BlockType::kCompressionDictionary ||
1802 block_type == BlockType::kIndex)
1803 ? Cache::Priority::HIGH
1804 : Cache::Priority::LOW;
1805 assert(cached_block);
1806 assert(cached_block->IsEmpty());
1807
1808 Status s;
1809 Statistics* statistics = ioptions.statistics;
1810
1811 std::unique_ptr<TBlocklike> block_holder;
1812 if (raw_block_comp_type != kNoCompression) {
1813 // Retrieve the uncompressed contents into a new buffer
1814 BlockContents uncompressed_block_contents;
1815 UncompressionContext context(raw_block_comp_type);
1816 UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
1817 s = UncompressBlockContents(info, raw_block_contents->data.data(),
1818 raw_block_contents->data.size(),
1819 &uncompressed_block_contents, format_version,
1820 ioptions, memory_allocator);
1821 if (!s.ok()) {
1822 return s;
1823 }
1824
1825 block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1826 std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit,
1827 statistics, rep_->blocks_definitely_zstd_compressed,
1828 rep_->table_options.filter_policy.get()));
1829 } else {
1830 block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1831 std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit,
1832 statistics, rep_->blocks_definitely_zstd_compressed,
1833 rep_->table_options.filter_policy.get()));
1834 }
1835
1836 // Insert compressed block into compressed block cache.
1837 // Release the hold on the compressed cache entry immediately.
1838 if (block_cache_compressed != nullptr &&
1839 raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
1840 raw_block_contents->own_bytes()) {
1841 #ifndef NDEBUG
1842 assert(raw_block_contents->is_raw_block);
1843 #endif // NDEBUG
1844
1845     // We cannot directly insert raw_block_contents because it could point to
1846     // an object on the stack.
1847 BlockContents* block_cont_for_comp_cache =
1848 new BlockContents(std::move(*raw_block_contents));
1849 s = block_cache_compressed->Insert(
1850 compressed_block_cache_key, block_cont_for_comp_cache,
1851 block_cont_for_comp_cache->ApproximateMemoryUsage(),
1852 &DeleteCachedEntry<BlockContents>);
1853 if (s.ok()) {
1854       // Prevent the code below from deleting this cached block.
1855 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
1856 } else {
1857 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
1858 delete block_cont_for_comp_cache;
1859 }
1860 }
1861
1862 // insert into uncompressed block cache
1863 if (block_cache != nullptr && block_holder->own_bytes()) {
1864 size_t charge = block_holder->ApproximateMemoryUsage();
1865 Cache::Handle* cache_handle = nullptr;
1866 s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1867 &DeleteCachedEntry<TBlocklike>, &cache_handle,
1868 priority);
1869 if (s.ok()) {
1870 assert(cache_handle != nullptr);
1871 cached_block->SetCachedValue(block_holder.release(), block_cache,
1872 cache_handle);
1873
1874 UpdateCacheInsertionMetrics(block_type, get_context, charge);
1875 } else {
1876 RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1877 }
1878 } else {
1879 cached_block->SetOwnedValue(block_holder.release());
1880 }
1881
1882 return s;
1883 }
1884
1885 std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
1886 FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
1887 bool pin, BlockCacheLookupContext* lookup_context) {
1888 auto& rep = rep_;
1889 auto filter_type = rep->filter_type;
1890 if (filter_type == Rep::FilterType::kNoFilter) {
1891 return std::unique_ptr<FilterBlockReader>();
1892 }
1893
1894 assert(rep->filter_policy);
1895
1896 switch (filter_type) {
1897 case Rep::FilterType::kPartitionedFilter:
1898 return PartitionedFilterBlockReader::Create(
1899 this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1900
1901 case Rep::FilterType::kBlockFilter:
1902 return BlockBasedFilterBlockReader::Create(
1903 this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1904
1905 case Rep::FilterType::kFullFilter:
1906 return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
1907 prefetch, pin, lookup_context);
1908
1909 default:
1910 // filter_type is either kNoFilter (exited the function at the first if),
1911 // or it must be covered in this switch block
1912 assert(false);
1913 return std::unique_ptr<FilterBlockReader>();
1914 }
1915 }
1916
1917 // disable_prefix_seek should be set to true when the prefix_extractor found
1918 // in the SST file differs from the one in mutable_cf_options and the index type is HashBasedIndex
1919 InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
1920 const ReadOptions& read_options, bool disable_prefix_seek,
1921 IndexBlockIter* input_iter, GetContext* get_context,
1922 BlockCacheLookupContext* lookup_context) const {
1923 assert(rep_ != nullptr);
1924 assert(rep_->index_reader != nullptr);
1925
1926 // We don't return pinned data from index blocks, so no need
1927 // to set `block_contents_pinned`.
1928 return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
1929 input_iter, get_context,
1930 lookup_context);
1931 }
1932
1933 // Convert an index iterator value (i.e., an encoded BlockHandle)
1934 // into an iterator over the contents of the corresponding block.
1935 // If input_iter is null, a new iterator is allocated.
1936 // If input_iter is non-null, it is updated and returned.
1937 template <typename TBlockIter>
1938 TBlockIter* BlockBasedTable::NewDataBlockIterator(
1939 const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
1940 BlockType block_type, GetContext* get_context,
1941 BlockCacheLookupContext* lookup_context, Status s,
1942 FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
1943 PERF_TIMER_GUARD(new_table_block_iter_nanos);
1944
1945 TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
1946 if (!s.ok()) {
1947 iter->Invalidate(s);
1948 return iter;
1949 }
1950
1951 CachableEntry<UncompressionDict> uncompression_dict;
1952 if (rep_->uncompression_dict_reader) {
1953 const bool no_io = (ro.read_tier == kBlockCacheTier);
1954 s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
1955 prefetch_buffer, no_io, get_context, lookup_context,
1956 &uncompression_dict);
1957 if (!s.ok()) {
1958 iter->Invalidate(s);
1959 return iter;
1960 }
1961 }
1962
1963 const UncompressionDict& dict = uncompression_dict.GetValue()
1964 ? *uncompression_dict.GetValue()
1965 : UncompressionDict::GetEmptyDict();
1966
1967 CachableEntry<Block> block;
1968 s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type,
1969 get_context, lookup_context, for_compaction,
1970 /* use_cache */ true);
1971
1972 if (!s.ok()) {
1973 assert(block.IsEmpty());
1974 iter->Invalidate(s);
1975 return iter;
1976 }
1977
1978 assert(block.GetValue() != nullptr);
1979
1980 // Block contents are pinned and it is still pinned after the iterator
1981 // is destroyed as long as cleanup functions are moved to another object,
1982 // when:
1983 // 1. block cache handle is set to be released in cleanup function, or
1984 // 2. it's pointing to immortal source. If own_bytes is true then we are
1985 // not reading data from the original source, whether immortal or not.
1986 // Otherwise, the block is pinned iff the source is immortal.
1987 const bool block_contents_pinned =
1988 block.IsCached() ||
1989 (!block.GetValue()->own_bytes() && rep_->immortal_table);
1990 iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
1991 block_contents_pinned);
1992
1993 if (!block.IsCached()) {
1994 if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
1995       // Insert a dummy record into the block cache to track the memory usage
1996 Cache* const block_cache = rep_->table_options.block_cache.get();
1997 Cache::Handle* cache_handle = nullptr;
1998       // There are two other types of cache keys: 1) SST cache key added in
1999       // `MaybeReadBlockAndLoadToCache` and 2) dummy cache key added in
2000       // `write_buffer_manager`. Use a longer prefix (41 bytes) to
2001       // differentiate it from the SST cache key (31 bytes), and use a
2002       // non-zero prefix to differentiate it from `write_buffer_manager`.
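      // Dummy key layout sketch (kMaxVarint64Length is 10, so the prefix is
      // 10 * 4 + 1 = 41 bytes):
      //   [cache_key_prefix, zero-padded to 41 bytes][varint64 id, <= 10 bytes]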
2003 const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
2004 char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
2005 // Prefix: use rep_->cache_key_prefix padded by 0s
2006 memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
2007 assert(rep_->cache_key_prefix_size != 0);
2008 assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
2009 memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
2010 char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
2011 next_cache_key_id_++);
2012 assert(end - cache_key <=
2013 static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
2014 const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
2015 s = block_cache->Insert(unique_key, nullptr,
2016 block.GetValue()->ApproximateMemoryUsage(),
2017 nullptr, &cache_handle);
2018
2019 if (s.ok()) {
2020 assert(cache_handle != nullptr);
2021 iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
2022 cache_handle);
2023 }
2024 }
2025 } else {
2026 iter->SetCacheHandle(block.GetCacheHandle());
2027 }
2028
2029 block.TransferTo(iter);
2030
2031 return iter;
2032 }
2033
2034 template <>
2035 DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
2036 const Rep* rep, Block* block, DataBlockIter* input_iter,
2037 bool block_contents_pinned) {
2038 return block->NewDataIterator(
2039 &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2040 input_iter, rep->ioptions.statistics, block_contents_pinned);
2041 }
2042
2043 template <>
2044 IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
2045 const Rep* rep, Block* block, IndexBlockIter* input_iter,
2046 bool block_contents_pinned) {
2047 return block->NewIndexIterator(
2048 &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2049 input_iter, rep->ioptions.statistics, /* total_order_seek */ true,
2050 rep->index_has_first_key, rep->index_key_includes_seq,
2051 rep->index_value_is_full, block_contents_pinned);
2052 }
2053
2054 // Convert an uncompressed data block (i.e., a CachableEntry<Block>)
2055 // into an iterator over the contents of the corresponding block.
2056 // If input_iter is null, a new iterator is allocated.
2057 // If input_iter is non-null, it is updated and returned.
2058 template <typename TBlockIter>
2059 TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
2060 CachableEntry<Block>& block,
2061 TBlockIter* input_iter,
2062 Status s) const {
2063 PERF_TIMER_GUARD(new_table_block_iter_nanos);
2064
2065 TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
2066 if (!s.ok()) {
2067 iter->Invalidate(s);
2068 return iter;
2069 }
2070
2071 assert(block.GetValue() != nullptr);
2072 // Block contents are pinned and it is still pinned after the iterator
2073 // is destroyed as long as cleanup functions are moved to another object,
2074 // when:
2075 // 1. block cache handle is set to be released in cleanup function, or
2076 // 2. it's pointing to immortal source. If own_bytes is true then we are
2077 // not reading data from the original source, whether immortal or not.
2078 // Otherwise, the block is pinned iff the source is immortal.
2079 const bool block_contents_pinned =
2080 block.IsCached() ||
2081 (!block.GetValue()->own_bytes() && rep_->immortal_table);
2082 iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
2083 block_contents_pinned);
2084
2085 if (!block.IsCached()) {
2086 if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
2087       // Insert a dummy record into the block cache to track the memory usage
2088 Cache* const block_cache = rep_->table_options.block_cache.get();
2089 Cache::Handle* cache_handle = nullptr;
2090       // There are two other types of cache keys: 1) SST cache key added in
2091       // `MaybeReadBlockAndLoadToCache` and 2) dummy cache key added in
2092       // `write_buffer_manager`. Use a longer prefix (41 bytes) to
2093       // differentiate it from the SST cache key (31 bytes), and use a
2094       // non-zero prefix to differentiate it from `write_buffer_manager`.
2095 const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
2096 char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
2097 // Prefix: use rep_->cache_key_prefix padded by 0s
2098 memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
2099 assert(rep_->cache_key_prefix_size != 0);
2100 assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
2101 memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
2102 char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
2103 next_cache_key_id_++);
2104 assert(end - cache_key <=
2105 static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
2106 const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
2107 s = block_cache->Insert(unique_key, nullptr,
2108 block.GetValue()->ApproximateMemoryUsage(),
2109 nullptr, &cache_handle);
2110 if (s.ok()) {
2111 assert(cache_handle != nullptr);
2112 iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
2113 cache_handle);
2114 }
2115 }
2116 } else {
2117 iter->SetCacheHandle(block.GetCacheHandle());
2118 }
2119
2120 block.TransferTo(iter);
2121 return iter;
2122 }
2123
2124 // If contents is nullptr, this function looks up the block caches for the
2125 // data block referenced by handle, and reads the block from disk if
2126 // necessary. If contents is non-null, it skips the cache lookup and disk
2127 // read, since the caller has already read it. In both cases, if
2128 // ro.fill_cache is true, it inserts the block into the block cache.
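// In this file, RetrieveBlock() calls this function with contents == nullptr
// (a cache lookup plus a possible disk read), while RetrieveMultipleBlocks()
// passes in the raw block contents it has already read via MultiRead().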
2129 template <typename TBlocklike>
2130 Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
2131 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2132 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2133 CachableEntry<TBlocklike>* block_entry, BlockType block_type,
2134 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2135 BlockContents* contents) const {
2136 assert(block_entry != nullptr);
2137 const bool no_io = (ro.read_tier == kBlockCacheTier);
2138 Cache* block_cache = rep_->table_options.block_cache.get();
2139   // No point in caching compressed blocks if the table never goes away
2140 Cache* block_cache_compressed =
2141 rep_->immortal_table ? nullptr
2142 : rep_->table_options.block_cache_compressed.get();
2143
2144 // First, try to get the block from the cache
2145 //
2146 // If either block cache is enabled, we'll try to read from it.
2147 Status s;
2148 char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2149 char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2150 Slice key /* key to the block cache */;
2151 Slice ckey /* key to the compressed block cache */;
2152 bool is_cache_hit = false;
2153 if (block_cache != nullptr || block_cache_compressed != nullptr) {
2154 // create key for block cache
2155 if (block_cache != nullptr) {
2156 key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
2157 handle, cache_key);
2158 }
2159
2160 if (block_cache_compressed != nullptr) {
2161 ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
2162 rep_->compressed_cache_key_prefix_size, handle,
2163 compressed_cache_key);
2164 }
2165
2166 if (!contents) {
2167 s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
2168 ro, block_entry, uncompression_dict, block_type,
2169 get_context);
2170 if (block_entry->GetValue()) {
2171 // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
2172 // compressed block cache.
2173 is_cache_hit = true;
2174 }
2175 }
2176
2177     // Couldn't find the block in the cache. If I/O is allowed, read it from
2178     // the file.
2179 if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
2180 Statistics* statistics = rep_->ioptions.statistics;
2181 const bool maybe_compressed =
2182 block_type != BlockType::kFilter &&
2183 block_type != BlockType::kCompressionDictionary &&
2184 rep_->blocks_maybe_compressed;
2185 const bool do_uncompress = maybe_compressed && !block_cache_compressed;
2186 CompressionType raw_block_comp_type;
2187 BlockContents raw_block_contents;
2188 if (!contents) {
2189 StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
2190 BlockFetcher block_fetcher(
2191 rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
2192 &raw_block_contents, rep_->ioptions, do_uncompress,
2193 maybe_compressed, block_type, uncompression_dict,
2194 rep_->persistent_cache_options,
2195 GetMemoryAllocator(rep_->table_options),
2196 GetMemoryAllocatorForCompressedBlock(rep_->table_options));
2197 s = block_fetcher.ReadBlockContents();
2198 raw_block_comp_type = block_fetcher.get_compression_type();
2199 contents = &raw_block_contents;
2200 } else {
2201 raw_block_comp_type = contents->get_compression_type();
2202 }
2203
2204 if (s.ok()) {
2205 SequenceNumber seq_no = rep_->get_global_seqno(block_type);
2206 // If filling cache is allowed and a cache is configured, try to put the
2207 // block to the cache.
2208 s = PutDataBlockToCache(
2209 key, ckey, block_cache, block_cache_compressed, block_entry,
2210 contents, raw_block_comp_type, uncompression_dict, seq_no,
2211 GetMemoryAllocator(rep_->table_options), block_type, get_context);
2212 }
2213 }
2214 }
2215
2216 // Fill lookup_context.
2217 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
2218 lookup_context) {
2219 size_t usage = 0;
2220 uint64_t nkeys = 0;
2221 if (block_entry->GetValue()) {
2222 // Approximate the number of keys in the block using restarts.
2223 nkeys =
2224 rep_->table_options.block_restart_interval *
2225 BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue());
2226 usage = block_entry->GetValue()->ApproximateMemoryUsage();
2227 }
2228 TraceType trace_block_type = TraceType::kTraceMax;
2229 switch (block_type) {
2230 case BlockType::kData:
2231 trace_block_type = TraceType::kBlockTraceDataBlock;
2232 break;
2233 case BlockType::kFilter:
2234 trace_block_type = TraceType::kBlockTraceFilterBlock;
2235 break;
2236 case BlockType::kCompressionDictionary:
2237 trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
2238 break;
2239 case BlockType::kRangeDeletion:
2240 trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
2241 break;
2242 case BlockType::kIndex:
2243 trace_block_type = TraceType::kBlockTraceIndexBlock;
2244 break;
2245 default:
2246 // This cannot happen.
2247 assert(false);
2248 break;
2249 }
2250 bool no_insert = no_io || !ro.fill_cache;
2251 if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
2252 trace_block_type, lookup_context->caller)) {
2253 // Defer logging the access to Get() and MultiGet() to trace additional
2254 // information, e.g., referenced_key_exist_in_block.
2255
2256 // Make a copy of the block key here since it will be logged later.
2257 lookup_context->FillLookupContext(
2258 is_cache_hit, no_insert, trace_block_type,
2259 /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
2260 } else {
2261       // Avoid making copies of block_key and cf_name when constructing the
2262       // access record.
2263 BlockCacheTraceRecord access_record(
2264 rep_->ioptions.env->NowMicros(),
2265 /*block_key=*/"", trace_block_type,
2266 /*block_size=*/usage, rep_->cf_id_for_tracing(),
2267 /*cf_name=*/"", rep_->level_for_tracing(),
2268 rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
2269 no_insert, lookup_context->get_id,
2270 lookup_context->get_from_user_specified_snapshot,
2271 /*referenced_key=*/"");
2272 block_cache_tracer_->WriteBlockAccess(access_record, key,
2273 rep_->cf_name_for_tracing(),
2274 lookup_context->referenced_key);
2275 }
2276 }
2277
2278 assert(s.ok() || block_entry->GetValue() == nullptr);
2279 return s;
2280 }
2281
2282 // This function reads multiple data blocks from disk using Env::MultiRead()
2283 // and optionally inserts them into the block cache. It uses the scratch
2284 // buffer provided by the caller, which is contiguous. If scratch is nullptr,
2285 // it allocates a separate buffer for each block. Typically, if the blocks
2286 // need to be uncompressed and there is no compressed block cache, callers
2287 // can allocate a temporary scratch buffer in order to minimize memory
2288 // allocations.
2289 // If options.fill_cache is true, it inserts the blocks into the cache. If
2290 // it's false and scratch is non-null and the blocks are uncompressed, it
2291 // copies the buffers to the heap. In any case, the CachableEntry<Block>
2292 // returned will own the data bytes.
2293 // If compression is enabled and there is no compressed block cache, adjacent
2294 // blocks are read out in one IO (combined read).
2295 // batch - A MultiGetRange with only those keys with unique data blocks not
2296 // found in cache
2297 // handles - A vector of block handles. Some of them may be null handles
2298 // scratch - An optional contiguous buffer to read compressed blocks into
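// Example (illustrative): with a non-null scratch buffer, blocks whose
// on-disk extents (block contents plus the 5-byte trailer, i.e. block_size())
// are back-to-back are merged into a single FSReadRequest, while a
// non-adjacent block starts a new request; req_idx_for_block and
// req_offset_for_block record which request each block belongs to and the
// block's offset within that request's buffer.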
2299 void BlockBasedTable::RetrieveMultipleBlocks(
2300 const ReadOptions& options, const MultiGetRange* batch,
2301 const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
2302 autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
2303 autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results,
2304 char* scratch, const UncompressionDict& uncompression_dict) const {
2305 RandomAccessFileReader* file = rep_->file.get();
2306 const Footer& footer = rep_->footer;
2307 const ImmutableCFOptions& ioptions = rep_->ioptions;
2308 SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData);
2309 size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
2310 MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
2311
2312 if (file->use_direct_io() || ioptions.allow_mmap_reads) {
2313 size_t idx_in_batch = 0;
2314 for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2315 ++mget_iter, ++idx_in_batch) {
2316 BlockCacheLookupContext lookup_data_block_context(
2317 TableReaderCaller::kUserMultiGet);
2318 const BlockHandle& handle = (*handles)[idx_in_batch];
2319 if (handle.IsNull()) {
2320 continue;
2321 }
2322
2323 (*statuses)[idx_in_batch] =
2324 RetrieveBlock(nullptr, options, handle, uncompression_dict,
2325 &(*results)[idx_in_batch], BlockType::kData,
2326 mget_iter->get_context, &lookup_data_block_context,
2327 /* for_compaction */ false, /* use_cache */ true);
2328 }
2329 return;
2330 }
2331
2332 autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
2333 size_t buf_offset = 0;
2334 size_t idx_in_batch = 0;
2335
2336 uint64_t prev_offset = 0;
2337 size_t prev_len = 0;
2338 autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
2339 autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
2340 for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2341 ++mget_iter, ++idx_in_batch) {
2342 const BlockHandle& handle = (*handles)[idx_in_batch];
2343 if (handle.IsNull()) {
2344 continue;
2345 }
2346
2347 size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
2348
2349     // If the current block is adjacent to the previous one, and at the same
2350     // time compression is enabled and there is no compressed cache, we
2351     // combine the two block reads into one.
2352 if (scratch != nullptr && prev_end == handle.offset()) {
2353 req_offset_for_block.emplace_back(prev_len);
2354 prev_len += block_size(handle);
2355 } else {
2356       // No compression, or the current block is not adjacent to the previous
2357       // one: Step 1, create a new request for the previous blocks.
2358 if (prev_len != 0) {
2359 FSReadRequest req;
2360 req.offset = prev_offset;
2361 req.len = prev_len;
2362 if (scratch == nullptr) {
2363 req.scratch = new char[req.len];
2364 } else {
2365 req.scratch = scratch + buf_offset;
2366 buf_offset += req.len;
2367 }
2368 req.status = IOStatus::OK();
2369 read_reqs.emplace_back(req);
2370 }
2371
2372       // Step 2, remember the previous block's info
2373 prev_offset = handle.offset();
2374 prev_len = block_size(handle);
2375 req_offset_for_block.emplace_back(0);
2376 }
2377 req_idx_for_block.emplace_back(read_reqs.size());
2378 }
2379 // Handle the last block and process the pending last request
2380 if (prev_len != 0) {
2381 FSReadRequest req;
2382 req.offset = prev_offset;
2383 req.len = prev_len;
2384 if (scratch == nullptr) {
2385 req.scratch = new char[req.len];
2386 } else {
2387 req.scratch = scratch + buf_offset;
2388 }
2389 req.status = IOStatus::OK();
2390 read_reqs.emplace_back(req);
2391 }
2392
2393 file->MultiRead(&read_reqs[0], read_reqs.size());
2394
2395 idx_in_batch = 0;
2396 size_t valid_batch_idx = 0;
2397 for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2398 ++mget_iter, ++idx_in_batch) {
2399 const BlockHandle& handle = (*handles)[idx_in_batch];
2400
2401 if (handle.IsNull()) {
2402 continue;
2403 }
2404
2405 assert(valid_batch_idx < req_idx_for_block.size());
2406 assert(valid_batch_idx < req_offset_for_block.size());
2407 assert(req_idx_for_block[valid_batch_idx] < read_reqs.size());
2408 size_t& req_idx = req_idx_for_block[valid_batch_idx];
2409 size_t& req_offset = req_offset_for_block[valid_batch_idx];
2410 valid_batch_idx++;
2411 FSReadRequest& req = read_reqs[req_idx];
2412 Status s = req.status;
2413 if (s.ok()) {
2414 if (req.result.size() != req.len) {
2415 s = Status::Corruption(
2416 "truncated block read from " + rep_->file->file_name() +
2417 " offset " + ToString(handle.offset()) + ", expected " +
2418 ToString(req.len) + " bytes, got " + ToString(req.result.size()));
2419 }
2420 }
2421
2422 BlockContents raw_block_contents;
2423 size_t cur_read_end = req_offset + block_size(handle);
2424 if (cur_read_end > req.result.size()) {
2425 s = Status::Corruption(
2426 "truncated block read from " + rep_->file->file_name() + " offset " +
2427 ToString(handle.offset()) + ", expected " + ToString(req.len) +
2428 " bytes, got " + ToString(req.result.size()));
2429 }
2430
2431 bool blocks_share_read_buffer = (req.result.size() != block_size(handle));
2432 if (s.ok()) {
2433 if (scratch == nullptr && !blocks_share_read_buffer) {
2434 // We allocated a buffer for this block. Give ownership of it to
2435 // BlockContents so it can free the memory
2436 assert(req.result.data() == req.scratch);
2437 std::unique_ptr<char[]> raw_block(req.scratch + req_offset);
2438 raw_block_contents = BlockContents(std::move(raw_block), handle.size());
2439 } else {
2440         // We used the scratch buffer, which is shared by the blocks.
2441         // raw_block_contents does not have ownership of the data.
2442 raw_block_contents =
2443 BlockContents(Slice(req.scratch + req_offset, handle.size()));
2444 }
2445
2446 #ifndef NDEBUG
2447 raw_block_contents.is_raw_block = true;
2448 #endif
2449 if (options.verify_checksums) {
2450 PERF_TIMER_GUARD(block_checksum_time);
2451 const char* data = req.result.data();
2452 uint32_t expected =
2453 DecodeFixed32(data + req_offset + handle.size() + 1);
2454           // Since the scratch buffer might be shared, the offset of the data
2455           // block in the buffer might not be 0. req.result.data() only points
2456           // to the beginning of each read request, so we need to add the
2457           // block's offset within that request. The checksum is stored in the
2458           // block trailer, at offset handle.size() + 1.
2459 s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(),
2460 req.result.data() + req_offset,
2461 handle.size() + 1, expected);
2462 TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
2463 }
2464 }
2465
2466 if (s.ok()) {
2467       // This handles a rare case: compression is enabled and there is no
2468       // compressed cache (so combined reads are used and scratch != nullptr),
2469       // yet some blocks are actually not compressed because their compression
2470       // space saving was smaller than the threshold. In this case, if the
2471       // block shares the scratch memory, we need to copy it to the heap so
2472       // that it can be added to the regular block cache.
2473 CompressionType compression_type =
2474 raw_block_contents.get_compression_type();
2475 if (scratch != nullptr && compression_type == kNoCompression) {
2476 Slice raw = Slice(req.scratch + req_offset, block_size(handle));
2477 raw_block_contents = BlockContents(
2478 CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw),
2479 handle.size());
2480 #ifndef NDEBUG
2481 raw_block_contents.is_raw_block = true;
2482 #endif
2483 }
2484 }
2485
2486 if (s.ok()) {
2487 if (options.fill_cache) {
2488 BlockCacheLookupContext lookup_data_block_context(
2489 TableReaderCaller::kUserMultiGet);
2490 CachableEntry<Block>* block_entry = &(*results)[idx_in_batch];
2491 // MaybeReadBlockAndLoadToCache will insert into the block caches if
2492 // necessary. Since we're passing the raw block contents, it will
2493 // avoid looking up the block cache
2494 s = MaybeReadBlockAndLoadToCache(
2495 nullptr, options, handle, uncompression_dict, block_entry,
2496 BlockType::kData, mget_iter->get_context,
2497 &lookup_data_block_context, &raw_block_contents);
2498
2499         // block_entry value could be null if no block cache is present, i.e.,
2500 // BlockBasedTableOptions::no_block_cache is true and no compressed
2501 // block cache is configured. In that case, fall
2502 // through and set up the block explicitly
2503 if (block_entry->GetValue() != nullptr) {
2504 continue;
2505 }
2506 }
2507
2508 CompressionType compression_type =
2509 raw_block_contents.get_compression_type();
2510 BlockContents contents;
2511 if (compression_type != kNoCompression) {
2512 UncompressionContext context(compression_type);
2513 UncompressionInfo info(context, uncompression_dict, compression_type);
2514 s = UncompressBlockContents(info, req.result.data() + req_offset,
2515 handle.size(), &contents, footer.version(),
2516 rep_->ioptions, memory_allocator);
2517 } else {
2518         // There are two cases here: 1) the caller uses the scratch buffer;
2519         // 2) we use the request buffer. If the scratch buffer is used, we
2520         // ensure that all raw blocks are copied to the heap as individual
2521         // blocks. If the scratch buffer is not used, there is also no
2522         // combined read, so the raw block can be used directly.
2523 contents = std::move(raw_block_contents);
2524 }
2525 if (s.ok()) {
2526 (*results)[idx_in_batch].SetOwnedValue(
2527 new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
2528 ioptions.statistics));
2529 }
2530 }
2531 (*statuses)[idx_in_batch] = s;
2532 }
2533 }
2534
2535 template <typename TBlocklike>
2536 Status BlockBasedTable::RetrieveBlock(
2537 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2538 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2539 CachableEntry<TBlocklike>* block_entry, BlockType block_type,
2540 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2541 bool for_compaction, bool use_cache) const {
2542 assert(block_entry);
2543 assert(block_entry->IsEmpty());
2544
2545 Status s;
2546 if (use_cache) {
2547 s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
2548 uncompression_dict, block_entry,
2549 block_type, get_context, lookup_context,
2550 /*contents=*/nullptr);
2551
2552 if (!s.ok()) {
2553 return s;
2554 }
2555
2556 if (block_entry->GetValue() != nullptr) {
2557 assert(s.ok());
2558 return s;
2559 }
2560 }
2561
2562 assert(block_entry->IsEmpty());
2563
2564 const bool no_io = ro.read_tier == kBlockCacheTier;
2565 if (no_io) {
2566 return Status::Incomplete("no blocking io");
2567 }
2568
2569 const bool maybe_compressed =
2570 block_type != BlockType::kFilter &&
2571 block_type != BlockType::kCompressionDictionary &&
2572 rep_->blocks_maybe_compressed;
2573 const bool do_uncompress = maybe_compressed;
2574 std::unique_ptr<TBlocklike> block;
2575
2576 {
2577 StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
2578 READ_BLOCK_GET_MICROS);
2579 s = ReadBlockFromFile(
2580 rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
2581 rep_->ioptions, do_uncompress, maybe_compressed, block_type,
2582 uncompression_dict, rep_->persistent_cache_options,
2583 rep_->get_global_seqno(block_type),
2584 block_type == BlockType::kData
2585 ? rep_->table_options.read_amp_bytes_per_bit
2586 : 0,
2587 GetMemoryAllocator(rep_->table_options), for_compaction,
2588 rep_->blocks_definitely_zstd_compressed,
2589 rep_->table_options.filter_policy.get());
2590 }
2591
2592 if (!s.ok()) {
2593 return s;
2594 }
2595
2596 block_entry->SetOwnedValue(block.release());
2597
2598 assert(s.ok());
2599 return s;
2600 }
2601
2602 // Explicitly instantiate templates for each of the "blocklike" types we use.
2603 // This makes it possible to keep the template definitions in the .cc file.
2604 template Status BlockBasedTable::RetrieveBlock<BlockContents>(
2605 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2606 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2607 CachableEntry<BlockContents>* block_entry, BlockType block_type,
2608 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2609 bool for_compaction, bool use_cache) const;
2610
2611 template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
2612 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2613 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2614 CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type,
2615 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2616 bool for_compaction, bool use_cache) const;
2617
2618 template Status BlockBasedTable::RetrieveBlock<Block>(
2619 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2620 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2621 CachableEntry<Block>* block_entry, BlockType block_type,
2622 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2623 bool for_compaction, bool use_cache) const;
2624
2625 template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
2626 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2627 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2628 CachableEntry<UncompressionDict>* block_entry, BlockType block_type,
2629 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2630 bool for_compaction, bool use_cache) const;
2631
2632 BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
2633 const BlockBasedTable* table,
2634 std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
2635 : table_(table), block_map_(block_map) {}
2636
2637 InternalIteratorBase<IndexValue>*
2638 BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
2639 const BlockHandle& handle) {
2640 // Return a block iterator on the index partition
2641 auto block = block_map_->find(handle.offset());
2642   // The partition might be absent from the map if the block cache did not
2643   // have space for it.
2644 if (block != block_map_->end()) {
2645 const Rep* rep = table_->get_rep();
2646 assert(rep);
2647
2648 Statistics* kNullStats = nullptr;
2649 // We don't return pinned data from index blocks, so no need
2650 // to set `block_contents_pinned`.
2651 return block->second.GetValue()->NewIndexIterator(
2652 &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2653 nullptr, kNullStats, true, rep->index_has_first_key,
2654 rep->index_key_includes_seq, rep->index_value_is_full);
2655 }
2656 // Create an empty iterator
2657 return new IndexBlockIter();
2658 }
2659
2660 // This will be broken if the user specifies an unusual implementation
2661 // of Options.comparator, or if the user specifies an unusual
2662 // definition of prefixes in BlockBasedTableOptions.filter_policy.
2663 // In particular, we require the following three properties:
2664 //
2665 // 1) key.starts_with(prefix(key))
2666 // 2) Compare(prefix(key), key) <= 0.
2667 // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
2668 //
2669 // If these properties hold, this method guarantees no I/O will be incurred.
2670 //
2671 // REQUIRES: this method shouldn't be called while the DB lock is held.
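// For example (illustrative only): a fixed-length prefix extractor with
// prefix(key) = key[0..4) satisfies all three properties under the default
// bytewise comparator: a key starts with its own prefix, the prefix compares
// <= the full key, and taking prefixes preserves bytewise key ordering.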
2672 bool BlockBasedTable::PrefixMayMatch(
2673 const Slice& internal_key, const ReadOptions& read_options,
2674 const SliceTransform* options_prefix_extractor,
2675 const bool need_upper_bound_check,
2676 BlockCacheLookupContext* lookup_context) const {
2677 if (!rep_->filter_policy) {
2678 return true;
2679 }
2680
2681 const SliceTransform* prefix_extractor;
2682
2683 if (rep_->table_prefix_extractor == nullptr) {
2684 if (need_upper_bound_check) {
2685 return true;
2686 }
2687 prefix_extractor = options_prefix_extractor;
2688 } else {
2689 prefix_extractor = rep_->table_prefix_extractor.get();
2690 }
2691 auto user_key = ExtractUserKey(internal_key);
2692 if (!prefix_extractor->InDomain(user_key)) {
2693 return true;
2694 }
2695
2696 bool may_match = true;
2697 Status s;
2698
2699   // First, try checking with the full filter
2700 FilterBlockReader* const filter = rep_->filter.get();
2701 bool filter_checked = true;
2702 if (filter != nullptr) {
2703 if (!filter->IsBlockBased()) {
2704 const Slice* const const_ikey_ptr = &internal_key;
2705 may_match = filter->RangeMayExist(
2706 read_options.iterate_upper_bound, user_key, prefix_extractor,
2707 rep_->internal_comparator.user_comparator(), const_ikey_ptr,
2708 &filter_checked, need_upper_bound_check, lookup_context);
2709 } else {
2710       // If the prefix_extractor changed for a block-based filter, skip the filter
2711 if (need_upper_bound_check) {
2712 return true;
2713 }
2714 auto prefix = prefix_extractor->Transform(user_key);
2715 InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
2716 auto internal_prefix = internal_key_prefix.Encode();
2717
2718       // To prevent any I/O in this method, we set `read_tier` to make sure
2719       // we only read the index or filter when they have already been loaded
2720       // into memory.
2721 ReadOptions no_io_read_options;
2722 no_io_read_options.read_tier = kBlockCacheTier;
2723
2724       // Then, try to find it within each block.
2725       // We already know prefix_extractor and prefix_extractor_name must match
2726       // because `CheckPrefixMayMatch` first checks `check_filter_ == true`.
2727 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
2728 no_io_read_options,
2729 /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
2730 /*get_context=*/nullptr, lookup_context));
2731 iiter->Seek(internal_prefix);
2732
2733 if (!iiter->Valid()) {
2734         // We're past the end of the file.
2735         // If the status is Incomplete, it means that we avoided I/O
2736         // and we're not really sure that we're past the end
2737         // of the file.
2738 may_match = iiter->status().IsIncomplete();
2739 } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key())
2740 : iiter->key())
2741 .starts_with(ExtractUserKey(internal_prefix))) {
2742 // we need to check for this subtle case because our only
2743 // guarantee is that "the key is a string >= last key in that data
2744 // block" according to the doc/table_format.txt spec.
2745 //
2746 // Suppose iiter->key() starts with the desired prefix; it is not
2747 // necessarily the case that the corresponding data block will
2748 // contain the prefix, since iiter->key() need not be in the
2749 // block. However, the next data block may contain the prefix, so
2750 // we return true to play it safe.
2751 may_match = true;
2752 } else if (filter->IsBlockBased()) {
2753 // iiter->key() does NOT start with the desired prefix. Because
2754 // Seek() finds the first key that is >= the seek target, this
2755 // means that iiter->key() > prefix. Thus, any data blocks coming
2756 // after the data block corresponding to iiter->key() cannot
2757 // possibly contain the key. Thus, the corresponding data block
2758       // is the only one that could potentially contain the prefix.
2759 BlockHandle handle = iiter->value().handle;
2760 may_match = filter->PrefixMayMatch(
2761 prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
2762 /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
2763 }
2764 }
2765 }
2766
2767 if (filter_checked) {
2768 Statistics* statistics = rep_->ioptions.statistics;
2769 RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
2770 if (!may_match) {
2771 RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
2772 }
2773 }
2774
2775 return may_match;
2776 }
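
// Illustrative sketch, not part of the table reader itself: the same no-I/O
// trick used above is available to callers through the public API. On a block
// cache miss the read fails with Status::Incomplete() rather than touching
// the file. `db` is an assumed, already-open DB*.
//
//   ReadOptions no_io_opts;
//   no_io_opts.read_tier = kBlockCacheTier;  // serve only from the block cache
//   std::string value;
//   Status s = db->Get(no_io_opts, "some_key", &value);
//   if (s.IsIncomplete()) {
//     // Needed block was not cached; retry with the default read_tier for
//     // real I/O.
//   }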
2777
2778 template <class TBlockIter, typename TValue>
2779 void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
2780 SeekImpl(&target);
2781 }
2782
2783 template <class TBlockIter, typename TValue>
2784 void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
2785 SeekImpl(nullptr);
2786 }
2787
2788 template <class TBlockIter, typename TValue>
2789 void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
2790 const Slice* target) {
2791 is_out_of_bound_ = false;
2792 is_at_first_key_from_index_ = false;
2793 if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
2794 ResetDataIter();
2795 return;
2796 }
2797
2798 bool need_seek_index = true;
2799 if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
2800 // Reseek.
2801 prev_block_offset_ = index_iter_->value().handle.offset();
2802
2803 if (target) {
2804 // We can avoid an index seek if:
2805 // 1. The new seek key is larger than the current key
2806 // 2. The new seek key is within the upper bound of the block
2807 // Since we don't necessarily know the internal key for either
2808 // the current key or the upper bound, we check user keys and
2809 // exclude the equality case. Considering internal keys can
2810 // improve for the boundary cases, but it would complicate the
2811 // code.
2812 if (user_comparator_.Compare(ExtractUserKey(*target),
2813 block_iter_.user_key()) > 0 &&
2814 user_comparator_.Compare(ExtractUserKey(*target),
2815 index_iter_->user_key()) < 0) {
2816 need_seek_index = false;
2817 }
2818 }
2819 }
2820
2821 if (need_seek_index) {
2822 if (target) {
2823 index_iter_->Seek(*target);
2824 } else {
2825 index_iter_->SeekToFirst();
2826 }
2827
2828 if (!index_iter_->Valid()) {
2829 ResetDataIter();
2830 return;
2831 }
2832 }
2833
2834 IndexValue v = index_iter_->value();
2835 const bool same_block = block_iter_points_to_real_block_ &&
2836 v.handle.offset() == prev_block_offset_;
2837
2838 // TODO(kolmike): Remove the != kBlockCacheTier condition.
2839 if (!v.first_internal_key.empty() && !same_block &&
2840 (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
2841 read_options_.read_tier != kBlockCacheTier) {
2842 // Index contains the first key of the block, and it's >= target.
2843 // We can defer reading the block.
2844 is_at_first_key_from_index_ = true;
2845 // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
2846 // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
2847 // as that will be done later when the data block is actually read.
2848 ResetDataIter();
2849 } else {
2850 // Need to use the data block.
2851 if (!same_block) {
2852 InitDataBlock();
2853 } else {
2854 // When the user does a reseek, the iterate_upper_bound might have
2855 // changed. CheckDataBlockWithinUpperBound() needs to be called
2856 // explicitly if the reseek ends up in the same data block.
2857 // If the reseek ends up in a different block, InitDataBlock() will do
2858 // the iterator upper bound check.
2859 CheckDataBlockWithinUpperBound();
2860 }
2861
2862 if (target) {
2863 block_iter_.Seek(*target);
2864 } else {
2865 block_iter_.SeekToFirst();
2866 }
2867 FindKeyForward();
2868 }
2869
2870 CheckOutOfBound();
2871
2872 if (target) {
2873 assert(!Valid() || ((block_type_ == BlockType::kIndex &&
2874 !table_->get_rep()->index_key_includes_seq)
2875 ? (user_comparator_.Compare(ExtractUserKey(*target),
2876 key()) <= 0)
2877 : (icomp_.Compare(*target, key()) <= 0)));
2878 }
2879 }
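
// Worked example (hypothetical keys) of the reseek optimization above:
// suppose the current data block holds user keys "k10".."k18" and the index
// key recording its upper bound is "k19". While positioned at "k12", a reseek
// to "k15" satisfies both tests (target > current key, target < index key),
// so need_seek_index stays false and only the data block is searched again.
// A reseek to "k25" fails the second test and goes back through the index
// iterator.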
2880
2881 template <class TBlockIter, typename TValue>
2882 void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
2883 const Slice& target) {
2884 is_out_of_bound_ = false;
2885 is_at_first_key_from_index_ = false;
2886   // For now, totally disable prefix seek in auto prefix mode because we don't
2887   // have the logic for it yet.
2888 if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
2889 ResetDataIter();
2890 return;
2891 }
2892
2893 SavePrevIndexValue();
2894
2895   // Call Seek() rather than SeekForPrev() in the index block, because the
2896   // target data block will likely contain the position for `target`, the
2897   // same as for Seek(), rather than the one before it.
2898 // For example, if we have three data blocks, each containing two keys:
2899 // [2, 4] [6, 8] [10, 12]
2900 // (the keys in the index block would be [4, 8, 12])
2901 // and the user calls SeekForPrev(7), we need to go to the second block,
2902 // just like if they call Seek(7).
2903   // The only case where the block is different is when they seek to a position
2904   // on the boundary. For example, if they SeekForPrev(5), we should go to the
2905   // first block, rather than the second. However, we don't have the information
2906   // to distinguish the two cases unless we read the second block. In that case,
2907   // we'll end up reading two blocks.
2908 index_iter_->Seek(target);
2909
2910 if (!index_iter_->Valid()) {
2911 auto seek_status = index_iter_->status();
2912 // Check for IO error
2913 if (!seek_status.IsNotFound() && !seek_status.ok()) {
2914 ResetDataIter();
2915 return;
2916 }
2917
2918 // With prefix index, Seek() returns NotFound if the prefix doesn't exist
2919 if (seek_status.IsNotFound()) {
2920 // Any key less than the target is fine for prefix seek
2921 ResetDataIter();
2922 return;
2923 } else {
2924 index_iter_->SeekToLast();
2925 }
2926 // Check for IO error
2927 if (!index_iter_->Valid()) {
2928 ResetDataIter();
2929 return;
2930 }
2931 }
2932
2933 InitDataBlock();
2934
2935 block_iter_.SeekForPrev(target);
2936
2937 FindKeyBackward();
2938 CheckDataBlockWithinUpperBound();
2939 assert(!block_iter_.Valid() ||
2940 icomp_.Compare(target, block_iter_.key()) >= 0);
2941 }
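
// Usage sketch via the public iterator API, with the hypothetical blocks
// [2, 4] [6, 8] [10, 12] from the comment above: SeekForPrev positions the
// iterator at the largest key <= target, so SeekForPrev(7) lands on 6 and
// SeekForPrev(5) lands on 4 (the boundary case that costs a second block
// read). `db` is an assumed DB*.
//
//   std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
//   it->SeekForPrev(Slice("7"));
//   // it->Valid() and it->key() is now the largest key <= "7"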
2942
2943 template <class TBlockIter, typename TValue>
2944 void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
2945 is_out_of_bound_ = false;
2946 is_at_first_key_from_index_ = false;
2947 SavePrevIndexValue();
2948 index_iter_->SeekToLast();
2949 if (!index_iter_->Valid()) {
2950 ResetDataIter();
2951 return;
2952 }
2953 InitDataBlock();
2954 block_iter_.SeekToLast();
2955 FindKeyBackward();
2956 CheckDataBlockWithinUpperBound();
2957 }
2958
2959 template <class TBlockIter, typename TValue>
2960 void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
2961 if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
2962 return;
2963 }
2964 assert(block_iter_points_to_real_block_);
2965 block_iter_.Next();
2966 FindKeyForward();
2967 CheckOutOfBound();
2968 }
2969
2970 template <class TBlockIter, typename TValue>
2971 bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
2972 IterateResult* result) {
2973 Next();
2974 bool is_valid = Valid();
2975 if (is_valid) {
2976 result->key = key();
2977 result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
2978 }
2979 return is_valid;
2980 }
2981
2982 template <class TBlockIter, typename TValue>
2983 void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
2984 if (is_at_first_key_from_index_) {
2985 is_at_first_key_from_index_ = false;
2986
2987 index_iter_->Prev();
2988 if (!index_iter_->Valid()) {
2989 return;
2990 }
2991
2992 InitDataBlock();
2993 block_iter_.SeekToLast();
2994 } else {
2995 assert(block_iter_points_to_real_block_);
2996 block_iter_.Prev();
2997 }
2998
2999 FindKeyBackward();
3000 }
3001
3002 template <class TBlockIter, typename TValue>
3003 void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
3004 BlockHandle data_block_handle = index_iter_->value().handle;
3005 if (!block_iter_points_to_real_block_ ||
3006 data_block_handle.offset() != prev_block_offset_ ||
3007 // if previous attempt of reading the block missed cache, try again
3008 block_iter_.status().IsIncomplete()) {
3009 if (block_iter_points_to_real_block_) {
3010 ResetDataIter();
3011 }
3012 auto* rep = table_->get_rep();
3013
3014 // Prefetch additional data for range scans (iterators). Enabled only for
3015 // user reads.
3016 // Implicit auto readahead:
3017 // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
3018 // Explicit user requested readahead:
3019 // Enabled from the very first IO when ReadOptions.readahead_size is set.
3020 if (lookup_context_.caller != TableReaderCaller::kCompaction) {
3021 if (read_options_.readahead_size == 0) {
3022 // Implicit auto readahead
3023 num_file_reads_++;
3024 if (num_file_reads_ >
3025 BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
3026 if (!rep->file->use_direct_io() &&
3027 (data_block_handle.offset() +
3028 static_cast<size_t>(block_size(data_block_handle)) >
3029 readahead_limit_)) {
3030 // Buffered I/O
3031           // Discarding the return status of Prefetch calls intentionally, as
3032           // we can fall back to reading from disk if Prefetch fails.
3033 rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
3034 readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
3035 readahead_size_);
3036 // Keep exponentially increasing readahead size until
3037 // kMaxAutoReadaheadSize.
3038 readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
3039 readahead_size_ * 2);
3040 } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
3041 // Direct I/O
3042 // Let FilePrefetchBuffer take care of the readahead.
3043 rep->CreateFilePrefetchBuffer(
3044 BlockBasedTable::kInitAutoReadaheadSize,
3045 BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
3046 }
3047 }
3048 } else if (!prefetch_buffer_) {
3049 // Explicit user requested readahead
3050 // The actual condition is:
3051 // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
3052 rep->CreateFilePrefetchBuffer(read_options_.readahead_size,
3053 read_options_.readahead_size,
3054 &prefetch_buffer_);
3055 }
3056 } else if (!prefetch_buffer_) {
3057 rep->CreateFilePrefetchBuffer(compaction_readahead_size_,
3058 compaction_readahead_size_,
3059 &prefetch_buffer_);
3060 }
3061
3062 Status s;
3063 table_->NewDataBlockIterator<TBlockIter>(
3064 read_options_, data_block_handle, &block_iter_, block_type_,
3065 /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(),
3066 /*for_compaction=*/lookup_context_.caller ==
3067 TableReaderCaller::kCompaction);
3068 block_iter_points_to_real_block_ = true;
3069 CheckDataBlockWithinUpperBound();
3070 }
3071 }
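
// Sketch of the implicit readahead schedule above, assuming the buffered-I/O
// path is hit repeatedly: the window starts at kInitAutoReadaheadSize and
// doubles after each prefetch until it saturates at kMaxAutoReadaheadSize
// (256 KB, per the constant defined near the top of this file).
//
//   size_t ra = BlockBasedTable::kInitAutoReadaheadSize;
//   while (ra < BlockBasedTable::kMaxAutoReadaheadSize) {
//     ra = std::min(BlockBasedTable::kMaxAutoReadaheadSize, ra * 2);
//   }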
3072
3073 template <class TBlockIter, typename TValue>
3074 bool BlockBasedTableIterator<TBlockIter, TValue>::MaterializeCurrentBlock() {
3075 assert(is_at_first_key_from_index_);
3076 assert(!block_iter_points_to_real_block_);
3077 assert(index_iter_->Valid());
3078
3079 is_at_first_key_from_index_ = false;
3080 InitDataBlock();
3081 assert(block_iter_points_to_real_block_);
3082 block_iter_.SeekToFirst();
3083
3084 if (!block_iter_.Valid() ||
3085 icomp_.Compare(block_iter_.key(),
3086 index_iter_->value().first_internal_key) != 0) {
3087 // Uh oh.
3088 block_iter_.Invalidate(Status::Corruption(
3089 "first key in index doesn't match first key in block"));
3090 return false;
3091 }
3092
3093 return true;
3094 }
3095
3096 template <class TBlockIter, typename TValue>
3097 void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
3098 // This method's code is kept short to make it likely to be inlined.
3099
3100 assert(!is_out_of_bound_);
3101 assert(block_iter_points_to_real_block_);
3102
3103 if (!block_iter_.Valid()) {
3104 // This is the only call site of FindBlockForward(), but it's extracted into
3105 // a separate method to keep FindKeyForward() short and likely to be
3106 // inlined. When transitioning to a different block, we call
3107 // FindBlockForward(), which is much longer and is probably not inlined.
3108 FindBlockForward();
3109 } else {
3110 // This is the fast path that avoids a function call.
3111 }
3112 }
3113
3114 template <class TBlockIter, typename TValue>
3115 void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
3116   // TODO: the while loop is inherited from two_level_iterator. We don't know
3117   // whether a block can be empty, so we can't yet replace the loop with an "if".
3118 do {
3119 if (!block_iter_.status().ok()) {
3120 return;
3121 }
3122     // Whether the next data block is out of the upper bound, if there is one.
3123 const bool next_block_is_out_of_bound =
3124 read_options_.iterate_upper_bound != nullptr &&
3125 block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
3126 assert(!next_block_is_out_of_bound ||
3127 user_comparator_.Compare(*read_options_.iterate_upper_bound,
3128 index_iter_->user_key()) <= 0);
3129 ResetDataIter();
3130 index_iter_->Next();
3131 if (next_block_is_out_of_bound) {
3132 // The next block is out of bound. No need to read it.
3133 TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
3134 // We need to make sure this is not the last data block before setting
3135 // is_out_of_bound_, since the index key for the last data block can be
3136 // larger than smallest key of the next file on the same level.
3137 if (index_iter_->Valid()) {
3138 is_out_of_bound_ = true;
3139 }
3140 return;
3141 }
3142
3143 if (!index_iter_->Valid()) {
3144 return;
3145 }
3146
3147 IndexValue v = index_iter_->value();
3148
3149 // TODO(kolmike): Remove the != kBlockCacheTier condition.
3150 if (!v.first_internal_key.empty() &&
3151 read_options_.read_tier != kBlockCacheTier) {
3152 // Index contains the first key of the block. Defer reading the block.
3153 is_at_first_key_from_index_ = true;
3154 return;
3155 }
3156
3157 InitDataBlock();
3158 block_iter_.SeekToFirst();
3159 } while (!block_iter_.Valid());
3160 }
3161
3162 template <class TBlockIter, typename TValue>
3163 void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
3164 while (!block_iter_.Valid()) {
3165 if (!block_iter_.status().ok()) {
3166 return;
3167 }
3168
3169 ResetDataIter();
3170 index_iter_->Prev();
3171
3172 if (index_iter_->Valid()) {
3173 InitDataBlock();
3174 block_iter_.SeekToLast();
3175 } else {
3176 return;
3177 }
3178 }
3179
3180   // We could have checked the lower bound here too, but we opt not to, for
3181   // code simplicity.
3182 }
3183
3184 template <class TBlockIter, typename TValue>
3185 void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
3186 if (read_options_.iterate_upper_bound != nullptr && Valid()) {
3187 is_out_of_bound_ = user_comparator_.Compare(
3188 *read_options_.iterate_upper_bound, user_key()) <= 0;
3189 }
3190 }
3191
3192 template <class TBlockIter, typename TValue>
3193 void BlockBasedTableIterator<TBlockIter,
3194                              TValue>::CheckDataBlockWithinUpperBound() {
3195 if (read_options_.iterate_upper_bound != nullptr &&
3196 block_iter_points_to_real_block_) {
3197 data_block_within_upper_bound_ =
3198 (user_comparator_.Compare(*read_options_.iterate_upper_bound,
3199 index_iter_->user_key()) > 0);
3200 }
3201 }
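
// Caller-visible effect of the two bound checks above, sketched through the
// public API: with iterate_upper_bound set, the iterator turns !Valid() once
// keys reach the (exclusive) bound, and whole data blocks past the bound are
// never read. `db` is an assumed DB*.
//
//   Slice upper("k20");  // hypothetical bound; must outlive the iterator
//   ReadOptions opts;
//   opts.iterate_upper_bound = &upper;
//   std::unique_ptr<Iterator> it(db->NewIterator(opts));
//   for (it->Seek(Slice("k10")); it->Valid(); it->Next()) {
//     // visits only keys in ["k10", "k20")
//   }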
3202
3203 InternalIterator* BlockBasedTable::NewIterator(
3204 const ReadOptions& read_options, const SliceTransform* prefix_extractor,
3205 Arena* arena, bool skip_filters, TableReaderCaller caller,
3206 size_t compaction_readahead_size) {
3207 BlockCacheLookupContext lookup_context{caller};
3208 bool need_upper_bound_check =
3209 read_options.auto_prefix_mode ||
3210 PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
3211 if (arena == nullptr) {
3212 return new BlockBasedTableIterator<DataBlockIter>(
3213 this, read_options, rep_->internal_comparator,
3214 NewIndexIterator(
3215 read_options,
3216 need_upper_bound_check &&
3217 rep_->index_type == BlockBasedTableOptions::kHashSearch,
3218 /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
3219 !skip_filters && !read_options.total_order_seek &&
3220 prefix_extractor != nullptr,
3221 need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
3222 compaction_readahead_size);
3223 } else {
3224 auto* mem =
3225 arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
3226 return new (mem) BlockBasedTableIterator<DataBlockIter>(
3227 this, read_options, rep_->internal_comparator,
3228 NewIndexIterator(
3229 read_options,
3230 need_upper_bound_check &&
3231 rep_->index_type == BlockBasedTableOptions::kHashSearch,
3232 /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
3233 !skip_filters && !read_options.total_order_seek &&
3234 prefix_extractor != nullptr,
3235 need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
3236 compaction_readahead_size);
3237 }
3238 }
3239
3240 FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
3241 const ReadOptions& read_options) {
3242 if (rep_->fragmented_range_dels == nullptr) {
3243 return nullptr;
3244 }
3245 SequenceNumber snapshot = kMaxSequenceNumber;
3246 if (read_options.snapshot != nullptr) {
3247 snapshot = read_options.snapshot->GetSequenceNumber();
3248 }
3249 return new FragmentedRangeTombstoneIterator(
3250 rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
3251 }
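
// Sketch of how the snapshot sequence number above is produced by a caller;
// range tombstones with a sequence number newer than the snapshot are
// invisible to the returned iterator. `db` is an assumed DB*.
//
//   const Snapshot* snap = db->GetSnapshot();
//   ReadOptions opts;
//   opts.snapshot = snap;  // pins snap->GetSequenceNumber()
//   // ... reads here ignore newer range deletions ...
//   db->ReleaseSnapshot(snap);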
3252
3253 bool BlockBasedTable::FullFilterKeyMayMatch(
3254 const ReadOptions& read_options, FilterBlockReader* filter,
3255 const Slice& internal_key, const bool no_io,
3256 const SliceTransform* prefix_extractor, GetContext* get_context,
3257 BlockCacheLookupContext* lookup_context) const {
3258 if (filter == nullptr || filter->IsBlockBased()) {
3259 return true;
3260 }
3261 Slice user_key = ExtractUserKey(internal_key);
3262 const Slice* const const_ikey_ptr = &internal_key;
3263 bool may_match = true;
3264 if (rep_->whole_key_filtering) {
3265 size_t ts_sz =
3266 rep_->internal_comparator.user_comparator()->timestamp_size();
3267 Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
3268 may_match =
3269 filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
3270 no_io, const_ikey_ptr, get_context, lookup_context);
3271 } else if (!read_options.total_order_seek && prefix_extractor &&
3272 rep_->table_properties->prefix_extractor_name.compare(
3273 prefix_extractor->Name()) == 0 &&
3274 prefix_extractor->InDomain(user_key) &&
3275 !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
3276 prefix_extractor, kNotValid, no_io,
3277 const_ikey_ptr, get_context,
3278 lookup_context)) {
3279 may_match = false;
3280 }
3281 if (may_match) {
3282 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
3283 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
3284 }
3285 return may_match;
3286 }
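
// Configuration sketch (assumed option values) showing how the two branches
// above are enabled: whole_key_filtering drives the KeyMayMatch path, while a
// prefix extractor matching the one recorded in the table properties drives
// the PrefixMayMatch path.
//
//   BlockBasedTableOptions bbto;
//   bbto.filter_policy.reset(
//       NewBloomFilterPolicy(10 /* bits_per_key */, false /* block-based */));
//   bbto.whole_key_filtering = true;
//   Options options;
//   options.table_factory.reset(NewBlockBasedTableFactory(bbto));
//   options.prefix_extractor.reset(NewFixedPrefixTransform(4));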
3287
3288 void BlockBasedTable::FullFilterKeysMayMatch(
3289 const ReadOptions& read_options, FilterBlockReader* filter,
3290 MultiGetRange* range, const bool no_io,
3291 const SliceTransform* prefix_extractor,
3292 BlockCacheLookupContext* lookup_context) const {
3293 if (filter == nullptr || filter->IsBlockBased()) {
3294 return;
3295 }
3296 if (rep_->whole_key_filtering) {
3297 filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
3298 lookup_context);
3299 } else if (!read_options.total_order_seek && prefix_extractor &&
3300 rep_->table_properties->prefix_extractor_name.compare(
3301 prefix_extractor->Name()) == 0) {
3302 filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
3303 lookup_context);
3304 }
3305 }
3306
3307 Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
3308 GetContext* get_context,
3309 const SliceTransform* prefix_extractor,
3310 bool skip_filters) {
3311 assert(key.size() >= 8); // key must be internal key
3312 assert(get_context != nullptr);
3313 Status s;
3314 const bool no_io = read_options.read_tier == kBlockCacheTier;
3315
3316 FilterBlockReader* const filter =
3317 !skip_filters ? rep_->filter.get() : nullptr;
3318
3319   // First check the full filter.
3320   // If the full filter is not useful, then go into each block.
3321 uint64_t tracing_get_id = get_context->get_tracing_get_id();
3322 BlockCacheLookupContext lookup_context{
3323 TableReaderCaller::kUserGet, tracing_get_id,
3324 /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
3325 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3326 // Trace the key since it contains both user key and sequence number.
3327 lookup_context.referenced_key = key.ToString();
3328 lookup_context.get_from_user_specified_snapshot =
3329 read_options.snapshot != nullptr;
3330 }
3331 const bool may_match =
3332 FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor,
3333 get_context, &lookup_context);
3334 if (!may_match) {
3335 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
3336 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
3337 } else {
3338 IndexBlockIter iiter_on_stack;
3339 // if prefix_extractor found in block differs from options, disable
3340 // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
3341 bool need_upper_bound_check = false;
3342 if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
3343 need_upper_bound_check = PrefixExtractorChanged(
3344 rep_->table_properties.get(), prefix_extractor);
3345 }
3346 auto iiter =
3347 NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
3348 get_context, &lookup_context);
3349 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3350 if (iiter != &iiter_on_stack) {
3351 iiter_unique_ptr.reset(iiter);
3352 }
3353
3354 size_t ts_sz =
3355 rep_->internal_comparator.user_comparator()->timestamp_size();
3356     bool matched = false;  // if such user key matched a key in SST
3357 bool done = false;
3358 for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
3359 IndexValue v = iiter->value();
3360
3361 bool not_exist_in_filter =
3362 filter != nullptr && filter->IsBlockBased() == true &&
3363 !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
3364 prefix_extractor, v.handle.offset(), no_io,
3365 /*const_ikey_ptr=*/nullptr, get_context,
3366 &lookup_context);
3367
3368 if (not_exist_in_filter) {
3369 // Not found
3370 // TODO: think about interaction with Merge. If a user key cannot
3371 // cross one data block, we should be fine.
3372 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
3373 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
3374 break;
3375 }
3376
3377 if (!v.first_internal_key.empty() && !skip_filters &&
3378 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3379 .Compare(ExtractUserKey(key),
3380 ExtractUserKey(v.first_internal_key)) < 0) {
3381 // The requested key falls between highest key in previous block and
3382 // lowest key in current block.
3383 break;
3384 }
3385
3386 BlockCacheLookupContext lookup_data_block_context{
3387 TableReaderCaller::kUserGet, tracing_get_id,
3388 /*get_from_user_specified_snapshot=*/read_options.snapshot !=
3389 nullptr};
3390 bool does_referenced_key_exist = false;
3391 DataBlockIter biter;
3392 uint64_t referenced_data_size = 0;
3393 NewDataBlockIterator<DataBlockIter>(
3394 read_options, v.handle, &biter, BlockType::kData, get_context,
3395 &lookup_data_block_context,
3396 /*s=*/Status(), /*prefetch_buffer*/ nullptr);
3397
3398 if (no_io && biter.status().IsIncomplete()) {
3399 // couldn't get block from block_cache
3400 // Update Saver.state to Found because we are only looking for
3401 // whether we can guarantee the key is not there when "no_io" is set
3402 get_context->MarkKeyMayExist();
3403 break;
3404 }
3405 if (!biter.status().ok()) {
3406 s = biter.status();
3407 break;
3408 }
3409
3410 bool may_exist = biter.SeekForGet(key);
3411 // If user-specified timestamp is supported, we cannot end the search
3412 // just because hash index lookup indicates the key+ts does not exist.
3413 if (!may_exist && ts_sz == 0) {
3414         // HashSeek cannot find the key in this block, and the iter is not at
3415         // the end of the block, i.e. the key cannot be in the following
3416         // blocks either. In this case, the seek_key cannot be found, so we
3417         // break from the top level for-loop.
3418 done = true;
3419 } else {
3420 // Call the *saver function on each entry/block until it returns false
3421 for (; biter.Valid(); biter.Next()) {
3422 ParsedInternalKey parsed_key;
3423 if (!ParseInternalKey(biter.key(), &parsed_key)) {
3424 s = Status::Corruption(Slice());
3425 }
3426
3427 if (!get_context->SaveValue(
3428 parsed_key, biter.value(), &matched,
3429 biter.IsValuePinned() ? &biter : nullptr)) {
3430 if (get_context->State() == GetContext::GetState::kFound) {
3431 does_referenced_key_exist = true;
3432 referenced_data_size = biter.key().size() + biter.value().size();
3433 }
3434 done = true;
3435 break;
3436 }
3437 }
3438 s = biter.status();
3439 }
3440 // Write the block cache access record.
3441 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3442 // Avoid making copy of block_key, cf_name, and referenced_key when
3443 // constructing the access record.
3444 Slice referenced_key;
3445 if (does_referenced_key_exist) {
3446 referenced_key = biter.key();
3447 } else {
3448 referenced_key = key;
3449 }
3450 BlockCacheTraceRecord access_record(
3451 rep_->ioptions.env->NowMicros(),
3452 /*block_key=*/"", lookup_data_block_context.block_type,
3453 lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
3454 /*cf_name=*/"", rep_->level_for_tracing(),
3455 rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
3456 lookup_data_block_context.is_cache_hit,
3457 lookup_data_block_context.no_insert,
3458 lookup_data_block_context.get_id,
3459 lookup_data_block_context.get_from_user_specified_snapshot,
3460 /*referenced_key=*/"", referenced_data_size,
3461 lookup_data_block_context.num_keys_in_block,
3462 does_referenced_key_exist);
3463 block_cache_tracer_->WriteBlockAccess(
3464 access_record, lookup_data_block_context.block_key,
3465 rep_->cf_name_for_tracing(), referenced_key);
3466 }
3467
3468 if (done) {
3469 // Avoid the extra Next which is expensive in two-level indexes
3470 break;
3471 }
3472 }
3473 if (matched && filter != nullptr && !filter->IsBlockBased()) {
3474 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
3475 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
3476 rep_->level);
3477 }
3478 if (s.ok() && !iiter->status().IsNotFound()) {
3479 s = iiter->status();
3480 }
3481 }
3482
3483 return s;
3484 }
3485
3486 using MultiGetRange = MultiGetContext::Range;
3487 void BlockBasedTable::MultiGet(const ReadOptions& read_options,
3488 const MultiGetRange* mget_range,
3489 const SliceTransform* prefix_extractor,
3490 bool skip_filters) {
3491 FilterBlockReader* const filter =
3492 !skip_filters ? rep_->filter.get() : nullptr;
3493 MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
3494 mget_range->end());
3495
3496   // First check the full filter.
3497   // If the full filter is not useful, then go into each block.
3498 const bool no_io = read_options.read_tier == kBlockCacheTier;
3499 uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
3500 if (!sst_file_range.empty() && sst_file_range.begin()->get_context) {
3501 tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
3502 }
3503 BlockCacheLookupContext lookup_context{
3504 TableReaderCaller::kUserMultiGet, tracing_mget_id,
3505 /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
3506 FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
3507 prefix_extractor, &lookup_context);
3508
3509 if (skip_filters || !sst_file_range.empty()) {
3510 IndexBlockIter iiter_on_stack;
3511 // if prefix_extractor found in block differs from options, disable
3512 // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
3513 bool need_upper_bound_check = false;
3514 if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
3515 need_upper_bound_check = PrefixExtractorChanged(
3516 rep_->table_properties.get(), prefix_extractor);
3517 }
3518 auto iiter =
3519 NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
3520 sst_file_range.begin()->get_context, &lookup_context);
3521 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3522 if (iiter != &iiter_on_stack) {
3523 iiter_unique_ptr.reset(iiter);
3524 }
3525
3526 uint64_t offset = std::numeric_limits<uint64_t>::max();
3527 autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
3528 autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results;
3529 autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
3530 char stack_buf[kMultiGetReadStackBufSize];
3531 std::unique_ptr<char[]> block_buf;
3532 {
3533 MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
3534 sst_file_range.end());
3535
3536 CachableEntry<UncompressionDict> uncompression_dict;
3537 Status uncompression_dict_status;
3538 if (rep_->uncompression_dict_reader) {
3539 uncompression_dict_status =
3540 rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
3541 nullptr /* prefetch_buffer */, no_io,
3542 sst_file_range.begin()->get_context, &lookup_context,
3543 &uncompression_dict);
3544 }
3545
3546 const UncompressionDict& dict = uncompression_dict.GetValue()
3547 ? *uncompression_dict.GetValue()
3548 : UncompressionDict::GetEmptyDict();
3549
3550 size_t total_len = 0;
3551 ReadOptions ro = read_options;
3552 ro.read_tier = kBlockCacheTier;
3553
3554 for (auto miter = data_block_range.begin();
3555 miter != data_block_range.end(); ++miter) {
3556 const Slice& key = miter->ikey;
3557 iiter->Seek(miter->ikey);
3558
3559 IndexValue v;
3560 if (iiter->Valid()) {
3561 v = iiter->value();
3562 }
3563 if (!iiter->Valid() ||
3564 (!v.first_internal_key.empty() && !skip_filters &&
3565 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3566 .Compare(ExtractUserKey(key),
3567 ExtractUserKey(v.first_internal_key)) < 0)) {
3568 // The requested key falls between highest key in previous block and
3569 // lowest key in current block.
3570 *(miter->s) = iiter->status();
3571 data_block_range.SkipKey(miter);
3572 sst_file_range.SkipKey(miter);
3573 continue;
3574 }
3575
3576 if (!uncompression_dict_status.ok()) {
3577 *(miter->s) = uncompression_dict_status;
3578 data_block_range.SkipKey(miter);
3579 sst_file_range.SkipKey(miter);
3580 continue;
3581 }
3582
3583 statuses.emplace_back();
3584 results.emplace_back();
3585 if (v.handle.offset() == offset) {
3586 // We're going to reuse the block for this key later on. No need to
3587 // look it up now. Place a null handle
3588 block_handles.emplace_back(BlockHandle::NullBlockHandle());
3589 continue;
3590 }
3591 // Lookup the cache for the given data block referenced by an index
3592       // iterator value (i.e. BlockHandle). If it exists in the cache,
3593 // initialize block to the contents of the data block.
3594 offset = v.handle.offset();
3595 BlockHandle handle = v.handle;
3596 BlockCacheLookupContext lookup_data_block_context(
3597 TableReaderCaller::kUserMultiGet);
3598 Status s = RetrieveBlock(
3599 nullptr, ro, handle, dict, &(results.back()), BlockType::kData,
3600 miter->get_context, &lookup_data_block_context,
3601 /* for_compaction */ false, /* use_cache */ true);
3602 if (s.IsIncomplete()) {
3603 s = Status::OK();
3604 }
3605 if (s.ok() && !results.back().IsEmpty()) {
3606 // Found it in the cache. Add NULL handle to indicate there is
3607 // nothing to read from disk
3608 block_handles.emplace_back(BlockHandle::NullBlockHandle());
3609 } else {
3610 block_handles.emplace_back(handle);
3611 total_len += block_size(handle);
3612 }
3613 }
3614
3615 if (total_len) {
3616 char* scratch = nullptr;
3617 // If the blocks need to be uncompressed and we don't need the
3618 // compressed blocks, then we can use a contiguous block of
3619 // memory to read in all the blocks as it will be temporary
3620 // storage
3621 // 1. If blocks are compressed and compressed block cache is there,
3622 // alloc heap bufs
3623 // 2. If blocks are uncompressed, alloc heap bufs
3624 // 3. If blocks are compressed and no compressed block cache, use
3625 // stack buf
3626 if (rep_->table_options.block_cache_compressed == nullptr &&
3627 rep_->blocks_maybe_compressed) {
3628 if (total_len <= kMultiGetReadStackBufSize) {
3629 scratch = stack_buf;
3630 } else {
3631 scratch = new char[total_len];
3632 block_buf.reset(scratch);
3633 }
3634 }
3635 RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
3636 &statuses, &results, scratch, dict);
3637 }
3638 }
3639
3640 DataBlockIter first_biter;
3641 DataBlockIter next_biter;
3642 size_t idx_in_batch = 0;
3643 for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
3644 ++miter) {
3645 Status s;
3646 GetContext* get_context = miter->get_context;
3647 const Slice& key = miter->ikey;
3648 bool matched = false; // if such user key matched a key in SST
3649 bool done = false;
3650 bool first_block = true;
3651 do {
3652 DataBlockIter* biter = nullptr;
3653 bool reusing_block = true;
3654 uint64_t referenced_data_size = 0;
3655 bool does_referenced_key_exist = false;
3656 BlockCacheLookupContext lookup_data_block_context(
3657 TableReaderCaller::kUserMultiGet, tracing_mget_id,
3658 /*get_from_user_specified_snapshot=*/read_options.snapshot !=
3659 nullptr);
3660 if (first_block) {
3661 if (!block_handles[idx_in_batch].IsNull() ||
3662 !results[idx_in_batch].IsEmpty()) {
3663 first_biter.Invalidate(Status::OK());
3664 NewDataBlockIterator<DataBlockIter>(
3665 read_options, results[idx_in_batch], &first_biter,
3666 statuses[idx_in_batch]);
3667 reusing_block = false;
3668 }
3669 biter = &first_biter;
3670 idx_in_batch++;
3671 } else {
3672 IndexValue v = iiter->value();
3673 if (!v.first_internal_key.empty() && !skip_filters &&
3674 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3675 .Compare(ExtractUserKey(key),
3676 ExtractUserKey(v.first_internal_key)) < 0) {
3677 // The requested key falls between highest key in previous block and
3678 // lowest key in current block.
3679 break;
3680 }
3681
3682 next_biter.Invalidate(Status::OK());
3683 NewDataBlockIterator<DataBlockIter>(
3684 read_options, iiter->value().handle, &next_biter,
3685 BlockType::kData, get_context, &lookup_data_block_context,
3686 Status(), nullptr);
3687 biter = &next_biter;
3688 reusing_block = false;
3689 }
3690
3691 if (read_options.read_tier == kBlockCacheTier &&
3692 biter->status().IsIncomplete()) {
3693 // couldn't get block from block_cache
3694 // Update Saver.state to Found because we are only looking for
3695 // whether we can guarantee the key is not there when "no_io" is set
3696 get_context->MarkKeyMayExist();
3697 break;
3698 }
3699 if (!biter->status().ok()) {
3700 s = biter->status();
3701 break;
3702 }
3703
3704 bool may_exist = biter->SeekForGet(key);
3705 if (!may_exist) {
3706           // HashSeek cannot find the key in this block, and the iter is not
3707           // at the end of the block, i.e. the key cannot be in the following
3708           // blocks either. In this case, the seek_key cannot be found, so we
3709           // break from the top level for-loop.
3710 break;
3711 }
3712
3713 // Call the *saver function on each entry/block until it returns false
3714 for (; biter->Valid(); biter->Next()) {
3715 ParsedInternalKey parsed_key;
3716 Cleanable dummy;
3717 Cleanable* value_pinner = nullptr;
3718 if (!ParseInternalKey(biter->key(), &parsed_key)) {
3719 s = Status::Corruption(Slice());
3720 }
3721 if (biter->IsValuePinned()) {
3722 if (reusing_block) {
3723 Cache* block_cache = rep_->table_options.block_cache.get();
3724 assert(biter->cache_handle() != nullptr);
3725 block_cache->Ref(biter->cache_handle());
3726 dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
3727 biter->cache_handle());
3728 value_pinner = &dummy;
3729 } else {
3730 value_pinner = biter;
3731 }
3732 }
3733 if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
3734 value_pinner)) {
3735 if (get_context->State() == GetContext::GetState::kFound) {
3736 does_referenced_key_exist = true;
3737 referenced_data_size =
3738 biter->key().size() + biter->value().size();
3739 }
3740 done = true;
3741 break;
3742 }
3743 s = biter->status();
3744 }
3745 // Write the block cache access.
3746 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3747 // Avoid making copy of block_key, cf_name, and referenced_key when
3748 // constructing the access record.
3749 Slice referenced_key;
3750 if (does_referenced_key_exist) {
3751 referenced_key = biter->key();
3752 } else {
3753 referenced_key = key;
3754 }
3755 BlockCacheTraceRecord access_record(
3756 rep_->ioptions.env->NowMicros(),
3757 /*block_key=*/"", lookup_data_block_context.block_type,
3758 lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
3759 /*cf_name=*/"", rep_->level_for_tracing(),
3760 rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
3761 lookup_data_block_context.is_cache_hit,
3762 lookup_data_block_context.no_insert,
3763 lookup_data_block_context.get_id,
3764 lookup_data_block_context.get_from_user_specified_snapshot,
3765 /*referenced_key=*/"", referenced_data_size,
3766 lookup_data_block_context.num_keys_in_block,
3767 does_referenced_key_exist);
3768 block_cache_tracer_->WriteBlockAccess(
3769 access_record, lookup_data_block_context.block_key,
3770 rep_->cf_name_for_tracing(), referenced_key);
3771 }
3772 s = biter->status();
3773 if (done) {
3774 // Avoid the extra Next which is expensive in two-level indexes
3775 break;
3776 }
3777 if (first_block) {
3778 iiter->Seek(key);
3779 }
3780 first_block = false;
3781 iiter->Next();
3782 } while (iiter->Valid());
3783
3784 if (matched && filter != nullptr && !filter->IsBlockBased()) {
3785 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
3786 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
3787 rep_->level);
3788 }
3789 if (s.ok()) {
3790 s = iiter->status();
3791 }
3792 *(miter->s) = s;
3793 }
3794 }
3795 }
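
// Usage sketch through the public API: MultiGet batches the filter probe,
// index seeks, and block reads performed above, including the multi-block
// RetrieveMultipleBlocks() read. `db` (a DB*) and `cf` (a ColumnFamilyHandle*)
// are assumed.
//
//   std::vector<Slice> keys = {"k1", "k2", "k3"};
//   std::vector<PinnableSlice> values(keys.size());
//   std::vector<Status> statuses(keys.size());
//   db->MultiGet(ReadOptions(), cf, keys.size(), keys.data(), values.data(),
//                statuses.data());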
3796
3797 Status BlockBasedTable::Prefetch(const Slice* const begin,
3798 const Slice* const end) {
3799 auto& comparator = rep_->internal_comparator;
3800 UserComparatorWrapper user_comparator(comparator.user_comparator());
3801 // pre-condition
3802 if (begin && end && comparator.Compare(*begin, *end) > 0) {
3803 return Status::InvalidArgument(*begin, *end);
3804 }
3805 BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
3806 IndexBlockIter iiter_on_stack;
3807 auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
3808 &iiter_on_stack, /*get_context=*/nullptr,
3809 &lookup_context);
3810 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3811 if (iiter != &iiter_on_stack) {
3812 iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
3813 }
3814
3815 if (!iiter->status().ok()) {
3816 // error opening index iterator
3817 return iiter->status();
3818 }
3819
3820   // indicates if we are on the last page that needs to be pre-fetched
3821 bool prefetching_boundary_page = false;
3822
3823 for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
3824 iiter->Next()) {
3825 BlockHandle block_handle = iiter->value().handle;
3826 const bool is_user_key = !rep_->index_key_includes_seq;
3827 if (end &&
3828 ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
3829 (is_user_key &&
3830 user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
3831 if (prefetching_boundary_page) {
3832 break;
3833 }
3834
3835 // The index entry represents the last key in the data block.
3836       // We should load this page into memory as well, but no more after it.
3837 prefetching_boundary_page = true;
3838 }
3839
3840 // Load the block specified by the block_handle into the block cache
3841 DataBlockIter biter;
3842
3843 NewDataBlockIterator<DataBlockIter>(
3844 ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
3845 /*get_context=*/nullptr, &lookup_context, Status(),
3846 /*prefetch_buffer=*/nullptr);
3847
3848 if (!biter.status().ok()) {
3849 // there was an unexpected error while pre-fetching
3850 return biter.status();
3851 }
3852 }
3853
3854 return Status::OK();
3855 }
3856
3857 Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
3858 TableReaderCaller caller) {
3859 Status s;
3860 // Check Meta blocks
3861 std::unique_ptr<Block> metaindex;
3862 std::unique_ptr<InternalIterator> metaindex_iter;
3863 s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex,
3864 &metaindex_iter);
3865 if (s.ok()) {
3866 s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
3867 if (!s.ok()) {
3868 return s;
3869 }
3870 } else {
3871 return s;
3872 }
3873 // Check Data blocks
3874 IndexBlockIter iiter_on_stack;
3875 BlockCacheLookupContext context{caller};
3876 InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
3877 read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
3878 /*get_context=*/nullptr, &context);
3879 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3880 if (iiter != &iiter_on_stack) {
3881 iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
3882 }
3883 if (!iiter->status().ok()) {
3884 // error opening index iterator
3885 return iiter->status();
3886 }
3887 s = VerifyChecksumInBlocks(read_options, iiter);
3888 return s;
3889 }
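
// Usage sketch: this per-file verification is typically reached through
// DB::VerifyChecksum(), which walks every live SST file. `db` is an assumed
// DB*.
//
//   Status s = db->VerifyChecksum();
//   if (!s.ok()) {
//     // Some block failed its checksum; s describes the first failure.
//   }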
3890
3891 Status BlockBasedTable::VerifyChecksumInBlocks(
3892 const ReadOptions& read_options,
3893 InternalIteratorBase<IndexValue>* index_iter) {
3894 Status s;
3895   // We are scanning the whole file, so there is no need to exponentially
3896   // increase the buffer size.
3897 size_t readahead_size = (read_options.readahead_size != 0)
3898 ? read_options.readahead_size
3899 : kMaxAutoReadaheadSize;
3900 // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
3901 // needed there.
3902 FilePrefetchBuffer prefetch_buffer(
3903       rep_->file.get(), readahead_size /* readahead_size */,
3904 readahead_size /* max_readahead_size */,
3905 !rep_->ioptions.allow_mmap_reads /* enable */);
3906
3907 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
3908 s = index_iter->status();
3909 if (!s.ok()) {
3910 break;
3911 }
3912 BlockHandle handle = index_iter->value().handle;
3913 BlockContents contents;
3914 BlockFetcher block_fetcher(
3915 rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
3916 &contents, rep_->ioptions, false /* decompress */,
3917 false /*maybe_compressed*/, BlockType::kData,
3918 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3919 s = block_fetcher.ReadBlockContents();
3920 if (!s.ok()) {
3921 break;
3922 }
3923 }
3924 return s;
3925 }
3926
3927 BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
3928 const Slice& meta_block_name) {
3929 if (meta_block_name.starts_with(kFilterBlockPrefix) ||
3930 meta_block_name.starts_with(kFullFilterBlockPrefix) ||
3931 meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
3932 return BlockType::kFilter;
3933 }
3934
3935 if (meta_block_name == kPropertiesBlock) {
3936 return BlockType::kProperties;
3937 }
3938
3939 if (meta_block_name == kCompressionDictBlock) {
3940 return BlockType::kCompressionDictionary;
3941 }
3942
3943 if (meta_block_name == kRangeDelBlock) {
3944 return BlockType::kRangeDeletion;
3945 }
3946
3947 if (meta_block_name == kHashIndexPrefixesBlock) {
3948 return BlockType::kHashIndexPrefixes;
3949 }
3950
3951 if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
3952 return BlockType::kHashIndexMetadata;
3953 }
3954
3955 assert(false);
3956 return BlockType::kInvalid;
3957 }
3958
3959 Status BlockBasedTable::VerifyChecksumInMetaBlocks(
3960 InternalIteratorBase<Slice>* index_iter) {
3961 Status s;
3962 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
3963 s = index_iter->status();
3964 if (!s.ok()) {
3965 break;
3966 }
3967 BlockHandle handle;
3968 Slice input = index_iter->value();
3969 s = handle.DecodeFrom(&input);
3970 BlockContents contents;
3971 const Slice meta_block_name = index_iter->key();
3972 BlockFetcher block_fetcher(
3973 rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
3974 ReadOptions(), handle, &contents, rep_->ioptions,
3975 false /* decompress */, false /*maybe_compressed*/,
3976 GetBlockTypeForMetaBlockByName(meta_block_name),
3977 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3978 s = block_fetcher.ReadBlockContents();
3979 if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
3980 TableProperties* table_properties;
3981 s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
3982 index_iter->value(),
3983 &table_properties);
3984 delete table_properties;
3985 }
3986 if (!s.ok()) {
3987 break;
3988 }
3989 }
3990 return s;
3991 }
3992
3993 bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
3994 assert(rep_ != nullptr);
3995
3996 Cache* const cache = rep_->table_options.block_cache.get();
3997 if (cache == nullptr) {
3998 return false;
3999 }
4000
4001 char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
4002 Slice cache_key =
4003 GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
4004 cache_key_storage);
4005
4006 Cache::Handle* const cache_handle = cache->Lookup(cache_key);
4007 if (cache_handle == nullptr) {
4008 return false;
4009 }
4010
4011 cache->Release(cache_handle);
4012
4013 return true;
4014 }
4015
4016 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
4017 const Slice& key) {
4018 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
4019 options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
4020 /*get_context=*/nullptr, /*lookup_context=*/nullptr));
4021 iiter->Seek(key);
4022 assert(iiter->Valid());
4023
4024 return TEST_BlockInCache(iiter->value().handle);
4025 }
4026
4027 // REQUIRES: The following fields of rep_ should have already been populated:
4028 // 1. file
4029 // 2. index_handle,
4030 // 3. options
4031 // 4. internal_comparator
4032 // 5. index_type
4033 Status BlockBasedTable::CreateIndexReader(
4034 FilePrefetchBuffer* prefetch_buffer,
4035 InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
4036 bool pin, BlockCacheLookupContext* lookup_context,
4037 std::unique_ptr<IndexReader>* index_reader) {
4038   // kHashSearch requires a non-empty prefix_extractor, but we bypass checking
4039   // prefix_extractor here since we have no access to MutableCFOptions.
4040   // Instead, a need_upper_bound_check flag is used in NewIndexIterator:
4041   // if prefix_extractor does not match prefix_extractor_name from the table
4042   // properties, the hash index is turned off by setting total_order_seek to true.
4043
4044 switch (rep_->index_type) {
4045 case BlockBasedTableOptions::kTwoLevelIndexSearch: {
4046 return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
4047 prefetch, pin, lookup_context,
4048 index_reader);
4049 }
4050 case BlockBasedTableOptions::kBinarySearch:
4051 FALLTHROUGH_INTENDED;
4052 case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
4053 return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
4054 prefetch, pin, lookup_context,
4055 index_reader);
4056 }
4057 case BlockBasedTableOptions::kHashSearch: {
4058 std::unique_ptr<Block> metaindex_guard;
4059 std::unique_ptr<InternalIterator> metaindex_iter_guard;
4060 auto meta_index_iter = preloaded_meta_index_iter;
4061 bool should_fallback = false;
4062 if (rep_->internal_prefix_transform.get() == nullptr) {
4063 ROCKS_LOG_WARN(rep_->ioptions.info_log,
4064 "No prefix extractor passed in. Fall back to binary"
4065 " search index.");
4066 should_fallback = true;
4067 } else if (meta_index_iter == nullptr) {
4068 auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
4069 &metaindex_iter_guard);
4070 if (!s.ok()) {
4071 // we simply fall back to binary search in case there is any
4072 // problem with prefix hash index loading.
4073 ROCKS_LOG_WARN(rep_->ioptions.info_log,
4074 "Unable to read the metaindex block."
4075 " Fall back to binary search index.");
4076 should_fallback = true;
4077 }
4078 meta_index_iter = metaindex_iter_guard.get();
4079 }
4080
4081 if (should_fallback) {
4082 return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
4083 prefetch, pin, lookup_context,
4084 index_reader);
4085 } else {
4086 return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
4087 use_cache, prefetch, pin, lookup_context,
4088 index_reader);
4089 }
4090 }
4091 default: {
4092 std::string error_message =
4093 "Unrecognized index type: " + ToString(rep_->index_type);
4094 return Status::InvalidArgument(error_message.c_str());
4095 }
4096 }
4097 }
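
// Configuration sketch (assumed option values) selecting among the index
// readers constructed above:
//
//   BlockBasedTableOptions bbto;
//   bbto.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
//   // other handled values: kBinarySearch, kBinarySearchWithFirstKey, and
//   // kHashSearch (needs a prefix extractor; falls back to binary search)
//   Options options;
//   options.table_factory.reset(NewBlockBasedTableFactory(bbto));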
4098
4099 uint64_t BlockBasedTable::ApproximateOffsetOf(
4100 const InternalIteratorBase<IndexValue>& index_iter) const {
4101 uint64_t result = 0;
4102 if (index_iter.Valid()) {
4103 BlockHandle handle = index_iter.value().handle;
4104 result = handle.offset();
4105 } else {
4106 // The iterator is past the last key in the file. If table_properties is not
4107 // available, approximate the offset by returning the offset of the
4108 // metaindex block (which is right near the end of the file).
4109 if (rep_->table_properties) {
4110 result = rep_->table_properties->data_size;
4111 }
4112 // table_properties is not present in the table.
4113 if (result == 0) {
4114 result = rep_->footer.metaindex_handle().offset();
4115 }
4116 }
4117
4118 return result;
4119 }
4120
4121 uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
4122 TableReaderCaller caller) {
4123 BlockCacheLookupContext context(caller);
4124 IndexBlockIter iiter_on_stack;
4125 ReadOptions ro;
4126 ro.total_order_seek = true;
4127 auto index_iter =
4128 NewIndexIterator(ro, /*disable_prefix_seek=*/true,
4129 /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
4130 /*lookup_context=*/&context);
4131 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
4132 if (index_iter != &iiter_on_stack) {
4133 iiter_unique_ptr.reset(index_iter);
4134 }
4135
4136 index_iter->Seek(key);
4137 return ApproximateOffsetOf(*index_iter);
4138 }
4139
4140 uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
4141 TableReaderCaller caller) {
4142 assert(rep_->internal_comparator.Compare(start, end) <= 0);
4143
4144 BlockCacheLookupContext context(caller);
4145 IndexBlockIter iiter_on_stack;
4146 ReadOptions ro;
4147 ro.total_order_seek = true;
4148 auto index_iter =
4149 NewIndexIterator(ro, /*disable_prefix_seek=*/true,
4150 /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
4151 /*lookup_context=*/&context);
4152 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
4153 if (index_iter != &iiter_on_stack) {
4154 iiter_unique_ptr.reset(index_iter);
4155 }
4156
4157 index_iter->Seek(start);
4158 uint64_t start_offset = ApproximateOffsetOf(*index_iter);
4159 index_iter->Seek(end);
4160 uint64_t end_offset = ApproximateOffsetOf(*index_iter);
4161
4162 assert(end_offset >= start_offset);
4163 return end_offset - start_offset;
4164 }
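
// Usage sketch via the public API, which resolves to the two offset lookups
// above; the result is an estimate based on data block offsets, not an exact
// byte count. `db` is an assumed DB*.
//
//   Range r("a", "z");
//   uint64_t size = 0;
//   db->GetApproximateSizes(&r, 1, &size);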
4165
4166 bool BlockBasedTable::TEST_FilterBlockInCache() const {
4167 assert(rep_ != nullptr);
4168 return TEST_BlockInCache(rep_->filter_handle);
4169 }
4170
4171 bool BlockBasedTable::TEST_IndexBlockInCache() const {
4172 assert(rep_ != nullptr);
4173
4174 return TEST_BlockInCache(rep_->footer.index_handle());
4175 }
4176
4177 Status BlockBasedTable::GetKVPairsFromDataBlocks(
4178 std::vector<KVPairBlock>* kv_pair_blocks) {
4179 std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
4180 NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
4181 /*input_iter=*/nullptr, /*get_context=*/nullptr,
4182                      /*lookup_context=*/nullptr));
4183
4184 Status s = blockhandles_iter->status();
4185 if (!s.ok()) {
4186 // Cannot read Index Block
4187 return s;
4188 }
4189
4190 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
4191 blockhandles_iter->Next()) {
4192 s = blockhandles_iter->status();
4193
4194 if (!s.ok()) {
4195 break;
4196 }
4197
4198 std::unique_ptr<InternalIterator> datablock_iter;
4199 datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
4200 ReadOptions(), blockhandles_iter->value().handle,
4201 /*input_iter=*/nullptr, /*type=*/BlockType::kData,
4202 /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
4203 /*prefetch_buffer=*/nullptr));
4204 s = datablock_iter->status();
4205
4206 if (!s.ok()) {
4207 // Error reading the block - Skipped
4208 continue;
4209 }
4210
4211 KVPairBlock kv_pair_block;
4212 for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
4213 datablock_iter->Next()) {
4214 s = datablock_iter->status();
4215 if (!s.ok()) {
4216 // Error reading the block - Skipped
4217 break;
4218 }
4219 const Slice& key = datablock_iter->key();
4220 const Slice& value = datablock_iter->value();
4221 std::string key_copy = std::string(key.data(), key.size());
4222 std::string value_copy = std::string(value.data(), value.size());
4223
4224 kv_pair_block.push_back(
4225 std::make_pair(std::move(key_copy), std::move(value_copy)));
4226 }
4227 kv_pair_blocks->push_back(std::move(kv_pair_block));
4228 }
4229 return Status::OK();
4230 }
4231
4232 Status BlockBasedTable::DumpTable(WritableFile* out_file) {
4233 // Output Footer
4234 out_file->Append(
4235 "Footer Details:\n"
4236 "--------------------------------------\n"
4237 " ");
4238 out_file->Append(rep_->footer.ToString().c_str());
4239 out_file->Append("\n");
4240
  // Output MetaIndex
  out_file->Append(
      "Metaindex Details:\n"
      "--------------------------------------\n");
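  // Only the well-known metaindex entries (properties, compression
  // dictionary, filter, range deletions) are printed with their block
  // handles; any other entries are skipped.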
  std::unique_ptr<Block> metaindex;
  std::unique_ptr<InternalIterator> metaindex_iter;
  Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex,
                                &metaindex_iter);
  if (s.ok()) {
    for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
         metaindex_iter->Next()) {
      s = metaindex_iter->status();
      if (!s.ok()) {
        return s;
      }
      if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) {
        out_file->Append("  Properties block handle: ");
        out_file->Append(metaindex_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (metaindex_iter->key() ==
                 ROCKSDB_NAMESPACE::kCompressionDictBlock) {
        out_file->Append("  Compression dictionary block handle: ");
        out_file->Append(metaindex_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (strstr(metaindex_iter->key().ToString().c_str(),
                        "filter.rocksdb.") != nullptr) {
        out_file->Append("  Filter block handle: ");
        out_file->Append(metaindex_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) {
        out_file->Append("  Range deletion block handle: ");
        out_file->Append(metaindex_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      }
    }
    out_file->Append("\n");
  } else {
    return s;
  }

  // Output TableProperties
  const ROCKSDB_NAMESPACE::TableProperties* table_properties =
      rep_->table_properties.get();

  if (table_properties != nullptr) {
    out_file->Append(
        "Table Properties:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(table_properties->ToString("\n  ", ": ").c_str());
    out_file->Append("\n");
  }

  if (rep_->filter) {
    out_file->Append(
        "Filter Details:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(rep_->filter->ToString().c_str());
    out_file->Append("\n");
  }

  // Output Index block
  s = DumpIndexBlock(out_file);
  if (!s.ok()) {
    return s;
  }

  // Output compression dictionary
  if (rep_->uncompression_dict_reader) {
    CachableEntry<UncompressionDict> uncompression_dict;
    s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
        nullptr /* prefetch_buffer */, false /* no_io */,
        nullptr /* get_context */, nullptr /* lookup_context */,
        &uncompression_dict);
    if (!s.ok()) {
      return s;
    }

    assert(uncompression_dict.GetValue());

    const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
    out_file->Append(
        "Compression Dictionary:\n"
        "--------------------------------------\n");
    out_file->Append("  size (bytes): ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size()));
    out_file->Append("\n\n");
    out_file->Append("  HEX    ");
    out_file->Append(raw_dict.ToString(true).c_str());
    out_file->Append("\n\n");
  }

  // Output range deletions block
  auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
  if (range_del_iter != nullptr) {
    range_del_iter->SeekToFirst();
    if (range_del_iter->Valid()) {
      out_file->Append(
          "Range deletions:\n"
          "--------------------------------------\n"
          "  ");
      for (; range_del_iter->Valid(); range_del_iter->Next()) {
        DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
      }
      out_file->Append("\n");
    }
    delete range_del_iter;
  }
  // Output Data blocks
  s = DumpDataBlocks(out_file);

  return s;
}

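// Writes the index block to out_file: for each index entry, a hex dump of
// the (user) key followed by the data block handle it points to, then an
// ASCII rendering of the key.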
Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
  out_file->Append(
      "Index Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Cannot read Index Block\n\n");
    return s;
  }

  out_file->Append("  Block key hex dump: Data block handle\n");
  out_file->Append("  Block key ascii\n\n");
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }
    Slice key = blockhandles_iter->key();
    Slice user_key;
    InternalKey ikey;
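    // Index keys may include the internal key footer (sequence number and
    // type); if so, decode and strip it so only the user key is printed.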
    if (!rep_->index_key_includes_seq) {
      user_key = key;
    } else {
      ikey.DecodeFrom(key);
      user_key = ikey.user_key();
    }

    out_file->Append("  HEX    ");
    out_file->Append(user_key.ToString(true).c_str());
    out_file->Append(": ");
    out_file->Append(blockhandles_iter->value()
                         .ToString(true, rep_->index_has_first_key)
                         .c_str());
    out_file->Append("\n");

    std::string str_key = user_key.ToString();
    std::string res_key("");
    char cspace = ' ';
    for (size_t i = 0; i < str_key.size(); i++) {
      res_key.append(&str_key[i], 1);
      res_key.append(1, cspace);
    }
    out_file->Append("  ASCII  ");
    out_file->Append(res_key.c_str());
    out_file->Append("\n  ------\n");
  }
  out_file->Append("\n");
  return Status::OK();
}

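// Writes every data block to out_file (handle header plus all key/value
// pairs), followed by summary statistics over the data block sizes.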
Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Cannot read Index Block\n\n");
    return s;
  }

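  // Track the min/max/total of the raw data block sizes recorded in the
  // index entries' block handles.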
  uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
  uint64_t datablock_size_max = 0;
  uint64_t datablock_size_sum = 0;

  size_t block_id = 1;
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       block_id++, blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }

    BlockHandle bh = blockhandles_iter->value().handle;
    uint64_t datablock_size = bh.size();
    datablock_size_min = std::min(datablock_size_min, datablock_size);
    datablock_size_max = std::max(datablock_size_max, datablock_size);
    datablock_size_sum += datablock_size;

    out_file->Append("Data Block # ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id));
    out_file->Append(" @ ");
    out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str());
    out_file->Append("\n");
    out_file->Append("--------------------------------------\n");

    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value().handle,
        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = datablock_iter->status();

    if (!s.ok()) {
      out_file->Append("Error reading the block - Skipped\n\n");
      continue;
    }

    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        out_file->Append("Error reading the block - Skipped\n");
        break;
      }
      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
    }
    out_file->Append("\n");
  }

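  // Print aggregate size statistics if at least one data block was seen.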
  uint64_t num_datablocks = block_id - 1;
  if (num_datablocks) {
    double datablock_size_avg =
        static_cast<double>(datablock_size_sum) / num_datablocks;
    out_file->Append("Data Block Summary:\n");
    out_file->Append("--------------------------------------");
    out_file->Append("\n  # data blocks: ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks));
    out_file->Append("\n  min data block size: ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min));
    out_file->Append("\n  max data block size: ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max));
    out_file->Append("\n  avg data block size: ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg));
    out_file->Append("\n");
  }

  return Status::OK();
}

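// Writes a single key/value pair in hex and then in ASCII, with embedded NUL
// bytes rendered as "\0" in the ASCII form.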
void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
                                   WritableFile* out_file) {
  InternalKey ikey;
  ikey.DecodeFrom(key);

  out_file->Append("  HEX    ");
  out_file->Append(ikey.user_key().ToString(true).c_str());
  out_file->Append(": ");
  out_file->Append(value.ToString(true).c_str());
  out_file->Append("\n");

  std::string str_key = ikey.user_key().ToString();
  std::string str_value = value.ToString();
  std::string res_key(""), res_value("");
  char cspace = ' ';
  for (size_t i = 0; i < str_key.size(); i++) {
    if (str_key[i] == '\0') {
      res_key.append("\\0", 2);
    } else {
      res_key.append(&str_key[i], 1);
    }
    res_key.append(1, cspace);
  }
  for (size_t i = 0; i < str_value.size(); i++) {
    if (str_value[i] == '\0') {
      res_value.append("\\0", 2);
    } else {
      res_value.append(&str_value[i], 1);
    }
    res_value.append(1, cspace);
  }

  out_file->Append("  ASCII  ");
  out_file->Append(res_key.c_str());
  out_file->Append(": ");
  out_file->Append(res_value.c_str());
  out_file->Append("\n  ------\n");
}

}  // namespace ROCKSDB_NAMESPACE