// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/block_based_table_reader.h"

#include <algorithm>
#include <array>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"

#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"

#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"

#include "monitoring/perf_context_imp.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/xxhash.h"

namespace rocksdb {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

typedef BlockBasedTable::IndexReader IndexReader;

// Based on experiments, a 256 KB readahead size provides the best
// performance for auto readahead. Experiment data is in PR #3282.
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;

BlockBasedTable::~BlockBasedTable() {
  delete rep_;
}

std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);

template <typename TBlocklike>
class BlocklikeTraits;
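
// The BlocklikeTraits<T> specializations below adapt each block-like type
// (BlockContents, ParsedFullFilterBlock, Block, UncompressionDict) to a
// small common interface: Create() materializes the object from raw
// BlockContents, and GetNumRestarts() exposes the restart count where it is
// meaningful. Templated helpers such as ReadBlockFromFile<TBlocklike>()
// below can then handle all of these types uniformly. Illustrative sketch
// (the `contents`, `global_seqno`, etc. variables are placeholders, not
// real locals in this file):
//
//   BlockContents contents = /* bytes fetched from the file */;
//   std::unique_ptr<Block> block(BlocklikeTraits<Block>::Create(
//       std::move(contents), global_seqno, read_amp_bytes_per_bit,
//       statistics, /*using_zstd=*/false, /*filter_policy=*/nullptr));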

template <>
class BlocklikeTraits<BlockContents> {
 public:
  static BlockContents* Create(BlockContents&& contents,
                               SequenceNumber /* global_seqno */,
                               size_t /* read_amp_bytes_per_bit */,
                               Statistics* /* statistics */,
                               bool /* using_zstd */,
                               const FilterPolicy* /* filter_policy */) {
    return new BlockContents(std::move(contents));
  }

  static uint32_t GetNumRestarts(const BlockContents& /* contents */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<ParsedFullFilterBlock> {
 public:
  static ParsedFullFilterBlock* Create(BlockContents&& contents,
                                       SequenceNumber /* global_seqno */,
                                       size_t /* read_amp_bytes_per_bit */,
                                       Statistics* /* statistics */,
                                       bool /* using_zstd */,
                                       const FilterPolicy* filter_policy) {
    return new ParsedFullFilterBlock(filter_policy, std::move(contents));
  }

  static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<Block> {
 public:
  static Block* Create(BlockContents&& contents, SequenceNumber global_seqno,
                       size_t read_amp_bytes_per_bit, Statistics* statistics,
                       bool /* using_zstd */,
                       const FilterPolicy* /* filter_policy */) {
    return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
                     statistics);
  }

  static uint32_t GetNumRestarts(const Block& block) {
    return block.NumRestarts();
  }
};

template <>
class BlocklikeTraits<UncompressionDict> {
 public:
  static UncompressionDict* Create(BlockContents&& contents,
                                   SequenceNumber /* global_seqno */,
                                   size_t /* read_amp_bytes_per_bit */,
                                   Statistics* /* statistics */,
                                   bool using_zstd,
                                   const FilterPolicy* /* filter_policy */) {
    return new UncompressionDict(contents.data, std::move(contents.allocation),
                                 using_zstd);
  }

  static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) {
    return 0;
  }
};

namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
template <typename TBlocklike>
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed, BlockType block_type,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
    bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) {
  assert(result);

  BlockContents contents;
  BlockFetcher block_fetcher(
      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
      do_uncompress, maybe_compressed, block_type, uncompression_dict,
      cache_options, memory_allocator, nullptr, for_compaction);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    result->reset(BlocklikeTraits<TBlocklike>::Create(
        std::move(contents), global_seqno, read_amp_bytes_per_bit,
        ioptions.statistics, using_zstd, filter_policy));
  }

  return s;
}
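
// Illustrative call sketch for ReadBlockFromFile (the `rep` and `handle`
// variables are placeholders; actual call sites pass values from Rep and
// the index/footer):
//
//   std::unique_ptr<Block> block;
//   Status s = ReadBlockFromFile(
//       rep->file.get(), /*prefetch_buffer=*/nullptr, rep->footer,
//       ReadOptions(), handle, &block, rep->ioptions,
//       /*do_uncompress=*/true, /*maybe_compressed=*/true, BlockType::kData,
//       UncompressionDict::GetEmptyDict(), rep->persistent_cache_options,
//       rep->global_seqno, /*read_amp_bytes_per_bit=*/0,
//       GetMemoryAllocator(rep->table_options), /*for_compaction=*/false,
//       rep->blocks_definitely_zstd_compressed, rep->filter_policy);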

inline MemoryAllocator* GetMemoryAllocator(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache.get()
             ? table_options.block_cache->memory_allocator()
             : nullptr;
}

inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache_compressed.get()
             ? table_options.block_cache_compressed->memory_allocator()
             : nullptr;
}

// Delete the entry residing in the cache.
template <class Entry>
void DeleteCachedEntry(const Slice& /*key*/, void* value) {
  auto entry = reinterpret_cast<Entry*>(value);
  delete entry;
}

// Release the cached entry and decrement its ref count.
void ForceReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, true /* force_erase */);
}

// Release the cached entry and decrement its ref count.
// Do not force erase
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, false /* force_erase */);
}

// For a hash-based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator.
bool PrefixExtractorChanged(const TableProperties* table_properties,
                            const SliceTransform* prefix_extractor) {
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off the hash index if prefix_extractor is not set; if
  // prefix_extractor is set but prefix_extractor_block is not set, also
  // disable the hash index.
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
    return true;
  }

  // prefix_extractor and prefix_extractor_block are both non-empty
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
    return true;
  } else {
    return false;
  }
}
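
// Example (extractor names are illustrative): if the table was built with
// prefix_extractor_name "rocksdb.FixedPrefix.8" but the current column
// family uses a "rocksdb.FixedPrefix.4" extractor, the names differ, so
// PrefixExtractorChanged() returns true and index iteration falls back to
// total-order seek rather than trusting the stale hash index.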

CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
  CacheAllocationPtr heap_buf;
  heap_buf = AllocateBlock(buf.size(), allocator);
  memcpy(heap_buf.get(), buf.data(), buf.size());
  return heap_buf;
}

}  // namespace

// Encapsulates common functionality for the various index reader
// implementations. Provides access to the index block regardless of whether
// it is owned by the reader or stored in the cache, or whether it is pinned
// in the cache or not.
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
 public:
  IndexReaderCommon(const BlockBasedTable* t,
                    CachableEntry<Block>&& index_block)
      : table_(t), index_block_(std::move(index_block)) {
    assert(table_ != nullptr);
  }

 protected:
  static Status ReadIndexBlock(const BlockBasedTable* table,
                               FilePrefetchBuffer* prefetch_buffer,
                               const ReadOptions& read_options, bool use_cache,
                               GetContext* get_context,
                               BlockCacheLookupContext* lookup_context,
                               CachableEntry<Block>* index_block);

  const BlockBasedTable* table() const { return table_; }

  const InternalKeyComparator* internal_comparator() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    return &table_->get_rep()->internal_comparator;
  }

  bool index_has_first_key() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_has_first_key;
  }

  bool index_key_includes_seq() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_key_includes_seq;
  }

  bool index_value_is_full() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_value_is_full;
  }

  bool cache_index_blocks() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->table_options.cache_index_and_filter_blocks;
  }

  Status GetOrReadIndexBlock(bool no_io, GetContext* get_context,
                             BlockCacheLookupContext* lookup_context,
                             CachableEntry<Block>* index_block) const;

  size_t ApproximateIndexBlockMemoryUsage() const {
    assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
    return index_block_.GetOwnValue()
               ? index_block_.GetValue()->ApproximateMemoryUsage()
               : 0;
  }

 private:
  const BlockBasedTable* table_;
  CachableEntry<Block> index_block_;
};
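
// A note on CachableEntry<Block> semantics, as summarized from its usage in
// this file: the entry either owns its Block outright (GetOwnValue() ==
// true, e.g. when the block was read directly from the file), references a
// block-cache-resident Block via a cache handle, or wraps an unowned
// pointer set through SetUnownedValue(). Only owned blocks are charged to
// ApproximateIndexBlockMemoryUsage(); cache-resident blocks are accounted
// for by the block cache itself.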

Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) {
  PERF_TIMER_GUARD(read_index_block_nanos);

  assert(table != nullptr);
  assert(index_block != nullptr);
  assert(index_block->IsEmpty());

  const Rep* const rep = table->get_rep();
  assert(rep != nullptr);

  const Status s = table->RetrieveBlock(
      prefetch_buffer, read_options, rep->footer.index_handle(),
      UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
      get_context, lookup_context, /* for_compaction */ false, use_cache);

  return s;
}

Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) const {
  assert(index_block != nullptr);

  if (!index_block_.IsEmpty()) {
    index_block->SetUnownedValue(index_block_.GetValue());
    return Status::OK();
  }

  ReadOptions read_options;
  if (no_io) {
    read_options.read_tier = kBlockCacheTier;
  }

  return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
                        cache_index_blocks(), get_context, lookup_context,
                        index_block);
}

// Index that allows binary search lookup in a two-level index structure.
class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read the partition index from the file and create an instance for
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin,
                       BlockCacheLookupContext* lookup_context,
                       std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    index_reader->reset(
        new PartitionIndexReader(table, std::move(index_block)));

    return Status::OK();
  }

  // return a two-level iterator: first level is on the partition index
  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    InternalIteratorBase<IndexValue>* it = nullptr;

    Statistics* kNullStats = nullptr;
    // Filters are already checked before seeking the index
    if (!partition_map_.empty()) {
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = NewTwoLevelIterator(
          new BlockBasedTable::PartitionedIndexIteratorState(table(),
                                                             &partition_map_),
          index_block.GetValue()->NewIndexIterator(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_has_first_key(),
              index_key_includes_seq(), index_value_is_full()));
    } else {
      ReadOptions ro;
      ro.fill_cache = read_options.fill_cache;
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = new BlockBasedTableIterator<IndexBlockIter, IndexValue>(
          table(), ro, *internal_comparator(),
          index_block.GetValue()->NewIndexIterator(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_has_first_key(),
              index_key_includes_seq(), index_value_is_full()),
          false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
          lookup_context ? lookup_context->caller
                         : TableReaderCaller::kUncategorized);
    }

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;

    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of an
    // on-stack BlockIter while the state is on the heap. Currently it
    // assumes the first level iter is always on the heap and will attempt
    // to delete it in its destructor.
  }

  void CacheDependencies(bool pin) override {
    // Before reading the partitions, prefetch them to avoid lots of IOs
    BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
    const BlockBasedTable::Rep* rep = table()->rep_;
    IndexBlockIter biter;
    BlockHandle handle;
    Statistics* kNullStats = nullptr;

    CachableEntry<Block> index_block;
    Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
                                   &lookup_context, &index_block);
    if (!s.ok()) {
      ROCKS_LOG_WARN(rep->ioptions.info_log,
                     "Error retrieving top-level index block while trying to "
                     "cache index partitions: %s",
                     s.ToString().c_str());
      return;
    }

    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), &biter,
        kNullStats, true, index_has_first_key(), index_key_includes_seq(),
        index_value_is_full());
    // Index partitions are assumed to be consecutive. Prefetch them all.
    // Read the first block offset
    biter.SeekToFirst();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value().handle;
    uint64_t prefetch_off = handle.offset();

    // Read the last block's offset
    biter.SeekToLast();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value().handle;
    uint64_t last_off = handle.offset() + block_size(handle);
    uint64_t prefetch_len = last_off - prefetch_off;
    std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
    rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer);
    s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off,
                                  static_cast<size_t>(prefetch_len));

    // After prefetch, read the partitions one by one
    biter.SeekToFirst();
    auto ro = ReadOptions();
    for (; biter.Valid(); biter.Next()) {
      handle = biter.value().handle;
      CachableEntry<Block> block;
      // TODO: Support counter batch update for partitioned index and
      // filter blocks
      s = table()->MaybeReadBlockAndLoadToCache(
          prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
          &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context,
          /*contents=*/nullptr);

      assert(s.ok() || block.GetValue() == nullptr);
      if (s.ok() && block.GetValue() != nullptr) {
        if (block.IsCached()) {
          if (pin) {
            partition_map_[handle.offset()] = std::move(block);
          }
        }
      }
    }
  }
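
  // Worked example for the prefetch-range computation above (numbers are
  // illustrative): if the first partition starts at offset 4096 and the
  // last partition starts at offset 61440 with block_size(handle) == 4096,
  // then prefetch_off = 4096, last_off = 61440 + 4096 = 65536, and
  // prefetch_len = 65536 - 4096 = 61440, i.e. one contiguous read covering
  // every partition. This only works because the partitions are laid out
  // consecutively in the file.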

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
    return usage;
  }

 private:
  PartitionIndexReader(const BlockBasedTable* t,
                       CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};

// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read the index from the file and create an instance of
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin,
                       BlockCacheLookupContext* lookup_context,
                       std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    index_reader->reset(
        new BinarySearchIndexReader(table, std::move(index_block)));

    return Status::OK();
  }

  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, true, index_has_first_key(), index_key_includes_seq(),
        index_value_is_full());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  BinarySearchIndexReader(const BlockBasedTable* t,
                          CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}
};

// Index that leverages an internal hash table to quicken the lookup for a
// given key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer,
                       InternalIterator* meta_index_iter, bool use_cache,
                       bool prefetch, bool pin,
                       BlockCacheLookupContext* lookup_context,
                       std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(index_reader != nullptr);
    assert(!pin || prefetch);

    const BlockBasedTable::Rep* rep = table->get_rep();
    assert(rep != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    // Note, failure to create prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.

    index_reader->reset(new HashIndexReader(table, std::move(index_block)));

    // Get prefixes block
    BlockHandle prefixes_handle;
    Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
                             &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
                      &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    RandomAccessFileReader* const file = rep->file.get();
    const Footer& footer = rep->footer;
    const ImmutableCFOptions& ioptions = rep->ioptions;
    const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
    MemoryAllocator* const memory_allocator =
        GetMemoryAllocator(rep->table_options);

    // Read contents for the blocks
    BlockContents prefixes_contents;
    BlockFetcher prefixes_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
        &prefixes_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexPrefixes,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      return s;
    }
    BlockContents prefixes_meta_contents;
    BlockFetcher prefixes_meta_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
        &prefixes_meta_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_meta_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    BlockPrefixIndex* prefix_index = nullptr;
    s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
                                 prefixes_contents.data,
                                 prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      HashIndexReader* const hash_index_reader =
          static_cast<HashIndexReader*>(index_reader->get());
      hash_index_reader->prefix_index_.reset(prefix_index);
    }

    return Status::OK();
  }

  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool disable_prefix_seek,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    Statistics* kNullStats = nullptr;
    const bool total_order_seek =
        read_options.total_order_seek || disable_prefix_seek;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, total_order_seek, index_has_first_key(),
        index_key_includes_seq(), index_value_is_full(),
        false /* block_contents_pinned */, prefix_index_.get());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<HashIndexReader*>(this));
#else
    if (prefix_index_) {
      usage += prefix_index_->ApproximateMemoryUsage();
    }
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  std::unique_ptr<BlockPrefixIndex> prefix_index_;
};

void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
                                            GetContext* get_context,
                                            size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  PERF_COUNTER_ADD(block_cache_hit_count, 1);
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_hit;
    get_context->get_context_stats_.num_cache_bytes_read += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_HIT);
    RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
      }
      break;

    case BlockType::kCompressionDictionary:
      // TODO: introduce perf counter for compression dictionary hit count
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
      }
      break;

    case BlockType::kIndex:
      PERF_COUNTER_ADD(block_cache_index_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
                                             GetContext* get_context) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce aggregate (not per-level) block cache miss count
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_miss;
  } else {
    RecordTick(statistics, BLOCK_CACHE_MISS);
  }

  // TODO: introduce perf counters for misses per block type
  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
                                                  GetContext* get_context,
                                                  size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce perf counters for block cache insertions
  if (get_context) {
    ++get_context->get_context_stats_.num_cache_add;
    get_context->get_context_stats_.num_cache_bytes_write += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_ADD);
    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_add;
        get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_add;
        get_context->get_context_stats_
            .num_cache_compression_dict_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
                   usage);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_add;
        get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_add;
        get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
      }
      break;
  }
}

Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, BlockType block_type,
    GetContext* get_context) const {
  auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);

  if (cache_handle != nullptr) {
    UpdateCacheHitMetrics(block_type, get_context,
                          block_cache->GetUsage(cache_handle));
  } else {
    UpdateCacheMissMetrics(block_type, get_context);
  }

  return cache_handle;
}

// Helper function to setup the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
  }
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                        &rep->compressed_cache_key_prefix_size);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

namespace {
// Return true if `user_prop_name` in table_properties has a `true` value,
// or if the property is missing entirely (for backward compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older versions don't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
                     user_prop_name.c_str(), pos->second.c_str());
    }
  }
  return true;
}

// Caller has to ensure seqno is not nullptr.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
  const auto& props = table_properties.user_collected_properties;
  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);

  *seqno = kDisableGlobalSequenceNumber;
  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
      std::array<char, 200> msg_buf;
      // This is not an external sst file, global_seqno is not supported.
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "A non-external sst file has a global seqno property with value %s",
          seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
      std::array<char, 200> msg_buf;
      // This is a v1 external sst file, global_seqno is not supported.
      snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u has a global seqno "
               "property with value %s",
               version, seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  // Since we have a plan to deprecate global_seqno, we do not return failure
  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  // SST is external.
  SequenceNumber global_seqno(0);
  if (seqno_pos != props.end()) {
    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  }
  // SstTableReader opens the table reader with kMaxSequenceNumber as
  // largest_seqno to denote that it is unknown.
  if (largest_seqno < kMaxSequenceNumber) {
    if (global_seqno == 0) {
      global_seqno = largest_seqno;
    }
    if (global_seqno != largest_seqno) {
      std::array<char, 200> msg_buf;
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u has a global seqno property "
          "with value %s, while the largest seqno in the file is %llu",
          version, seqno_pos->second.c_str(),
          static_cast<unsigned long long>(largest_seqno));
      return Status::Corruption(msg_buf.data());
    }
  }
  *seqno = global_seqno;

  if (global_seqno > kMaxSequenceNumber) {
    std::array<char, 200> msg_buf;
    snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u has a global seqno "
             "property with value %llu, which is greater than "
             "kMaxSequenceNumber",
             version, static_cast<unsigned long long>(global_seqno));
    return Status::Corruption(msg_buf.data());
  }

  return Status::OK();
}
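
// Summary of the cases handled above: (1) no version property and no seqno
// property -> not an external file, global seqno disabled; (2) no version
// property but a seqno property -> corruption; (3) a version 1 external
// file must not carry a seqno property; (4) for version >= 2, the seqno
// property is optional, must agree with largest_seqno when that is known,
// and must not exceed kMaxSequenceNumber.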
}  // namespace

Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}
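
// Resulting cache key layout (from the code above): a per-file prefix (the
// file's unique ID, or a cache-assigned ID encoded as a varint64) followed
// by the block's file offset as a varint64:
//
//   [prefix: <= kMaxCacheKeyPrefixSize bytes][varint64 handle.offset()]
//
// Two blocks from the same file thus share the prefix and differ only in
// the encoded offset.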

Status BlockBasedTable::Open(
    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
    const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    const SliceTransform* prefix_extractor,
    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
    const int level, const bool immortal_table,
    const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
    BlockCacheTracer* const block_cache_tracer) {
  table_reader->reset();

  Status s;
  Footer footer;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;

  // prefetch both index and filters, down to all partitions
  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  const bool preload_all = !table_options.cache_index_and_filter_blocks;

  if (!ioptions.allow_mmap_reads) {
    s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
                     preload_all, &prefetch_buffer);
  } else {
    // Should not prefetch for mmap mode.
    prefetch_buffer.reset(new FilePrefetchBuffer(
        nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
  }

  // Read in the following order:
  //    1. Footer
  //    2. [metaindex block]
  //    3. [meta block: properties]
  //    4. [meta block: range deletion tombstone]
  //    5. [meta block: compression dictionary]
  //    6. [meta block: index]
  //    7. [meta block: filter]
  s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
                         kBlockBasedTableMagicNumber);
  if (!s.ok()) {
    return s;
  }
  if (!BlockBasedTableSupportedVersion(footer.version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with newer "
        "version of RocksDB?");
  }

  // We've successfully read the footer. We are ready to serve requests.
  // Better not to mutate rep_ after creation; e.g., the
  // internal_prefix_transform raw pointer will be used to create
  // HashIndexReader, whose Reset may access a dangling pointer.
  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters, level,
                                      immortal_table);
  rep->file = std::move(file);
  rep->footer = footer;
  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  // We need to wrap data with internal_prefix_transform to make sure it can
  // handle prefix correctly.
  rep->internal_prefix_transform.reset(
      new InternalKeySliceTransform(prefix_extractor));
  SetupCacheKeyPrefix(rep);
  std::unique_ptr<BlockBasedTable> new_table(
      new BlockBasedTable(rep, block_cache_tracer));

  // page cache options
  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             std::string(rep->persistent_cache_key_prefix,
                                         rep->persistent_cache_key_prefix_size),
                             rep->ioptions.statistics);

  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  // handle to null, otherwise it may be seen as uninitialized during the below
  // meta-block reads.
  rep->compression_dict_handle = BlockHandle::NullBlockHandle();

  // Read metaindex
  std::unique_ptr<Block> metaindex;
  std::unique_ptr<InternalIterator> metaindex_iter;
  s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex,
                                    &metaindex_iter);
  if (!s.ok()) {
    return s;
  }

  // Populates table_properties and some fields that depend on it,
  // such as index_type.
  s = new_table->ReadPropertiesBlock(prefetch_buffer.get(),
                                     metaindex_iter.get(), largest_seqno);
  if (!s.ok()) {
    return s;
  }
  s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(),
                                   internal_comparator, &lookup_context);
  if (!s.ok()) {
    return s;
  }
  s = new_table->PrefetchIndexAndFilterBlocks(
      prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
      prefetch_all, table_options, level, &lookup_context);

  if (s.ok()) {
    // Update tail prefetch stats
    assert(prefetch_buffer.get() != nullptr);
    if (tail_prefetch_stats != nullptr) {
      assert(prefetch_buffer->min_offset_read() < file_size);
      tail_prefetch_stats->RecordEffectiveSize(
          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
    }

    *table_reader = std::move(new_table);
  }

  return s;
}

Status BlockBasedTable::PrefetchTail(
    RandomAccessFileReader* file, uint64_t file_size,
    TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
    const bool preload_all,
    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
  size_t tail_prefetch_size = 0;
  if (tail_prefetch_stats != nullptr) {
    // Multiple threads may get a 0 (no history) when running in parallel,
    // but it will get cleared after the first of them finishes.
    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
  }
  if (tail_prefetch_size == 0) {
    // Before reading the footer, read backwards to prefetch data. Do more
    // readahead if we're going to read the index/filter.
    // TODO: This may incorrectly select small readahead in case partitioned
    // index/filter is enabled and top-level partition pinning is enabled.
    // That's because we need to issue readahead before we read the properties,
    // at which point we don't yet know the index type.
    tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
  }
  size_t prefetch_off;
  size_t prefetch_len;
  if (file_size < tail_prefetch_size) {
    prefetch_off = 0;
    prefetch_len = static_cast<size_t>(file_size);
  } else {
    prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
    prefetch_len = tail_prefetch_size;
  }
  TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
                           &tail_prefetch_size);
  Status s;
  // TODO should not have this special logic in the future.
  if (!file->use_direct_io()) {
    prefetch_buffer->reset(new FilePrefetchBuffer(
        nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
    s = file->Prefetch(prefetch_off, prefetch_len);
  } else {
    prefetch_buffer->reset(new FilePrefetchBuffer(
        nullptr, 0, 0, true /* enable */, true /* track_min_offset */));
    s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
  }
  return s;
}
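
// Worked example for the tail-prefetch sizing above (numbers illustrative):
// with no prefetch history and prefetch_all == false, tail_prefetch_size is
// 4 KB; for a 10 MB file, prefetch_off = 10 MB - 4 KB and prefetch_len =
// 4 KB, so only the footer region is read ahead. If the file is smaller
// than the prefetch size (say 2 KB), the whole file is prefetched from
// offset 0 instead.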

Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
                      uint32_t expected) {
  Status s;
  uint32_t actual = 0;
  switch (type) {
    case kNoChecksum:
      break;
    case kCRC32c:
      expected = crc32c::Unmask(expected);
      actual = crc32c::Value(buf, len);
      break;
    case kxxHash:
      actual = XXH32(buf, static_cast<int>(len), 0);
      break;
    case kxxHash64:
      actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
                                     uint64_t{0xffffffff});
      break;
    default:
      s = Status::Corruption("unknown checksum type");
  }
  if (s.ok() && actual != expected) {
    s = Status::Corruption("properties block checksum mismatched");
  }
  return s;
}
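
// Illustrative use of VerifyChecksum (the `buf`, `len`, and `trailer`
// values are placeholders): CRC32c values are stored masked on disk, so
// callers pass the raw stored value and the kCRC32c branch unmasks it
// before comparing:
//
//   uint32_t stored = DecodeFixed32(trailer + 1);  // masked on-disk value
//   Status s = VerifyChecksum(kCRC32c, buf, len, stored);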

Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
    FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
    TableProperties** table_properties) {
  assert(table_properties != nullptr);
  // If this is an external SST file ingested with write_global_seqno set to
  // true, then we expect the checksum mismatch because checksum was written
  // by SstFileWriter, but its global seqno in the properties block may have
  // been changed during ingestion. In this case, we read the properties
  // block, copy it to a memory buffer, change the global seqno to its
  // original value, i.e. 0, and verify the checksum again.
  BlockHandle props_block_handle;
  CacheAllocationPtr tmp_buf;
  Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
                            rep_->footer, rep_->ioptions, table_properties,
                            false /* verify_checksum */, &props_block_handle,
                            &tmp_buf, false /* compression_type_missing */,
                            nullptr /* memory_allocator */);
  if (s.ok() && tmp_buf) {
    const auto seqno_pos_iter =
        (*table_properties)
            ->properties_offsets.find(
                ExternalSstFilePropertyNames::kGlobalSeqno);
    size_t block_size = static_cast<size_t>(props_block_handle.size());
    if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
      uint64_t global_seqno_offset = seqno_pos_iter->second;
      EncodeFixed64(
          tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
    }
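    // Note on the `block_size + 1` arithmetic below: on disk, a block is
    // followed by a 5-byte trailer -- 1 byte of compression type plus a
    // 4-byte checksum -- and the checksum covers both the block data and
    // the type byte. Hence the stored checksum lives at offset
    // block_size + 1 and verification runs over block_size + 1 bytes.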
    uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
    s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(),
                                block_size + 1, value);
  }
  return s;
}
1354
ReadPropertiesBlock(FilePrefetchBuffer * prefetch_buffer,InternalIterator * meta_iter,const SequenceNumber largest_seqno)1355 Status BlockBasedTable::ReadPropertiesBlock(
1356 FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1357 const SequenceNumber largest_seqno) {
1358 bool found_properties_block = true;
1359 Status s;
1360 s = SeekToPropertiesBlock(meta_iter, &found_properties_block);
1361
1362 if (!s.ok()) {
1363 ROCKS_LOG_WARN(rep_->ioptions.info_log,
1364 "Error when seeking to properties block from file: %s",
1365 s.ToString().c_str());
1366 } else if (found_properties_block) {
1367 s = meta_iter->status();
1368 TableProperties* table_properties = nullptr;
1369 if (s.ok()) {
1370 s = ReadProperties(
1371 meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
1372 rep_->ioptions, &table_properties, true /* verify_checksum */,
1373 nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
1374 false /* compression_type_missing */, nullptr /* memory_allocator */);
1375 }
1376
1377 if (s.IsCorruption()) {
1378 s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
1379 &table_properties);
1380 }
1381 std::unique_ptr<TableProperties> props_guard;
1382 if (table_properties != nullptr) {
1383 props_guard.reset(table_properties);
1384 }
1385
1386 if (!s.ok()) {
1387 ROCKS_LOG_WARN(rep_->ioptions.info_log,
1388 "Encountered error while reading data from properties "
1389 "block %s",
1390 s.ToString().c_str());
1391 } else {
1392 assert(table_properties != nullptr);
1393 rep_->table_properties.reset(props_guard.release());
1394 rep_->blocks_maybe_compressed =
1395 rep_->table_properties->compression_name !=
1396 CompressionTypeToString(kNoCompression);
1397 rep_->blocks_definitely_zstd_compressed =
1398 (rep_->table_properties->compression_name ==
1399 CompressionTypeToString(kZSTD) ||
1400 rep_->table_properties->compression_name ==
1401 CompressionTypeToString(kZSTDNotFinalCompression));
1402 }
1403 } else {
1404 ROCKS_LOG_ERROR(rep_->ioptions.info_log,
1405 "Cannot find Properties block from file.");
1406 }
1407 #ifndef ROCKSDB_LITE
1408 if (rep_->table_properties) {
1409 ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
1410 &(rep_->table_prefix_extractor));
1411 }
1412 #endif // ROCKSDB_LITE
1413
1414 // Read the table properties, if provided.
1415 if (rep_->table_properties) {
1416 rep_->whole_key_filtering &=
1417 IsFeatureSupported(*(rep_->table_properties),
1418 BlockBasedTablePropertyNames::kWholeKeyFiltering,
1419 rep_->ioptions.info_log);
1420 rep_->prefix_filtering &=
1421 IsFeatureSupported(*(rep_->table_properties),
1422 BlockBasedTablePropertyNames::kPrefixFiltering,
1423 rep_->ioptions.info_log);
1424
1425 rep_->index_key_includes_seq =
1426 rep_->table_properties->index_key_is_user_key == 0;
1427 rep_->index_value_is_full =
1428 rep_->table_properties->index_value_is_delta_encoded == 0;
1429
1430 // Update index_type with the true type.
1431 // If table properties don't contain index type, we assume that the table
1432 // is in very old format and has kBinarySearch index type.
1433 auto& props = rep_->table_properties->user_collected_properties;
1434 auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
1435 if (pos != props.end()) {
1436 rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
1437 DecodeFixed32(pos->second.c_str()));
1438 }
1439
1440 rep_->index_has_first_key =
1441 rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
1442
1443 s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
1444 &(rep_->global_seqno));
1445 if (!s.ok()) {
1446 ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
1447 }
1448 }
1449 return s;
1450 }
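// Illustrative sketch (hedged; this mirrors how the table builder is expected
// to encode the index-type property that the DecodeFixed32() above reverses):
//
//   std::string encoded;
//   PutFixed32(&encoded, static_cast<uint32_t>(
//                            BlockBasedTableOptions::kTwoLevelIndexSearch));
//   user_props[BlockBasedTablePropertyNames::kIndexType] = encoded;
//   // Reader side, as in ReadPropertiesBlock():
//   auto pos = user_props.find(BlockBasedTablePropertyNames::kIndexType);
//   auto index_type = static_cast<BlockBasedTableOptions::IndexType>(
//       DecodeFixed32(pos->second.c_str()));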
1451
1452 Status BlockBasedTable::ReadRangeDelBlock(
1453 FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1454 const InternalKeyComparator& internal_comparator,
1455 BlockCacheLookupContext* lookup_context) {
1456 Status s;
1457 bool found_range_del_block;
1458 BlockHandle range_del_handle;
1459 s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
1460 if (!s.ok()) {
1461 ROCKS_LOG_WARN(
1462 rep_->ioptions.info_log,
1463 "Error when seeking to range delete tombstones block from file: %s",
1464 s.ToString().c_str());
1465 } else if (found_range_del_block && !range_del_handle.IsNull()) {
1466 ReadOptions read_options;
1467 std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
1468 read_options, range_del_handle,
1469 /*input_iter=*/nullptr, BlockType::kRangeDeletion,
1470 /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
1471 assert(iter != nullptr);
1472 s = iter->status();
1473 if (!s.ok()) {
1474 ROCKS_LOG_WARN(
1475 rep_->ioptions.info_log,
1476 "Encountered error while reading data from range del block %s",
1477 s.ToString().c_str());
1478 } else {
1479 rep_->fragmented_range_dels =
1480 std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
1481 internal_comparator);
1482 }
1483 }
1484 return s;
1485 }
1486
1487 Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
1488 FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
1489 BlockBasedTable* new_table, bool prefetch_all,
1490 const BlockBasedTableOptions& table_options, const int level,
1491 BlockCacheLookupContext* lookup_context) {
1492 Status s;
1493
1494 // Find filter handle and filter type
1495 if (rep_->filter_policy) {
1496 for (auto filter_type :
1497 {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
1498 Rep::FilterType::kBlockFilter}) {
1499 std::string prefix;
1500 switch (filter_type) {
1501 case Rep::FilterType::kFullFilter:
1502 prefix = kFullFilterBlockPrefix;
1503 break;
1504 case Rep::FilterType::kPartitionedFilter:
1505 prefix = kPartitionedFilterBlockPrefix;
1506 break;
1507 case Rep::FilterType::kBlockFilter:
1508 prefix = kFilterBlockPrefix;
1509 break;
1510 default:
1511 assert(0);
1512 }
1513 std::string filter_block_key = prefix;
1514 filter_block_key.append(rep_->filter_policy->Name());
1515 if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
1516 .ok()) {
1517 rep_->filter_type = filter_type;
1518 break;
1519 }
1520 }
1521 }
1522
1523 // Find compression dictionary handle
1524 bool found_compression_dict = false;
1525 s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
1526 &rep_->compression_dict_handle);
1527 if (!s.ok()) {
1528 return s;
1529 }
1530
1531 BlockBasedTableOptions::IndexType index_type = rep_->index_type;
1532
1533 const bool use_cache = table_options.cache_index_and_filter_blocks;
1534
1535 // pin both index and filters, down to all partitions
1536 const bool pin_all =
1537 rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
1538
1539 // prefetch the first level of index
1540 const bool prefetch_index =
1541 prefetch_all ||
1542 (table_options.pin_top_level_index_and_filter &&
1543 index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
1544 // pin the first level of index
1545 const bool pin_index =
1546 pin_all || (table_options.pin_top_level_index_and_filter &&
1547 index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
1548
1549 std::unique_ptr<IndexReader> index_reader;
1550 s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
1551 prefetch_index, pin_index, lookup_context,
1552 &index_reader);
1553 if (!s.ok()) {
1554 return s;
1555 }
1556
1557 rep_->index_reader = std::move(index_reader);
1558
1559   // The partitions of a partitioned index are always stored in the block
1560   // cache. They hence follow the pin and prefetch configuration regardless
1561   // of the value of cache_index_and_filter_blocks.
1562 if (prefetch_all) {
1563 rep_->index_reader->CacheDependencies(pin_all);
1564 }
1565
1566 // prefetch the first level of filter
1567 const bool prefetch_filter =
1568 prefetch_all ||
1569 (table_options.pin_top_level_index_and_filter &&
1570 rep_->filter_type == Rep::FilterType::kPartitionedFilter);
1571   // Partitioned filters cannot be enabled without partitioned indexes
1572 assert(!prefetch_filter || prefetch_index);
1573 // pin the first level of filter
1574 const bool pin_filter =
1575 pin_all || (table_options.pin_top_level_index_and_filter &&
1576 rep_->filter_type == Rep::FilterType::kPartitionedFilter);
1577
1578 if (rep_->filter_policy) {
1579 auto filter = new_table->CreateFilterBlockReader(
1580 prefetch_buffer, use_cache, prefetch_filter, pin_filter,
1581 lookup_context);
1582 if (filter) {
1583       // Refer to the comment above about partitioned indexes always being cached
1584 if (prefetch_all) {
1585 filter->CacheDependencies(pin_all);
1586 }
1587
1588 rep_->filter = std::move(filter);
1589 }
1590 }
1591
1592 if (!rep_->compression_dict_handle.IsNull()) {
1593 std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
1594 s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
1595 prefetch_all, pin_all, lookup_context,
1596 &uncompression_dict_reader);
1597 if (!s.ok()) {
1598 return s;
1599 }
1600
1601 rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
1602 }
1603
1604 assert(s.ok());
1605 return s;
1606 }
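// Usage sketch (hedged; derived from the flags computed above, not a separate
// API): with a partitioned (two-level) index, the top level can be prefetched
// and pinned even for tables that are not at L0:
//
//   BlockBasedTableOptions to;
//   to.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
//   to.cache_index_and_filter_blocks = true;            // -> use_cache
//   to.pin_l0_filter_and_index_blocks_in_cache = true;  // -> pin_all at L0
//   to.pin_top_level_index_and_filter = true;           // -> prefetch_index,
//                                                       //    pin_index
//   // Partitions themselves are prefetched/pinned only when prefetch_all is
//   // set, via IndexReader::CacheDependencies(pin_all).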
1607
1608 void BlockBasedTable::SetupForCompaction() {
1609 switch (rep_->ioptions.access_hint_on_compaction_start) {
1610 case Options::NONE:
1611 break;
1612 case Options::NORMAL:
1613 rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
1614 break;
1615 case Options::SEQUENTIAL:
1616 rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
1617 break;
1618 case Options::WILLNEED:
1619 rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
1620 break;
1621 default:
1622 assert(false);
1623 }
1624 }
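// Usage sketch (hedged): the hint applied above originates from
// Options::access_hint_on_compaction_start, e.g.:
//
//   Options options;
//   options.access_hint_on_compaction_start = Options::SEQUENTIAL;
//   // Table files opened for compaction are then hinted with
//   // FSRandomAccessFile::kSequential, which a POSIX FileSystem typically
//   // forwards to fadvise().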
1625
1626 std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
1627 const {
1628 return rep_->table_properties;
1629 }
1630
1631 size_t BlockBasedTable::ApproximateMemoryUsage() const {
1632 size_t usage = 0;
1633 if (rep_->filter) {
1634 usage += rep_->filter->ApproximateMemoryUsage();
1635 }
1636 if (rep_->index_reader) {
1637 usage += rep_->index_reader->ApproximateMemoryUsage();
1638 }
1639 if (rep_->uncompression_dict_reader) {
1640 usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
1641 }
1642 return usage;
1643 }
1644
1645 // Load the meta-index block from the file.
1646 // On success, return the loaded metaindex block
1647 // and its iterator.
1648 Status BlockBasedTable::ReadMetaIndexBlock(
1649 FilePrefetchBuffer* prefetch_buffer,
1650 std::unique_ptr<Block>* metaindex_block,
1651 std::unique_ptr<InternalIterator>* iter) {
1652 // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
1653 // it is an empty block.
1654 std::unique_ptr<Block> metaindex;
1655 Status s = ReadBlockFromFile(
1656 rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
1657 rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
1658 true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
1659 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
1660 kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
1661 GetMemoryAllocator(rep_->table_options), false /* for_compaction */,
1662 rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */);
1663
1664 if (!s.ok()) {
1665 ROCKS_LOG_ERROR(rep_->ioptions.info_log,
1666                     "Encountered error while reading data from meta index"
1667 " block %s",
1668 s.ToString().c_str());
1669 return s;
1670 }
1671
1672 *metaindex_block = std::move(metaindex);
1673 // meta block uses bytewise comparator.
1674 iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(),
1675 BytewiseComparator()));
1676 return Status::OK();
1677 }
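// Illustrative sketch (hedged): the returned iterator walks name -> handle
// pairs, which is the pattern helpers such as FindMetaBlock() build on:
//
//   std::unique_ptr<Block> metaindex;
//   std::unique_ptr<InternalIterator> meta_iter;
//   Status s = ReadMetaIndexBlock(nullptr, &metaindex, &meta_iter);
//   for (meta_iter->SeekToFirst(); s.ok() && meta_iter->Valid();
//        meta_iter->Next()) {
//     Slice name = meta_iter->key();  // e.g. "rocksdb.properties"
//     Slice v = meta_iter->value();
//     BlockHandle handle;
//     s = handle.DecodeFrom(&v);      // where that meta block lives
//   }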
1678
1679 template <typename TBlocklike>
1680 Status BlockBasedTable::GetDataBlockFromCache(
1681 const Slice& block_cache_key, const Slice& compressed_block_cache_key,
1682 Cache* block_cache, Cache* block_cache_compressed,
1683 const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
1684 const UncompressionDict& uncompression_dict, BlockType block_type,
1685 GetContext* get_context) const {
1686 const size_t read_amp_bytes_per_bit =
1687 block_type == BlockType::kData
1688 ? rep_->table_options.read_amp_bytes_per_bit
1689 : 0;
1690 assert(block);
1691 assert(block->IsEmpty());
1692
1693 Status s;
1694 BlockContents* compressed_block = nullptr;
1695 Cache::Handle* block_cache_compressed_handle = nullptr;
1696
1697   // Look up the uncompressed block cache first
1698 if (block_cache != nullptr) {
1699 auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
1700 block_type, get_context);
1701 if (cache_handle != nullptr) {
1702 block->SetCachedValue(
1703 reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
1704 block_cache, cache_handle);
1705 return s;
1706 }
1707 }
1708
1709 // If not found, search from the compressed block cache.
1710 assert(block->IsEmpty());
1711
1712 if (block_cache_compressed == nullptr) {
1713 return s;
1714 }
1715
1716 assert(!compressed_block_cache_key.empty());
1717 block_cache_compressed_handle =
1718 block_cache_compressed->Lookup(compressed_block_cache_key);
1719
1720 Statistics* statistics = rep_->ioptions.statistics;
1721
1722   // If we find the block in the compressed cache, uncompress it and
1723   // insert it into the uncompressed cache
1724 if (block_cache_compressed_handle == nullptr) {
1725 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
1726 return s;
1727 }
1728
1729 // found compressed block
1730 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
1731 compressed_block = reinterpret_cast<BlockContents*>(
1732 block_cache_compressed->Value(block_cache_compressed_handle));
1733 CompressionType compression_type = compressed_block->get_compression_type();
1734 assert(compression_type != kNoCompression);
1735
1736 // Retrieve the uncompressed contents into a new buffer
1737 BlockContents contents;
1738 UncompressionContext context(compression_type);
1739 UncompressionInfo info(context, uncompression_dict, compression_type);
1740 s = UncompressBlockContents(
1741 info, compressed_block->data.data(), compressed_block->data.size(),
1742 &contents, rep_->table_options.format_version, rep_->ioptions,
1743 GetMemoryAllocator(rep_->table_options));
1744
1745 // Insert uncompressed block into block cache
1746 if (s.ok()) {
1747 std::unique_ptr<TBlocklike> block_holder(
1748 BlocklikeTraits<TBlocklike>::Create(
1749 std::move(contents), rep_->get_global_seqno(block_type),
1750 read_amp_bytes_per_bit, statistics,
1751 rep_->blocks_definitely_zstd_compressed,
1752 rep_->table_options.filter_policy.get())); // uncompressed block
1753
1754 if (block_cache != nullptr && block_holder->own_bytes() &&
1755 read_options.fill_cache) {
1756 size_t charge = block_holder->ApproximateMemoryUsage();
1757 Cache::Handle* cache_handle = nullptr;
1758 s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1759 &DeleteCachedEntry<TBlocklike>, &cache_handle);
1760 if (s.ok()) {
1761 assert(cache_handle != nullptr);
1762 block->SetCachedValue(block_holder.release(), block_cache,
1763 cache_handle);
1764
1765 UpdateCacheInsertionMetrics(block_type, get_context, charge);
1766 } else {
1767 RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1768 }
1769 } else {
1770 block->SetOwnedValue(block_holder.release());
1771 }
1772 }
1773
1774 // Release hold on compressed cache entry
1775 block_cache_compressed->Release(block_cache_compressed_handle);
1776 return s;
1777 }
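// Usage sketch (hedged): the two-tier lookup above is only exercised when
// both caches are configured:
//
//   BlockBasedTableOptions to;
//   to.block_cache = NewLRUCache(8 << 20);              // uncompressed tier
//   to.block_cache_compressed = NewLRUCache(32 << 20);  // compressed tier
//   // A hit in the compressed tier is uncompressed here and, when
//   // read_options.fill_cache is set, promoted into block_cache.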
1778
1779 template <typename TBlocklike>
1780 Status BlockBasedTable::PutDataBlockToCache(
1781 const Slice& block_cache_key, const Slice& compressed_block_cache_key,
1782 Cache* block_cache, Cache* block_cache_compressed,
1783 CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents,
1784 CompressionType raw_block_comp_type,
1785 const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
1786 MemoryAllocator* memory_allocator, BlockType block_type,
1787 GetContext* get_context) const {
1788 const ImmutableCFOptions& ioptions = rep_->ioptions;
1789 const uint32_t format_version = rep_->table_options.format_version;
1790 const size_t read_amp_bytes_per_bit =
1791 block_type == BlockType::kData
1792 ? rep_->table_options.read_amp_bytes_per_bit
1793 : 0;
1794 const Cache::Priority priority =
1795 rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
1796 (block_type == BlockType::kFilter ||
1797 block_type == BlockType::kCompressionDictionary ||
1798 block_type == BlockType::kIndex)
1799 ? Cache::Priority::HIGH
1800 : Cache::Priority::LOW;
1801 assert(cached_block);
1802 assert(cached_block->IsEmpty());
1803
1804 Status s;
1805 Statistics* statistics = ioptions.statistics;
1806
1807 std::unique_ptr<TBlocklike> block_holder;
1808 if (raw_block_comp_type != kNoCompression) {
1809 // Retrieve the uncompressed contents into a new buffer
1810 BlockContents uncompressed_block_contents;
1811 UncompressionContext context(raw_block_comp_type);
1812 UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
1813 s = UncompressBlockContents(info, raw_block_contents->data.data(),
1814 raw_block_contents->data.size(),
1815 &uncompressed_block_contents, format_version,
1816 ioptions, memory_allocator);
1817 if (!s.ok()) {
1818 return s;
1819 }
1820
1821 block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1822 std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit,
1823 statistics, rep_->blocks_definitely_zstd_compressed,
1824 rep_->table_options.filter_policy.get()));
1825 } else {
1826 block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1827 std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit,
1828 statistics, rep_->blocks_definitely_zstd_compressed,
1829 rep_->table_options.filter_policy.get()));
1830 }
1831
1832 // Insert compressed block into compressed block cache.
1833 // Release the hold on the compressed cache entry immediately.
1834 if (block_cache_compressed != nullptr &&
1835 raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
1836 raw_block_contents->own_bytes()) {
1837 #ifndef NDEBUG
1838 assert(raw_block_contents->is_raw_block);
1839 #endif // NDEBUG
1840
1841 // We cannot directly put raw_block_contents because this could point to
1842 // an object in the stack.
1843 BlockContents* block_cont_for_comp_cache =
1844 new BlockContents(std::move(*raw_block_contents));
1845 s = block_cache_compressed->Insert(
1846 compressed_block_cache_key, block_cont_for_comp_cache,
1847 block_cont_for_comp_cache->ApproximateMemoryUsage(),
1848 &DeleteCachedEntry<BlockContents>);
1849 if (s.ok()) {
1850       // Prevent the code below from deleting this cached block.
1851 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
1852 } else {
1853 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
1854 delete block_cont_for_comp_cache;
1855 }
1856 }
1857
1858 // insert into uncompressed block cache
1859 if (block_cache != nullptr && block_holder->own_bytes()) {
1860 size_t charge = block_holder->ApproximateMemoryUsage();
1861 Cache::Handle* cache_handle = nullptr;
1862 s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1863 &DeleteCachedEntry<TBlocklike>, &cache_handle,
1864 priority);
1865 if (s.ok()) {
1866 assert(cache_handle != nullptr);
1867 cached_block->SetCachedValue(block_holder.release(), block_cache,
1868 cache_handle);
1869
1870 UpdateCacheInsertionMetrics(block_type, get_context, charge);
1871 } else {
1872 RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1873 }
1874 } else {
1875 cached_block->SetOwnedValue(block_holder.release());
1876 }
1877
1878 return s;
1879 }
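// Usage sketch (hedged): the Cache::Priority::HIGH branch above is taken for
// filter/index/dictionary blocks only when the corresponding option is set:
//
//   BlockBasedTableOptions to;
//   to.cache_index_and_filter_blocks = true;
//   to.cache_index_and_filter_blocks_with_high_priority = true;
//   // Data blocks are still inserted with Cache::Priority::LOW, making a
//   // scan-heavy workload less likely to evict index and filter blocks.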
1880
1881 std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
1882 FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
1883 bool pin, BlockCacheLookupContext* lookup_context) {
1884 auto& rep = rep_;
1885 auto filter_type = rep->filter_type;
1886 if (filter_type == Rep::FilterType::kNoFilter) {
1887 return std::unique_ptr<FilterBlockReader>();
1888 }
1889
1890 assert(rep->filter_policy);
1891
1892 switch (filter_type) {
1893 case Rep::FilterType::kPartitionedFilter:
1894 return PartitionedFilterBlockReader::Create(
1895 this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1896
1897 case Rep::FilterType::kBlockFilter:
1898 return BlockBasedFilterBlockReader::Create(
1899 this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1900
1901 case Rep::FilterType::kFullFilter:
1902 return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
1903 prefetch, pin, lookup_context);
1904
1905 default:
1906       // filter_type is either kNoFilter (in which case we returned at the
1907       // first if above), or it must be covered by a case in this switch block
1908 assert(false);
1909 return std::unique_ptr<FilterBlockReader>();
1910 }
1911 }
1912
1913 // disable_prefix_seek should be set to true when prefix_extractor found in SST
1914 // differs from the one in mutable_cf_options and index type is HashBasedIndex
1915 InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
1916 const ReadOptions& read_options, bool disable_prefix_seek,
1917 IndexBlockIter* input_iter, GetContext* get_context,
1918 BlockCacheLookupContext* lookup_context) const {
1919 assert(rep_ != nullptr);
1920 assert(rep_->index_reader != nullptr);
1921
1922 // We don't return pinned data from index blocks, so no need
1923 // to set `block_contents_pinned`.
1924 return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
1925 input_iter, get_context,
1926 lookup_context);
1927 }
1928
1929 // Convert an index iterator value (i.e., an encoded BlockHandle)
1930 // into an iterator over the contents of the corresponding block.
1931 // If input_iter is null, allocate and return a new iterator;
1932 // otherwise, update input_iter in place and return it.
1933 template <typename TBlockIter>
1934 TBlockIter* BlockBasedTable::NewDataBlockIterator(
1935 const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
1936 BlockType block_type, GetContext* get_context,
1937 BlockCacheLookupContext* lookup_context, Status s,
1938 FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
1939 PERF_TIMER_GUARD(new_table_block_iter_nanos);
1940
1941 TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
1942 if (!s.ok()) {
1943 iter->Invalidate(s);
1944 return iter;
1945 }
1946
1947 CachableEntry<UncompressionDict> uncompression_dict;
1948 if (rep_->uncompression_dict_reader) {
1949 const bool no_io = (ro.read_tier == kBlockCacheTier);
1950 s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
1951 prefetch_buffer, no_io, get_context, lookup_context,
1952 &uncompression_dict);
1953 if (!s.ok()) {
1954 iter->Invalidate(s);
1955 return iter;
1956 }
1957 }
1958
1959 const UncompressionDict& dict = uncompression_dict.GetValue()
1960 ? *uncompression_dict.GetValue()
1961 : UncompressionDict::GetEmptyDict();
1962
1963 CachableEntry<Block> block;
1964 s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type,
1965 get_context, lookup_context, for_compaction,
1966 /* use_cache */ true);
1967
1968 if (!s.ok()) {
1969 assert(block.IsEmpty());
1970 iter->Invalidate(s);
1971 return iter;
1972 }
1973
1974 assert(block.GetValue() != nullptr);
1975
1976 // Block contents are pinned and it is still pinned after the iterator
1977 // is destroyed as long as cleanup functions are moved to another object,
1978 // when:
1979 // 1. block cache handle is set to be released in cleanup function, or
1980 // 2. it's pointing to immortal source. If own_bytes is true then we are
1981 // not reading data from the original source, whether immortal or not.
1982 // Otherwise, the block is pinned iff the source is immortal.
1983 const bool block_contents_pinned =
1984 block.IsCached() ||
1985 (!block.GetValue()->own_bytes() && rep_->immortal_table);
1986 iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
1987 block_contents_pinned);
1988
1989 if (!block.IsCached()) {
1990 if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
1991 // insert a dummy record to block cache to track the memory usage
1992 Cache* const block_cache = rep_->table_options.block_cache.get();
1993 Cache::Handle* cache_handle = nullptr;
1994 // There are two other types of cache keys: 1) SST cache key added in
1995 // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
1996 // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
1997 // from SST cache key(31 bytes), and use non-zero prefix to
1998 // differentiate from `write_buffer_manager`
1999 const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
2000 char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
2001 // Prefix: use rep_->cache_key_prefix padded by 0s
2002 memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
2003 assert(rep_->cache_key_prefix_size != 0);
2004 assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
2005 memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
2006 char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
2007 next_cache_key_id_++);
2008 assert(end - cache_key <=
2009 static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
2010 const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
2011 s = block_cache->Insert(unique_key, nullptr,
2012 block.GetValue()->ApproximateMemoryUsage(),
2013 nullptr, &cache_handle);
2014
2015 if (s.ok()) {
2016 assert(cache_handle != nullptr);
2017 iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
2018 cache_handle);
2019 }
2020 }
2021 } else {
2022 iter->SetCacheHandle(block.GetCacheHandle());
2023 }
2024
2025 block.TransferTo(iter);
2026
2027 return iter;
2028 }
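// Sizing note (derived from the constants above): kMaxVarint64Length is 10,
// so kExtraCacheKeyPrefix is 4 * 10 + 1 = 41 bytes, longer than the 31-byte
// SST cache key mentioned above, and its copied non-zero prefix distinguishes
// it from `write_buffer_manager` dummy keys, per the comment above. Layout:
//
//   // [0, cache_key_prefix_size)   copied SST cache key prefix (non-zero)
//   // [cache_key_prefix_size, 41)  zero padding
//   // [41, 41 + up to 10)          varint64 of next_cache_key_id_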
2029
2030 template <>
2031 DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
2032 const Rep* rep, Block* block, DataBlockIter* input_iter,
2033 bool block_contents_pinned) {
2034 return block->NewDataIterator(
2035 &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2036 input_iter, rep->ioptions.statistics, block_contents_pinned);
2037 }
2038
2039 template <>
2040 IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
2041 const Rep* rep, Block* block, IndexBlockIter* input_iter,
2042 bool block_contents_pinned) {
2043 return block->NewIndexIterator(
2044 &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2045 input_iter, rep->ioptions.statistics, /* total_order_seek */ true,
2046 rep->index_has_first_key, rep->index_key_includes_seq,
2047 rep->index_value_is_full, block_contents_pinned);
2048 }
2049
2050 // Convert an uncompressed data block (i.e., a CachableEntry<Block>)
2051 // into an iterator over the contents of the corresponding block.
2052 // If input_iter is null, allocate and return a new iterator;
2053 // otherwise, update input_iter in place and return it.
2054 template <typename TBlockIter>
2055 TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
2056 CachableEntry<Block>& block,
2057 TBlockIter* input_iter,
2058 Status s) const {
2059 PERF_TIMER_GUARD(new_table_block_iter_nanos);
2060
2061 TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
2062 if (!s.ok()) {
2063 iter->Invalidate(s);
2064 return iter;
2065 }
2066
2067 assert(block.GetValue() != nullptr);
2068 // Block contents are pinned and it is still pinned after the iterator
2069 // is destroyed as long as cleanup functions are moved to another object,
2070 // when:
2071 // 1. block cache handle is set to be released in cleanup function, or
2072 // 2. it's pointing to immortal source. If own_bytes is true then we are
2073 // not reading data from the original source, whether immortal or not.
2074 // Otherwise, the block is pinned iff the source is immortal.
2075 const bool block_contents_pinned =
2076 block.IsCached() ||
2077 (!block.GetValue()->own_bytes() && rep_->immortal_table);
2078 iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
2079 block_contents_pinned);
2080
2081 if (!block.IsCached()) {
2082 if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
2083 // insert a dummy record to block cache to track the memory usage
2084 Cache* const block_cache = rep_->table_options.block_cache.get();
2085 Cache::Handle* cache_handle = nullptr;
2086 // There are two other types of cache keys: 1) SST cache key added in
2087 // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
2088 // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
2089 // from SST cache key(31 bytes), and use non-zero prefix to
2090 // differentiate from `write_buffer_manager`
2091 const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
2092 char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
2093 // Prefix: use rep_->cache_key_prefix padded by 0s
2094 memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
2095 assert(rep_->cache_key_prefix_size != 0);
2096 assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
2097 memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
2098 char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
2099 next_cache_key_id_++);
2100 assert(end - cache_key <=
2101 static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
2102 const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
2103 s = block_cache->Insert(unique_key, nullptr,
2104 block.GetValue()->ApproximateMemoryUsage(),
2105 nullptr, &cache_handle);
2106 if (s.ok()) {
2107 assert(cache_handle != nullptr);
2108 iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
2109 cache_handle);
2110 }
2111 }
2112 } else {
2113 iter->SetCacheHandle(block.GetCacheHandle());
2114 }
2115
2116 block.TransferTo(iter);
2117 return iter;
2118 }
2119
2120 // If contents is nullptr, this function looks up the block caches for the
2121 // data block referenced by handle, and reads the block from disk if necessary.
2122 // If contents is non-null, it skips the cache lookup and disk read, since
2123 // the caller has already read it. In both cases, if ro.fill_cache is true,
2124 // it inserts the block into the block cache.
2125 template <typename TBlocklike>
2126 Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
2127 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2128 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2129 CachableEntry<TBlocklike>* block_entry, BlockType block_type,
2130 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2131 BlockContents* contents) const {
2132 assert(block_entry != nullptr);
2133 const bool no_io = (ro.read_tier == kBlockCacheTier);
2134 Cache* block_cache = rep_->table_options.block_cache.get();
2135   // No point in caching compressed blocks if the table never goes away
2136 Cache* block_cache_compressed =
2137 rep_->immortal_table ? nullptr
2138 : rep_->table_options.block_cache_compressed.get();
2139
2140 // First, try to get the block from the cache
2141 //
2142 // If either block cache is enabled, we'll try to read from it.
2143 Status s;
2144 char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2145 char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2146 Slice key /* key to the block cache */;
2147 Slice ckey /* key to the compressed block cache */;
2148 bool is_cache_hit = false;
2149 if (block_cache != nullptr || block_cache_compressed != nullptr) {
2150 // create key for block cache
2151 if (block_cache != nullptr) {
2152 key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
2153 handle, cache_key);
2154 }
2155
2156 if (block_cache_compressed != nullptr) {
2157 ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
2158 rep_->compressed_cache_key_prefix_size, handle,
2159 compressed_cache_key);
2160 }
2161
2162 if (!contents) {
2163 s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
2164 ro, block_entry, uncompression_dict, block_type,
2165 get_context);
2166 if (block_entry->GetValue()) {
2167 // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
2168 // compressed block cache.
2169 is_cache_hit = true;
2170 }
2171 }
2172
2173     // The block was not found in the cache. If I/O is allowed, read it
2174     // from the file.
2175 if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
2176 Statistics* statistics = rep_->ioptions.statistics;
2177 const bool maybe_compressed =
2178 block_type != BlockType::kFilter &&
2179 block_type != BlockType::kCompressionDictionary &&
2180 rep_->blocks_maybe_compressed;
2181 const bool do_uncompress = maybe_compressed && !block_cache_compressed;
2182 CompressionType raw_block_comp_type;
2183 BlockContents raw_block_contents;
2184 if (!contents) {
2185 StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
2186 BlockFetcher block_fetcher(
2187 rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
2188 &raw_block_contents, rep_->ioptions, do_uncompress,
2189 maybe_compressed, block_type, uncompression_dict,
2190 rep_->persistent_cache_options,
2191 GetMemoryAllocator(rep_->table_options),
2192 GetMemoryAllocatorForCompressedBlock(rep_->table_options));
2193 s = block_fetcher.ReadBlockContents();
2194 raw_block_comp_type = block_fetcher.get_compression_type();
2195 contents = &raw_block_contents;
2196 } else {
2197 raw_block_comp_type = contents->get_compression_type();
2198 }
2199
2200 if (s.ok()) {
2201 SequenceNumber seq_no = rep_->get_global_seqno(block_type);
2202 // If filling cache is allowed and a cache is configured, try to put the
2203 // block to the cache.
2204 s = PutDataBlockToCache(
2205 key, ckey, block_cache, block_cache_compressed, block_entry,
2206 contents, raw_block_comp_type, uncompression_dict, seq_no,
2207 GetMemoryAllocator(rep_->table_options), block_type, get_context);
2208 }
2209 }
2210 }
2211
2212 // Fill lookup_context.
2213 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
2214 lookup_context) {
2215 size_t usage = 0;
2216 uint64_t nkeys = 0;
2217 if (block_entry->GetValue()) {
2218 // Approximate the number of keys in the block using restarts.
2219 nkeys =
2220 rep_->table_options.block_restart_interval *
2221 BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue());
2222 usage = block_entry->GetValue()->ApproximateMemoryUsage();
2223 }
2224 TraceType trace_block_type = TraceType::kTraceMax;
2225 switch (block_type) {
2226 case BlockType::kData:
2227 trace_block_type = TraceType::kBlockTraceDataBlock;
2228 break;
2229 case BlockType::kFilter:
2230 trace_block_type = TraceType::kBlockTraceFilterBlock;
2231 break;
2232 case BlockType::kCompressionDictionary:
2233 trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
2234 break;
2235 case BlockType::kRangeDeletion:
2236 trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
2237 break;
2238 case BlockType::kIndex:
2239 trace_block_type = TraceType::kBlockTraceIndexBlock;
2240 break;
2241 default:
2242 // This cannot happen.
2243 assert(false);
2244 break;
2245 }
2246 bool no_insert = no_io || !ro.fill_cache;
2247 if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
2248 trace_block_type, lookup_context->caller)) {
2249 // Defer logging the access to Get() and MultiGet() to trace additional
2250 // information, e.g., referenced_key_exist_in_block.
2251
2252 // Make a copy of the block key here since it will be logged later.
2253 lookup_context->FillLookupContext(
2254 is_cache_hit, no_insert, trace_block_type,
2255 /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
2256 } else {
2257 // Avoid making copy of block_key and cf_name when constructing the access
2258 // record.
2259 BlockCacheTraceRecord access_record(
2260 rep_->ioptions.env->NowMicros(),
2261 /*block_key=*/"", trace_block_type,
2262 /*block_size=*/usage, rep_->cf_id_for_tracing(),
2263 /*cf_name=*/"", rep_->level_for_tracing(),
2264 rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
2265 no_insert, lookup_context->get_id,
2266 lookup_context->get_from_user_specified_snapshot,
2267 /*referenced_key=*/"");
2268 block_cache_tracer_->WriteBlockAccess(access_record, key,
2269 rep_->cf_name_for_tracing(),
2270 lookup_context->referenced_key);
2271 }
2272 }
2273
2274 assert(s.ok() || block_entry->GetValue() == nullptr);
2275 return s;
2276 }
2277
2278 // This function reads multiple data blocks from disk using MultiRead()
2279 // and optionally inserts them into the block cache. It uses the scratch
2280 // buffer provided by the caller, which is contiguous. If scratch is a nullptr
2281 // it allocates a separate buffer for each block. Typically, if the blocks
2282 // need to be uncompressed and there is no compressed block cache, callers
2283 // can allocate a temporary scratch buffer in order to minimize memory
2284 // allocations.
2285 // If options.fill_cache is true, it inserts the blocks into cache. If it's
2286 // false and scratch is non-null and the blocks are uncompressed, it copies
2287 // the buffers to heap. In any case, the CachableEntry<Block> returned will
2288 // own the data bytes.
2289 // If compression is enabled and there is no compressed block cache, the
2290 // adjacent blocks are read out in one I/O (combined read)
2291 // batch - A MultiGetRange with only those keys with unique data blocks not
2292 // found in cache
2293 // handles - A vector of block handles. Some of them may be null handles
2294 // scratch - An optional contiguous buffer to read compressed blocks into
2295 void BlockBasedTable::RetrieveMultipleBlocks(
2296 const ReadOptions& options, const MultiGetRange* batch,
2297 const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
2298 autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
2299 autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results,
2300 char* scratch, const UncompressionDict& uncompression_dict) const {
2301 RandomAccessFileReader* file = rep_->file.get();
2302 const Footer& footer = rep_->footer;
2303 const ImmutableCFOptions& ioptions = rep_->ioptions;
2304 SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData);
2305 size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
2306 MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
2307
2308 if (file->use_direct_io() || ioptions.allow_mmap_reads) {
2309 size_t idx_in_batch = 0;
2310 for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2311 ++mget_iter, ++idx_in_batch) {
2312 BlockCacheLookupContext lookup_data_block_context(
2313 TableReaderCaller::kUserMultiGet);
2314 const BlockHandle& handle = (*handles)[idx_in_batch];
2315 if (handle.IsNull()) {
2316 continue;
2317 }
2318
2319 (*statuses)[idx_in_batch] =
2320 RetrieveBlock(nullptr, options, handle, uncompression_dict,
2321 &(*results)[idx_in_batch], BlockType::kData,
2322 mget_iter->get_context, &lookup_data_block_context,
2323 /* for_compaction */ false, /* use_cache */ true);
2324 }
2325 return;
2326 }
2327
2328 autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
2329 size_t buf_offset = 0;
2330 size_t idx_in_batch = 0;
2331
2332 uint64_t prev_offset = 0;
2333 size_t prev_len = 0;
2334 autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
2335 autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
2336 for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2337 ++mget_iter, ++idx_in_batch) {
2338 const BlockHandle& handle = (*handles)[idx_in_batch];
2339 if (handle.IsNull()) {
2340 continue;
2341 }
2342
2343 size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
2344
2345     // If the current block is adjacent to the previous one and, at the same
2346     // time, compression is enabled and there is no compressed cache, we
2347     // combine the two block reads into one.
2348 if (scratch != nullptr && prev_end == handle.offset()) {
2349 req_offset_for_block.emplace_back(prev_len);
2350 prev_len += block_size(handle);
2351 } else {
2352       // No combined read (scratch is null), or this block is not adjacent to the previous one:
2353 // Step 1, create a new request for previous blocks
2354 if (prev_len != 0) {
2355 FSReadRequest req;
2356 req.offset = prev_offset;
2357 req.len = prev_len;
2358 if (scratch == nullptr) {
2359 req.scratch = new char[req.len];
2360 } else {
2361 req.scratch = scratch + buf_offset;
2362 buf_offset += req.len;
2363 }
2364 req.status = IOStatus::OK();
2365 read_reqs.emplace_back(req);
2366 }
2367
2368       // Step 2, remember the previous block info
2369 prev_offset = handle.offset();
2370 prev_len = block_size(handle);
2371 req_offset_for_block.emplace_back(0);
2372 }
2373 req_idx_for_block.emplace_back(read_reqs.size());
2374 }
2375 // Handle the last block and process the pending last request
2376 if (prev_len != 0) {
2377 FSReadRequest req;
2378 req.offset = prev_offset;
2379 req.len = prev_len;
2380 if (scratch == nullptr) {
2381 req.scratch = new char[req.len];
2382 } else {
2383 req.scratch = scratch + buf_offset;
2384 }
2385 req.status = IOStatus::OK();
2386 read_reqs.emplace_back(req);
2387 }
2388
2389 file->MultiRead(&read_reqs[0], read_reqs.size());
2390
2391 idx_in_batch = 0;
2392 size_t valid_batch_idx = 0;
2393 for (auto mget_iter = batch->begin(); mget_iter != batch->end();
2394 ++mget_iter, ++idx_in_batch) {
2395 const BlockHandle& handle = (*handles)[idx_in_batch];
2396
2397 if (handle.IsNull()) {
2398 continue;
2399 }
2400
2401 assert(valid_batch_idx < req_idx_for_block.size());
2402 assert(valid_batch_idx < req_offset_for_block.size());
2403 assert(req_idx_for_block[valid_batch_idx] < read_reqs.size());
2404 size_t& req_idx = req_idx_for_block[valid_batch_idx];
2405 size_t& req_offset = req_offset_for_block[valid_batch_idx];
2406 valid_batch_idx++;
2407 FSReadRequest& req = read_reqs[req_idx];
2408 Status s = req.status;
2409 if (s.ok()) {
2410 if (req.result.size() != req.len) {
2411 s = Status::Corruption(
2412 "truncated block read from " + rep_->file->file_name() +
2413 " offset " + ToString(handle.offset()) + ", expected " +
2414 ToString(req.len) + " bytes, got " + ToString(req.result.size()));
2415 }
2416 }
2417
2418 BlockContents raw_block_contents;
2419 size_t cur_read_end = req_offset + block_size(handle);
2420 if (cur_read_end > req.result.size()) {
2421 s = Status::Corruption(
2422 "truncated block read from " + rep_->file->file_name() + " offset " +
2423 ToString(handle.offset()) + ", expected " + ToString(req.len) +
2424 " bytes, got " + ToString(req.result.size()));
2425 }
2426
2427 bool blocks_share_read_buffer = (req.result.size() != block_size(handle));
2428 if (s.ok()) {
2429 if (scratch == nullptr && !blocks_share_read_buffer) {
2430 // We allocated a buffer for this block. Give ownership of it to
2431 // BlockContents so it can free the memory
2432 assert(req.result.data() == req.scratch);
2433 std::unique_ptr<char[]> raw_block(req.scratch + req_offset);
2434 raw_block_contents = BlockContents(std::move(raw_block), handle.size());
2435 } else {
2436         // We used the scratch buffer, which is shared by the blocks;
2437         // raw_block_contents does not take ownership.
2438 raw_block_contents =
2439 BlockContents(Slice(req.scratch + req_offset, handle.size()));
2440 }
2441
2442 #ifndef NDEBUG
2443 raw_block_contents.is_raw_block = true;
2444 #endif
2445 if (options.verify_checksums) {
2446 PERF_TIMER_GUARD(block_checksum_time);
2447 const char* data = req.result.data();
2448 uint32_t expected =
2449 DecodeFixed32(data + req_offset + handle.size() + 1);
2450         // Since the scratch buffer might be shared, the offset of the data
2451         // block in the buffer might not be 0. req.result.data() only points
2452         // to the beginning of each read request, so we need to add the
2453         // per-block offset within the request. The checksum is stored in
2454         // the block trailer, at offset handle.size() + 1.
2455 s = rocksdb::VerifyChecksum(footer.checksum(),
2456 req.result.data() + req_offset,
2457 handle.size() + 1, expected);
2458 TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
2459 }
2460 }
2461
2462 if (s.ok()) {
2463       // This handles a rare case: compression is enabled and there is no
2464       // compressed cache (so combined reads are used); in this case,
2465       // scratch != nullptr. At the same time, some blocks are actually not
2466       // compressed, since their compression space saving is smaller than
2467       // the threshold. If such a block shares the scratch memory, we need
2468       // to copy it to the heap so it can be added to the regular block cache.
2469 CompressionType compression_type =
2470 raw_block_contents.get_compression_type();
2471 if (scratch != nullptr && compression_type == kNoCompression) {
2472 Slice raw = Slice(req.scratch + req_offset, block_size(handle));
2473 raw_block_contents = BlockContents(
2474 CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw),
2475 handle.size());
2476 #ifndef NDEBUG
2477 raw_block_contents.is_raw_block = true;
2478 #endif
2479 }
2480 }
2481
2482 if (s.ok()) {
2483 if (options.fill_cache) {
2484 BlockCacheLookupContext lookup_data_block_context(
2485 TableReaderCaller::kUserMultiGet);
2486 CachableEntry<Block>* block_entry = &(*results)[idx_in_batch];
2487 // MaybeReadBlockAndLoadToCache will insert into the block caches if
2488 // necessary. Since we're passing the raw block contents, it will
2489 // avoid looking up the block cache
2490 s = MaybeReadBlockAndLoadToCache(
2491 nullptr, options, handle, uncompression_dict, block_entry,
2492 BlockType::kData, mget_iter->get_context,
2493 &lookup_data_block_context, &raw_block_contents);
2494
2495 // block_entry value could be null if no block cache is present, i.e
2496 // BlockBasedTableOptions::no_block_cache is true and no compressed
2497 // block cache is configured. In that case, fall
2498 // through and set up the block explicitly
2499 if (block_entry->GetValue() != nullptr) {
2500 continue;
2501 }
2502 }
2503
2504 CompressionType compression_type =
2505 raw_block_contents.get_compression_type();
2506 BlockContents contents;
2507 if (compression_type != kNoCompression) {
2508 UncompressionContext context(compression_type);
2509 UncompressionInfo info(context, uncompression_dict, compression_type);
2510 s = UncompressBlockContents(info, req.result.data() + req_offset,
2511 handle.size(), &contents, footer.version(),
2512 rep_->ioptions, memory_allocator);
2513 } else {
2514       // There are two cases here: 1) the caller uses the scratch buffer;
2515       // 2) we use the request buffer. If the scratch buffer is used, we
2516       // ensure that all raw blocks are copied to the heap as single blocks.
2517       // If the scratch buffer is not used, there is also no combined read,
2518       // so the raw block can be used directly.
2519 contents = std::move(raw_block_contents);
2520 }
2521 if (s.ok()) {
2522 (*results)[idx_in_batch].SetOwnedValue(
2523 new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
2524 ioptions.statistics));
2525 }
2526 }
2527 (*statuses)[idx_in_batch] = s;
2528 }
2529 }
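// Worked example of the request coalescing above (hedged; offsets invented,
// and block_size(handle) is assumed to be handle.size() plus the 5-byte
// trailer). With scratch != nullptr and three handles
//   h0{offset=0,   size=95}, h1{offset=100, size=95}, h2{offset=300, size=95}
// the loop builds:
//
//   read_reqs[0]: offset 0,   len 200   // h0 and h1 are adjacent, combined
//   read_reqs[1]: offset 300, len 100   // gap before h2 -> separate request
//   // req_idx_for_block    = {0, 0, 1}
//   // req_offset_for_block = {0, 100, 0}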
2530
2531 template <typename TBlocklike>
2532 Status BlockBasedTable::RetrieveBlock(
2533 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2534 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2535 CachableEntry<TBlocklike>* block_entry, BlockType block_type,
2536 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2537 bool for_compaction, bool use_cache) const {
2538 assert(block_entry);
2539 assert(block_entry->IsEmpty());
2540
2541 Status s;
2542 if (use_cache) {
2543 s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
2544 uncompression_dict, block_entry,
2545 block_type, get_context, lookup_context,
2546 /*contents=*/nullptr);
2547
2548 if (!s.ok()) {
2549 return s;
2550 }
2551
2552 if (block_entry->GetValue() != nullptr) {
2553 assert(s.ok());
2554 return s;
2555 }
2556 }
2557
2558 assert(block_entry->IsEmpty());
2559
2560 const bool no_io = ro.read_tier == kBlockCacheTier;
2561 if (no_io) {
2562 return Status::Incomplete("no blocking io");
2563 }
2564
2565 const bool maybe_compressed =
2566 block_type != BlockType::kFilter &&
2567 block_type != BlockType::kCompressionDictionary &&
2568 rep_->blocks_maybe_compressed;
2569 const bool do_uncompress = maybe_compressed;
2570 std::unique_ptr<TBlocklike> block;
2571
2572 {
2573 StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
2574 READ_BLOCK_GET_MICROS);
2575 s = ReadBlockFromFile(
2576 rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
2577 rep_->ioptions, do_uncompress, maybe_compressed, block_type,
2578 uncompression_dict, rep_->persistent_cache_options,
2579 rep_->get_global_seqno(block_type),
2580 block_type == BlockType::kData
2581 ? rep_->table_options.read_amp_bytes_per_bit
2582 : 0,
2583 GetMemoryAllocator(rep_->table_options), for_compaction,
2584 rep_->blocks_definitely_zstd_compressed,
2585 rep_->table_options.filter_policy.get());
2586 }
2587
2588 if (!s.ok()) {
2589 return s;
2590 }
2591
2592 block_entry->SetOwnedValue(block.release());
2593
2594 assert(s.ok());
2595 return s;
2596 }
2597
2598 // Explicitly instantiate templates for both "blocklike" types we use.
2599 // This makes it possible to keep the template definitions in the .cc file.
2600 template Status BlockBasedTable::RetrieveBlock<BlockContents>(
2601 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2602 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2603 CachableEntry<BlockContents>* block_entry, BlockType block_type,
2604 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2605 bool for_compaction, bool use_cache) const;
2606
2607 template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
2608 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2609 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2610 CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type,
2611 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2612 bool for_compaction, bool use_cache) const;
2613
2614 template Status BlockBasedTable::RetrieveBlock<Block>(
2615 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2616 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2617 CachableEntry<Block>* block_entry, BlockType block_type,
2618 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2619 bool for_compaction, bool use_cache) const;
2620
2621 template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
2622 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2623 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2624 CachableEntry<UncompressionDict>* block_entry, BlockType block_type,
2625 GetContext* get_context, BlockCacheLookupContext* lookup_context,
2626 bool for_compaction, bool use_cache) const;
2627
2628 BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
2629 const BlockBasedTable* table,
2630 std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
2631 : table_(table), block_map_(block_map) {}
2632
2633 InternalIteratorBase<IndexValue>*
2634 BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
2635 const BlockHandle& handle) {
2636 // Return a block iterator on the index partition
2637 auto block = block_map_->find(handle.offset());
2638   // The partition may be absent from the map if the block cache did not
2639   // have space for it; in that case, fall through to the empty iterator.
2640 if (block != block_map_->end()) {
2641 const Rep* rep = table_->get_rep();
2642 assert(rep);
2643
2644 Statistics* kNullStats = nullptr;
2645 // We don't return pinned data from index blocks, so no need
2646 // to set `block_contents_pinned`.
2647 return block->second.GetValue()->NewIndexIterator(
2648 &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2649 nullptr, kNullStats, true, rep->index_has_first_key,
2650 rep->index_key_includes_seq, rep->index_value_is_full);
2651 }
2652 // Create an empty iterator
2653 return new IndexBlockIter();
2654 }
2655
2656 // This will be broken if the user specifies an unusual implementation
2657 // of Options.comparator, or if the user specifies an unusual
2658 // definition of prefixes in BlockBasedTableOptions.filter_policy.
2659 // In particular, we require the following three properties:
2660 //
2661 // 1) key.starts_with(prefix(key))
2662 // 2) Compare(prefix(key), key) <= 0.
2663 // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
2664 //
2665 // Otherwise, this method guarantees no I/O will be incurred.
2666 //
2667 // REQUIRES: this method shouldn't be called while the DB lock is held.
2668 bool BlockBasedTable::PrefixMayMatch(
2669 const Slice& internal_key, const ReadOptions& read_options,
2670 const SliceTransform* options_prefix_extractor,
2671 const bool need_upper_bound_check,
2672 BlockCacheLookupContext* lookup_context) const {
2673 if (!rep_->filter_policy) {
2674 return true;
2675 }
2676
2677 const SliceTransform* prefix_extractor;
2678
2679 if (rep_->table_prefix_extractor == nullptr) {
2680 if (need_upper_bound_check) {
2681 return true;
2682 }
2683 prefix_extractor = options_prefix_extractor;
2684 } else {
2685 prefix_extractor = rep_->table_prefix_extractor.get();
2686 }
2687 auto user_key = ExtractUserKey(internal_key);
2688 if (!prefix_extractor->InDomain(user_key)) {
2689 return true;
2690 }
2691
2692 bool may_match = true;
2693 Status s;
2694
2695   // First, try to check with the full filter
2696 FilterBlockReader* const filter = rep_->filter.get();
2697 bool filter_checked = true;
2698 if (filter != nullptr) {
2699 if (!filter->IsBlockBased()) {
2700 const Slice* const const_ikey_ptr = &internal_key;
2701 may_match = filter->RangeMayExist(
2702 read_options.iterate_upper_bound, user_key, prefix_extractor,
2703 rep_->internal_comparator.user_comparator(), const_ikey_ptr,
2704 &filter_checked, need_upper_bound_check, lookup_context);
2705 } else {
2706 // if prefix_extractor changed for block based filter, skip filter
2707 if (need_upper_bound_check) {
2708 return true;
2709 }
2710 auto prefix = prefix_extractor->Transform(user_key);
2711 InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
2712 auto internal_prefix = internal_key_prefix.Encode();
2713
2714 // To prevent any io operation in this method, we set `read_tier` to make
2715 // sure we always read index or filter only when they have already been
2716 // loaded to memory.
2717 ReadOptions no_io_read_options;
2718 no_io_read_options.read_tier = kBlockCacheTier;
2719
2720       // Then, try to find it within each block
2721 // we already know prefix_extractor and prefix_extractor_name must match
2722 // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
2723 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
2724 no_io_read_options,
2725 /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
2726 /*get_context=*/nullptr, lookup_context));
2727 iiter->Seek(internal_prefix);
2728
2729 if (!iiter->Valid()) {
2730 // we're past end of file
2731 // if it's incomplete, it means that we avoided I/O
2732 // and we're not really sure that we're past the end
2733 // of the file
2734 may_match = iiter->status().IsIncomplete();
2735 } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key())
2736 : iiter->key())
2737 .starts_with(ExtractUserKey(internal_prefix))) {
2738 // we need to check for this subtle case because our only
2739 // guarantee is that "the key is a string >= last key in that data
2740 // block" according to the doc/table_format.txt spec.
2741 //
2742 // Suppose iiter->key() starts with the desired prefix; it is not
2743 // necessarily the case that the corresponding data block will
2744 // contain the prefix, since iiter->key() need not be in the
2745 // block. However, the next data block may contain the prefix, so
2746 // we return true to play it safe.
2747 may_match = true;
2748 } else if (filter->IsBlockBased()) {
2749 // iiter->key() does NOT start with the desired prefix. Because
2750 // Seek() finds the first key that is >= the seek target, this
2751 // means that iiter->key() > prefix. Thus, any data blocks coming
2752 // after the data block corresponding to iiter->key() cannot
2753 // possibly contain the key. Thus, the corresponding data block
2754         // is the only one that could potentially contain the prefix.
2755 BlockHandle handle = iiter->value().handle;
2756 may_match = filter->PrefixMayMatch(
2757 prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
2758 /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
2759 }
2760 }
2761 }
2762
2763 if (filter_checked) {
2764 Statistics* statistics = rep_->ioptions.statistics;
2765 RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
2766 if (!may_match) {
2767 RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
2768 }
2769 }
2770
2771 return may_match;
2772 }
2773
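// An illustrative sketch (hypothetical names, not part of the RocksDB API) of
// the domain rule applied in PrefixMayMatch() above: keys outside the prefix
// extractor's domain never had a prefix added to the filter, so the only safe
// answer for them is "may match"; returning false could cause a false negative.
struct ToyPrefixExtractorExample {
  size_t prefix_len;
  bool InDomain(const std::string& user_key) const {
    return user_key.size() >= prefix_len;
  }
  std::string Transform(const std::string& user_key) const {
    // REQUIRES: InDomain(user_key)
    return user_key.substr(0, prefix_len);
  }
};
inline bool ToyPrefixMayMatchExample(
    const ToyPrefixExtractorExample& extractor, const std::string& user_key,
    bool (*filter_prefix_may_match)(const std::string&)) {
  if (!extractor.InDomain(user_key)) {
    return true;  // cannot consult the filter without a well-defined prefix
  }
  return filter_prefix_may_match(extractor.Transform(user_key));
}
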
2774 template <class TBlockIter, typename TValue>
2775 void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
2776 SeekImpl(&target);
2777 }
2778
2779 template <class TBlockIter, typename TValue>
2780 void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
2781 SeekImpl(nullptr);
2782 }
2783
2784 template <class TBlockIter, typename TValue>
2785 void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
2786 const Slice* target) {
2787 is_out_of_bound_ = false;
2788 is_at_first_key_from_index_ = false;
2789 if (target && !CheckPrefixMayMatch(*target)) {
2790 ResetDataIter();
2791 return;
2792 }
2793
2794 bool need_seek_index = true;
2795 if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
2796 // Reseek.
2797 prev_block_offset_ = index_iter_->value().handle.offset();
2798
2799 if (target) {
2800 // We can avoid an index seek if:
2801 // 1. The new seek key is larger than the current key
2802 // 2. The new seek key is within the upper bound of the block
2803 // Since we don't necessarily know the internal key for either
2804 // the current key or the upper bound, we check user keys and
2805 // exclude the equality case. Considering internal keys can
2806       // improve the boundary cases, but it would complicate the
2807 // code.
2808 if (user_comparator_.Compare(ExtractUserKey(*target),
2809 block_iter_.user_key()) > 0 &&
2810 user_comparator_.Compare(ExtractUserKey(*target),
2811 index_iter_->user_key()) < 0) {
2812 need_seek_index = false;
2813 }
2814 }
2815 }
2816
2817 if (need_seek_index) {
2818 if (target) {
2819 index_iter_->Seek(*target);
2820 } else {
2821 index_iter_->SeekToFirst();
2822 }
2823
2824 if (!index_iter_->Valid()) {
2825 ResetDataIter();
2826 return;
2827 }
2828 }
2829
2830 IndexValue v = index_iter_->value();
2831 const bool same_block = block_iter_points_to_real_block_ &&
2832 v.handle.offset() == prev_block_offset_;
2833
2834 // TODO(kolmike): Remove the != kBlockCacheTier condition.
2835 if (!v.first_internal_key.empty() && !same_block &&
2836 (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
2837 read_options_.read_tier != kBlockCacheTier) {
2838 // Index contains the first key of the block, and it's >= target.
2839 // We can defer reading the block.
2840 is_at_first_key_from_index_ = true;
2841 // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
2842 // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
2843 // as that will be done later when the data block is actually read.
2844 ResetDataIter();
2845 } else {
2846 // Need to use the data block.
2847 if (!same_block) {
2848 InitDataBlock();
2849 } else {
2850 // When the user does a reseek, the iterate_upper_bound might have
2851 // changed. CheckDataBlockWithinUpperBound() needs to be called
2852 // explicitly if the reseek ends up in the same data block.
2853 // If the reseek ends up in a different block, InitDataBlock() will do
2854 // the iterator upper bound check.
2855 CheckDataBlockWithinUpperBound();
2856 }
2857
2858 if (target) {
2859 block_iter_.Seek(*target);
2860 } else {
2861 block_iter_.SeekToFirst();
2862 }
2863 FindKeyForward();
2864 }
2865
2866 CheckOutOfBound();
2867
2868 if (target) {
2869 assert(!Valid() || ((block_type_ == BlockType::kIndex &&
2870 !table_->get_rep()->index_key_includes_seq)
2871 ? (user_comparator_.Compare(ExtractUserKey(*target),
2872 key()) <= 0)
2873 : (icomp_.Compare(*target, key()) <= 0)));
2874 }
2875 }
2876
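// A minimal sketch of the reseek optimization in SeekImpl() above, with plain
// strings standing in for user keys (hypothetical helper). The index seek can
// be skipped only when the new target falls strictly between the current
// position and the current block's boundary key from the index; equality is
// excluded because only user keys, not full internal keys, are compared.
inline bool CanSkipIndexSeekExample(const std::string& target,
                                    const std::string& current_key,
                                    const std::string& block_boundary_key) {
  return target.compare(current_key) > 0 &&
         target.compare(block_boundary_key) < 0;
}
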
2877 template <class TBlockIter, typename TValue>
2878 void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
2879 const Slice& target) {
2880 is_out_of_bound_ = false;
2881 is_at_first_key_from_index_ = false;
2882 if (!CheckPrefixMayMatch(target)) {
2883 ResetDataIter();
2884 return;
2885 }
2886
2887 SavePrevIndexValue();
2888
2889 // Call Seek() rather than SeekForPrev() in the index block, because the
2890   // target data block will likely contain the position for `target`, the
2891   // same as Seek(), rather than the one before it.
2892 // For example, if we have three data blocks, each containing two keys:
2893 // [2, 4] [6, 8] [10, 12]
2894 // (the keys in the index block would be [4, 8, 12])
2895 // and the user calls SeekForPrev(7), we need to go to the second block,
2896   // just as if they had called Seek(7).
2897   // The only case where the block is different is when they seek to a position
2898   // on the boundary. For example, if they SeekForPrev(5), we should go to the
2899   // first block, rather than the second. However, we don't have the information
2900   // to distinguish the two unless we read the second block. In this case, we'll
2901   // end up reading two blocks.
2902 index_iter_->Seek(target);
2903
2904 if (!index_iter_->Valid()) {
2905 if (!index_iter_->status().ok()) {
2906 ResetDataIter();
2907 return;
2908 }
2909
2910 index_iter_->SeekToLast();
2911 if (!index_iter_->Valid()) {
2912 ResetDataIter();
2913 return;
2914 }
2915 }
2916
2917 InitDataBlock();
2918
2919 block_iter_.SeekForPrev(target);
2920
2921 FindKeyBackward();
2922 CheckDataBlockWithinUpperBound();
2923 assert(!block_iter_.Valid() ||
2924 icomp_.Compare(target, block_iter_.key()) >= 0);
2925 }
2926
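// A self-contained sketch of the index lookup strategy described above, using
// ints for keys (hypothetical helper). The index stores each block's last key,
// e.g. {4, 8, 12} for blocks [2,4] [6,8] [10,12], and Seek(target) corresponds
// to a lower_bound over those separators.
inline size_t IndexSeekBlockExample(const std::vector<int>& block_last_keys,
                                    int target) {
  // First block whose last key is >= target; may be one past the end.
  return static_cast<size_t>(
      std::lower_bound(block_last_keys.begin(), block_last_keys.end(),
                       target) -
      block_last_keys.begin());
}
// IndexSeekBlockExample({4, 8, 12}, 7) == 1: SeekForPrev(7) reads block [6,8],
// the same block Seek(7) would read. IndexSeekBlockExample({4, 8, 12}, 5) is
// also 1, so SeekForPrev(5) reads block [6,8] first and then steps back into
// [2,4] -- the boundary case above that costs an extra block read.
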
2927 template <class TBlockIter, typename TValue>
2928 void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
2929 is_out_of_bound_ = false;
2930 is_at_first_key_from_index_ = false;
2931 SavePrevIndexValue();
2932 index_iter_->SeekToLast();
2933 if (!index_iter_->Valid()) {
2934 ResetDataIter();
2935 return;
2936 }
2937 InitDataBlock();
2938 block_iter_.SeekToLast();
2939 FindKeyBackward();
2940 CheckDataBlockWithinUpperBound();
2941 }
2942
2943 template <class TBlockIter, typename TValue>
2944 void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
2945 if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
2946 return;
2947 }
2948 assert(block_iter_points_to_real_block_);
2949 block_iter_.Next();
2950 FindKeyForward();
2951 CheckOutOfBound();
2952 }
2953
2954 template <class TBlockIter, typename TValue>
2955 bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
2956 IterateResult* result) {
2957 Next();
2958 bool is_valid = Valid();
2959 if (is_valid) {
2960 result->key = key();
2961 result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
2962 }
2963 return is_valid;
2964 }
2965
2966 template <class TBlockIter, typename TValue>
2967 void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
2968 if (is_at_first_key_from_index_) {
2969 is_at_first_key_from_index_ = false;
2970
2971 index_iter_->Prev();
2972 if (!index_iter_->Valid()) {
2973 return;
2974 }
2975
2976 InitDataBlock();
2977 block_iter_.SeekToLast();
2978 } else {
2979 assert(block_iter_points_to_real_block_);
2980 block_iter_.Prev();
2981 }
2982
2983 FindKeyBackward();
2984 }
2985
2986 template <class TBlockIter, typename TValue>
2987 void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
2988 BlockHandle data_block_handle = index_iter_->value().handle;
2989 if (!block_iter_points_to_real_block_ ||
2990 data_block_handle.offset() != prev_block_offset_ ||
2991 // if previous attempt of reading the block missed cache, try again
2992 block_iter_.status().IsIncomplete()) {
2993 if (block_iter_points_to_real_block_) {
2994 ResetDataIter();
2995 }
2996 auto* rep = table_->get_rep();
2997
2998 // Prefetch additional data for range scans (iterators). Enabled only for
2999 // user reads.
3000 // Implicit auto readahead:
3001 // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
3002 // Explicit user requested readahead:
3003 // Enabled from the very first IO when ReadOptions.readahead_size is set.
3004 if (lookup_context_.caller != TableReaderCaller::kCompaction) {
3005 if (read_options_.readahead_size == 0) {
3006 // Implicit auto readahead
3007 num_file_reads_++;
3008 if (num_file_reads_ >
3009 BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
3010 if (!rep->file->use_direct_io() &&
3011 (data_block_handle.offset() +
3012 static_cast<size_t>(block_size(data_block_handle)) >
3013 readahead_limit_)) {
3014 // Buffered I/O
3015 // Discarding the return status of Prefetch calls intentionally, as
3016           // we can fall back to reading from disk if Prefetch fails.
3017 rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
3018 readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
3019 readahead_size_);
3020 // Keep exponentially increasing readahead size until
3021 // kMaxAutoReadaheadSize.
3022 readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
3023 readahead_size_ * 2);
3024 } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
3025 // Direct I/O
3026 // Let FilePrefetchBuffer take care of the readahead.
3027 rep->CreateFilePrefetchBuffer(
3028 BlockBasedTable::kInitAutoReadaheadSize,
3029 BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
3030 }
3031 }
3032 } else if (!prefetch_buffer_) {
3033 // Explicit user requested readahead
3034 // The actual condition is:
3035 // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
3036 rep->CreateFilePrefetchBuffer(read_options_.readahead_size,
3037 read_options_.readahead_size,
3038 &prefetch_buffer_);
3039 }
3040 } else if (!prefetch_buffer_) {
3041 rep->CreateFilePrefetchBuffer(compaction_readahead_size_,
3042 compaction_readahead_size_,
3043 &prefetch_buffer_);
3044 }
3045
3046 Status s;
3047 table_->NewDataBlockIterator<TBlockIter>(
3048 read_options_, data_block_handle, &block_iter_, block_type_,
3049 /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(),
3050 /*for_compaction=*/lookup_context_.caller ==
3051 TableReaderCaller::kCompaction);
3052 block_iter_points_to_real_block_ = true;
3053 CheckDataBlockWithinUpperBound();
3054 }
3055 }
3056
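// A minimal sketch of the implicit readahead schedule implemented above: once
// more than kMinNumFileReadsToStartAutoReadahead sequential reads have been
// seen, the window doubles per prefetch until it reaches kMaxAutoReadaheadSize
// (256 KB, per the constant near the top of this file). The helper name is
// hypothetical; the constants are the real ones used above.
inline size_t NextAutoReadaheadSizeExample(size_t current_readahead_size) {
  return std::min(BlockBasedTable::kMaxAutoReadaheadSize,
                  current_readahead_size * 2);
}
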
3057 template <class TBlockIter, typename TValue>
3058 bool BlockBasedTableIterator<TBlockIter, TValue>::MaterializeCurrentBlock() {
3059 assert(is_at_first_key_from_index_);
3060 assert(!block_iter_points_to_real_block_);
3061 assert(index_iter_->Valid());
3062
3063 is_at_first_key_from_index_ = false;
3064 InitDataBlock();
3065 assert(block_iter_points_to_real_block_);
3066 block_iter_.SeekToFirst();
3067
3068 if (!block_iter_.Valid() ||
3069 icomp_.Compare(block_iter_.key(),
3070 index_iter_->value().first_internal_key) != 0) {
3071 // Uh oh.
3072 block_iter_.Invalidate(Status::Corruption(
3073 "first key in index doesn't match first key in block"));
3074 return false;
3075 }
3076
3077 return true;
3078 }
3079
3080 template <class TBlockIter, typename TValue>
3081 void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
3082 // This method's code is kept short to make it likely to be inlined.
3083
3084 assert(!is_out_of_bound_);
3085 assert(block_iter_points_to_real_block_);
3086
3087 if (!block_iter_.Valid()) {
3088 // This is the only call site of FindBlockForward(), but it's extracted into
3089 // a separate method to keep FindKeyForward() short and likely to be
3090 // inlined. When transitioning to a different block, we call
3091 // FindBlockForward(), which is much longer and is probably not inlined.
3092 FindBlockForward();
3093 } else {
3094 // This is the fast path that avoids a function call.
3095 }
3096 }
3097
3098 template <class TBlockIter, typename TValue>
3099 void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
3100   // TODO: the while loop is inherited from two_level_iterator. We don't know
3101   // whether a block can be empty, so we don't know if an "if" could replace it.
3102 do {
3103 if (!block_iter_.status().ok()) {
3104 return;
3105 }
3106     // Whether the next data block is out of the upper bound, if there is one.
3107 const bool next_block_is_out_of_bound =
3108 read_options_.iterate_upper_bound != nullptr &&
3109 block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
3110 assert(!next_block_is_out_of_bound ||
3111 user_comparator_.Compare(*read_options_.iterate_upper_bound,
3112 index_iter_->user_key()) <= 0);
3113 ResetDataIter();
3114 index_iter_->Next();
3115 if (next_block_is_out_of_bound) {
3116 // The next block is out of bound. No need to read it.
3117 TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
3118 // We need to make sure this is not the last data block before setting
3119 // is_out_of_bound_, since the index key for the last data block can be
3120 // larger than smallest key of the next file on the same level.
3121 if (index_iter_->Valid()) {
3122 is_out_of_bound_ = true;
3123 }
3124 return;
3125 }
3126
3127 if (!index_iter_->Valid()) {
3128 return;
3129 }
3130
3131 IndexValue v = index_iter_->value();
3132
3133 // TODO(kolmike): Remove the != kBlockCacheTier condition.
3134 if (!v.first_internal_key.empty() &&
3135 read_options_.read_tier != kBlockCacheTier) {
3136 // Index contains the first key of the block. Defer reading the block.
3137 is_at_first_key_from_index_ = true;
3138 return;
3139 }
3140
3141 InitDataBlock();
3142 block_iter_.SeekToFirst();
3143 } while (!block_iter_.Valid());
3144 }
3145
3146 template <class TBlockIter, typename TValue>
3147 void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
3148 while (!block_iter_.Valid()) {
3149 if (!block_iter_.status().ok()) {
3150 return;
3151 }
3152
3153 ResetDataIter();
3154 index_iter_->Prev();
3155
3156 if (index_iter_->Valid()) {
3157 InitDataBlock();
3158 block_iter_.SeekToLast();
3159 } else {
3160 return;
3161 }
3162 }
3163
3164   // We could have checked the lower bound here too, but we opt not to for
3165 // code simplicity.
3166 }
3167
3168 template <class TBlockIter, typename TValue>
3169 void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
3170 if (read_options_.iterate_upper_bound != nullptr && Valid()) {
3171 is_out_of_bound_ = user_comparator_.Compare(
3172 *read_options_.iterate_upper_bound, user_key()) <= 0;
3173 }
3174 }
3175
3176 template <class TBlockIter, typename TValue>
3177 void BlockBasedTableIterator<TBlockIter,
3178                              TValue>::CheckDataBlockWithinUpperBound() {
3179 if (read_options_.iterate_upper_bound != nullptr &&
3180 block_iter_points_to_real_block_) {
3181 data_block_within_upper_bound_ =
3182 (user_comparator_.Compare(*read_options_.iterate_upper_bound,
3183 index_iter_->user_key()) > 0);
3184 }
3185 }
3186
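// A compact sketch of the two bound checks above (hypothetical helpers).
// CheckOutOfBound() tests the current key: the iterator is out of bound once
// upper_bound <= user_key. CheckDataBlockWithinUpperBound() tests the whole
// block: it is safely within bound only if upper_bound is strictly greater
// than the index entry's user key, which bounds every key in that block.
inline bool IsOutOfBoundExample(const std::string& upper_bound,
                                const std::string& user_key) {
  return upper_bound.compare(user_key) <= 0;
}
inline bool BlockWithinUpperBoundExample(const std::string& upper_bound,
                                         const std::string& index_user_key) {
  return upper_bound.compare(index_user_key) > 0;
}
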
3187 InternalIterator* BlockBasedTable::NewIterator(
3188 const ReadOptions& read_options, const SliceTransform* prefix_extractor,
3189 Arena* arena, bool skip_filters, TableReaderCaller caller,
3190 size_t compaction_readahead_size) {
3191 BlockCacheLookupContext lookup_context{caller};
3192 bool need_upper_bound_check =
3193 PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
3194 if (arena == nullptr) {
3195 return new BlockBasedTableIterator<DataBlockIter>(
3196 this, read_options, rep_->internal_comparator,
3197 NewIndexIterator(
3198 read_options,
3199 need_upper_bound_check &&
3200 rep_->index_type == BlockBasedTableOptions::kHashSearch,
3201 /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
3202 !skip_filters && !read_options.total_order_seek &&
3203 prefix_extractor != nullptr,
3204 need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
3205 compaction_readahead_size);
3206 } else {
3207 auto* mem =
3208 arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
3209 return new (mem) BlockBasedTableIterator<DataBlockIter>(
3210 this, read_options, rep_->internal_comparator,
3211 NewIndexIterator(
3212 read_options,
3213 need_upper_bound_check &&
3214 rep_->index_type == BlockBasedTableOptions::kHashSearch,
3215 /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
3216 !skip_filters && !read_options.total_order_seek &&
3217 prefix_extractor != nullptr,
3218 need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
3219 compaction_readahead_size);
3220 }
3221 }
3222
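// A minimal, self-contained sketch (ToyArenaExample is hypothetical; RocksDB's
// Arena bump-allocates instead) of the placement-new pattern NewIterator()
// uses above when an arena is supplied. Objects built this way must never be
// passed to delete; the caller invokes the destructor explicitly and the
// arena reclaims the memory.
struct ToyArenaExample {
  std::vector<void*> chunks;
  ~ToyArenaExample() {
    for (void* p : chunks) {
      ::operator delete(p);
    }
  }
  void* AllocateAligned(size_t bytes) {
    void* mem = ::operator new(bytes);  // aligned for any ordinary type
    chunks.push_back(mem);
    return mem;
  }
};
template <typename T, typename... Args>
T* NewInArenaExample(ToyArenaExample* arena, Args&&... args) {
  void* mem = arena->AllocateAligned(sizeof(T));
  return new (mem) T(std::forward<Args>(args)...);  // placement new, no free()
}
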
3223 FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
3224 const ReadOptions& read_options) {
3225 if (rep_->fragmented_range_dels == nullptr) {
3226 return nullptr;
3227 }
3228 SequenceNumber snapshot = kMaxSequenceNumber;
3229 if (read_options.snapshot != nullptr) {
3230 snapshot = read_options.snapshot->GetSequenceNumber();
3231 }
3232 return new FragmentedRangeTombstoneIterator(
3233 rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
3234 }
3235
3236 bool BlockBasedTable::FullFilterKeyMayMatch(
3237 const ReadOptions& read_options, FilterBlockReader* filter,
3238 const Slice& internal_key, const bool no_io,
3239 const SliceTransform* prefix_extractor, GetContext* get_context,
3240 BlockCacheLookupContext* lookup_context) const {
3241 if (filter == nullptr || filter->IsBlockBased()) {
3242 return true;
3243 }
3244 Slice user_key = ExtractUserKey(internal_key);
3245 const Slice* const const_ikey_ptr = &internal_key;
3246 bool may_match = true;
3247 if (rep_->whole_key_filtering) {
3248 size_t ts_sz =
3249 rep_->internal_comparator.user_comparator()->timestamp_size();
3250 Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
3251 may_match =
3252 filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
3253 no_io, const_ikey_ptr, get_context, lookup_context);
3254 } else if (!read_options.total_order_seek && prefix_extractor &&
3255 rep_->table_properties->prefix_extractor_name.compare(
3256 prefix_extractor->Name()) == 0 &&
3257 prefix_extractor->InDomain(user_key) &&
3258 !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
3259 prefix_extractor, kNotValid, no_io,
3260 const_ikey_ptr, get_context,
3261 lookup_context)) {
3262 may_match = false;
3263 }
3264 if (may_match) {
3265 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
3266 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
3267 }
3268 return may_match;
3269 }
3270
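// A minimal sketch (hypothetical helper) of the timestamp stripping performed
// above before probing the filter: whole-key filters are built over user keys
// without their trailing ts_sz timestamp bytes, so lookups must strip the
// timestamp as well or every probe would miss.
inline std::string StripTimestampExample(const std::string& user_key,
                                         size_t ts_sz) {
  // REQUIRES: user_key.size() >= ts_sz, as with the valid keys above.
  return user_key.substr(0, user_key.size() - ts_sz);
}
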
3271 void BlockBasedTable::FullFilterKeysMayMatch(
3272 const ReadOptions& read_options, FilterBlockReader* filter,
3273 MultiGetRange* range, const bool no_io,
3274 const SliceTransform* prefix_extractor,
3275 BlockCacheLookupContext* lookup_context) const {
3276 if (filter == nullptr || filter->IsBlockBased()) {
3277 return;
3278 }
3279 if (rep_->whole_key_filtering) {
3280 filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
3281 lookup_context);
3282 } else if (!read_options.total_order_seek && prefix_extractor &&
3283 rep_->table_properties->prefix_extractor_name.compare(
3284 prefix_extractor->Name()) == 0) {
3285 filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
3286 lookup_context);
3287 }
3288 }
3289
3290 Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
3291 GetContext* get_context,
3292 const SliceTransform* prefix_extractor,
3293 bool skip_filters) {
3294 assert(key.size() >= 8); // key must be internal key
3295 assert(get_context != nullptr);
3296 Status s;
3297 const bool no_io = read_options.read_tier == kBlockCacheTier;
3298
3299 FilterBlockReader* const filter =
3300 !skip_filters ? rep_->filter.get() : nullptr;
3301
3302 // First check the full filter
3303   // If the full filter is not useful, then go into each block
3304 uint64_t tracing_get_id = get_context->get_tracing_get_id();
3305 BlockCacheLookupContext lookup_context{
3306 TableReaderCaller::kUserGet, tracing_get_id,
3307 /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
3308 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3309 // Trace the key since it contains both user key and sequence number.
3310 lookup_context.referenced_key = key.ToString();
3311 lookup_context.get_from_user_specified_snapshot =
3312 read_options.snapshot != nullptr;
3313 }
3314 const bool may_match =
3315 FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor,
3316 get_context, &lookup_context);
3317 if (!may_match) {
3318 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
3319 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
3320 } else {
3321 IndexBlockIter iiter_on_stack;
3322 // if prefix_extractor found in block differs from options, disable
3323 // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
3324 bool need_upper_bound_check = false;
3325 if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
3326 need_upper_bound_check = PrefixExtractorChanged(
3327 rep_->table_properties.get(), prefix_extractor);
3328 }
3329 auto iiter =
3330 NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
3331 get_context, &lookup_context);
3332 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3333 if (iiter != &iiter_on_stack) {
3334 iiter_unique_ptr.reset(iiter);
3335 }
3336
3337 size_t ts_sz =
3338 rep_->internal_comparator.user_comparator()->timestamp_size();
3339     bool matched = false;  // if such user key matched a key in SST
3340 bool done = false;
3341 for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
3342 IndexValue v = iiter->value();
3343
3344 bool not_exist_in_filter =
3345         filter != nullptr && filter->IsBlockBased() &&
3346 !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
3347 prefix_extractor, v.handle.offset(), no_io,
3348 /*const_ikey_ptr=*/nullptr, get_context,
3349 &lookup_context);
3350
3351 if (not_exist_in_filter) {
3352 // Not found
3353 // TODO: think about interaction with Merge. If a user key cannot
3354 // cross one data block, we should be fine.
3355 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
3356 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
3357 break;
3358 }
3359
3360 if (!v.first_internal_key.empty() && !skip_filters &&
3361 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3362 .Compare(ExtractUserKey(key),
3363 ExtractUserKey(v.first_internal_key)) < 0) {
3364 // The requested key falls between highest key in previous block and
3365 // lowest key in current block.
3366 break;
3367 }
3368
3369 BlockCacheLookupContext lookup_data_block_context{
3370 TableReaderCaller::kUserGet, tracing_get_id,
3371 /*get_from_user_specified_snapshot=*/read_options.snapshot !=
3372 nullptr};
3373 bool does_referenced_key_exist = false;
3374 DataBlockIter biter;
3375 uint64_t referenced_data_size = 0;
3376 NewDataBlockIterator<DataBlockIter>(
3377 read_options, v.handle, &biter, BlockType::kData, get_context,
3378 &lookup_data_block_context,
3379 /*s=*/Status(), /*prefetch_buffer*/ nullptr);
3380
3381 if (no_io && biter.status().IsIncomplete()) {
3382 // couldn't get block from block_cache
3383 // Update Saver.state to Found because we are only looking for
3384 // whether we can guarantee the key is not there when "no_io" is set
3385 get_context->MarkKeyMayExist();
3386 break;
3387 }
3388 if (!biter.status().ok()) {
3389 s = biter.status();
3390 break;
3391 }
3392
3393 bool may_exist = biter.SeekForGet(key);
3394 // If user-specified timestamp is supported, we cannot end the search
3395 // just because hash index lookup indicates the key+ts does not exist.
3396 if (!may_exist && ts_sz == 0) {
3397         // HashSeek cannot find the key in this block and the iter is not at
3398         // the end of the block, i.e. the key cannot be in the following blocks
3399 // either. In this case, the seek_key cannot be found, so we break
3400 // from the top level for-loop.
3401 done = true;
3402 } else {
3403 // Call the *saver function on each entry/block until it returns false
3404 for (; biter.Valid(); biter.Next()) {
3405 ParsedInternalKey parsed_key;
3406 if (!ParseInternalKey(biter.key(), &parsed_key)) {
3407 s = Status::Corruption(Slice());
3408 }
3409
3410 if (!get_context->SaveValue(
3411 parsed_key, biter.value(), &matched,
3412 biter.IsValuePinned() ? &biter : nullptr)) {
3413 if (get_context->State() == GetContext::GetState::kFound) {
3414 does_referenced_key_exist = true;
3415 referenced_data_size = biter.key().size() + biter.value().size();
3416 }
3417 done = true;
3418 break;
3419 }
3420 }
3421 s = biter.status();
3422 }
3423 // Write the block cache access record.
3424 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3425         // Avoid making copies of block_key, cf_name, and referenced_key when
3426 // constructing the access record.
3427 Slice referenced_key;
3428 if (does_referenced_key_exist) {
3429 referenced_key = biter.key();
3430 } else {
3431 referenced_key = key;
3432 }
3433 BlockCacheTraceRecord access_record(
3434 rep_->ioptions.env->NowMicros(),
3435 /*block_key=*/"", lookup_data_block_context.block_type,
3436 lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
3437 /*cf_name=*/"", rep_->level_for_tracing(),
3438 rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
3439 lookup_data_block_context.is_cache_hit,
3440 lookup_data_block_context.no_insert,
3441 lookup_data_block_context.get_id,
3442 lookup_data_block_context.get_from_user_specified_snapshot,
3443 /*referenced_key=*/"", referenced_data_size,
3444 lookup_data_block_context.num_keys_in_block,
3445 does_referenced_key_exist);
3446 block_cache_tracer_->WriteBlockAccess(
3447 access_record, lookup_data_block_context.block_key,
3448 rep_->cf_name_for_tracing(), referenced_key);
3449 }
3450
3451 if (done) {
3452 // Avoid the extra Next which is expensive in two-level indexes
3453 break;
3454 }
3455 }
3456 if (matched && filter != nullptr && !filter->IsBlockBased()) {
3457 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
3458 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
3459 rep_->level);
3460 }
3461 if (s.ok()) {
3462 s = iiter->status();
3463 }
3464 }
3465
3466 return s;
3467 }
3468
3469 using MultiGetRange = MultiGetContext::Range;
3470 void BlockBasedTable::MultiGet(const ReadOptions& read_options,
3471 const MultiGetRange* mget_range,
3472 const SliceTransform* prefix_extractor,
3473 bool skip_filters) {
3474 FilterBlockReader* const filter =
3475 !skip_filters ? rep_->filter.get() : nullptr;
3476 MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
3477 mget_range->end());
3478
3479 // First check the full filter
3480   // If the full filter is not useful, then go into each block
3481 const bool no_io = read_options.read_tier == kBlockCacheTier;
3482 uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
3483 if (!sst_file_range.empty() && sst_file_range.begin()->get_context) {
3484 tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
3485 }
3486 BlockCacheLookupContext lookup_context{
3487 TableReaderCaller::kUserMultiGet, tracing_mget_id,
3488 /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
3489 FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
3490 prefix_extractor, &lookup_context);
3491
3492 if (skip_filters || !sst_file_range.empty()) {
3493 IndexBlockIter iiter_on_stack;
3494 // if prefix_extractor found in block differs from options, disable
3495 // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
3496 bool need_upper_bound_check = false;
3497 if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
3498 need_upper_bound_check = PrefixExtractorChanged(
3499 rep_->table_properties.get(), prefix_extractor);
3500 }
3501 auto iiter =
3502 NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
3503 sst_file_range.begin()->get_context, &lookup_context);
3504 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3505 if (iiter != &iiter_on_stack) {
3506 iiter_unique_ptr.reset(iiter);
3507 }
3508
3509 uint64_t offset = std::numeric_limits<uint64_t>::max();
3510 autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
3511 autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results;
3512 autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
3513 char stack_buf[kMultiGetReadStackBufSize];
3514 std::unique_ptr<char[]> block_buf;
3515 {
3516 MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
3517 sst_file_range.end());
3518
3519 CachableEntry<UncompressionDict> uncompression_dict;
3520 Status uncompression_dict_status;
3521 if (rep_->uncompression_dict_reader) {
3522 uncompression_dict_status =
3523 rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
3524 nullptr /* prefetch_buffer */, no_io,
3525 sst_file_range.begin()->get_context, &lookup_context,
3526 &uncompression_dict);
3527 }
3528
3529 const UncompressionDict& dict = uncompression_dict.GetValue()
3530 ? *uncompression_dict.GetValue()
3531 : UncompressionDict::GetEmptyDict();
3532
3533 size_t total_len = 0;
3534 ReadOptions ro = read_options;
3535 ro.read_tier = kBlockCacheTier;
3536
3537 for (auto miter = data_block_range.begin();
3538 miter != data_block_range.end(); ++miter) {
3539 const Slice& key = miter->ikey;
3540 iiter->Seek(miter->ikey);
3541
3542 IndexValue v;
3543 if (iiter->Valid()) {
3544 v = iiter->value();
3545 }
3546 if (!iiter->Valid() ||
3547 (!v.first_internal_key.empty() && !skip_filters &&
3548 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3549 .Compare(ExtractUserKey(key),
3550 ExtractUserKey(v.first_internal_key)) < 0)) {
3551 // The requested key falls between highest key in previous block and
3552 // lowest key in current block.
3553 *(miter->s) = iiter->status();
3554 data_block_range.SkipKey(miter);
3555 sst_file_range.SkipKey(miter);
3556 continue;
3557 }
3558
3559 if (!uncompression_dict_status.ok()) {
3560 *(miter->s) = uncompression_dict_status;
3561 data_block_range.SkipKey(miter);
3562 sst_file_range.SkipKey(miter);
3563 continue;
3564 }
3565
3566 statuses.emplace_back();
3567 results.emplace_back();
3568 if (v.handle.offset() == offset) {
3569 // We're going to reuse the block for this key later on. No need to
3570           // look it up now. Place a null handle.
3571 block_handles.emplace_back(BlockHandle::NullBlockHandle());
3572 continue;
3573 }
3574 // Lookup the cache for the given data block referenced by an index
3575         // iterator value (i.e. BlockHandle). If it exists in the cache,
3576 // initialize block to the contents of the data block.
3577 offset = v.handle.offset();
3578 BlockHandle handle = v.handle;
3579 BlockCacheLookupContext lookup_data_block_context(
3580 TableReaderCaller::kUserMultiGet);
3581 Status s = RetrieveBlock(
3582 nullptr, ro, handle, dict, &(results.back()), BlockType::kData,
3583 miter->get_context, &lookup_data_block_context,
3584 /* for_compaction */ false, /* use_cache */ true);
3585 if (s.IsIncomplete()) {
3586 s = Status::OK();
3587 }
3588 if (s.ok() && !results.back().IsEmpty()) {
3589 // Found it in the cache. Add NULL handle to indicate there is
3590 // nothing to read from disk
3591 block_handles.emplace_back(BlockHandle::NullBlockHandle());
3592 } else {
3593 block_handles.emplace_back(handle);
3594 total_len += block_size(handle);
3595 }
3596 }
3597
3598 if (total_len) {
3599 char* scratch = nullptr;
3600 // If the blocks need to be uncompressed and we don't need the
3601 // compressed blocks, then we can use a contiguous block of
3602 // memory to read in all the blocks as it will be temporary
3603 // storage
3604 // 1. If blocks are compressed and compressed block cache is there,
3605 // alloc heap bufs
3606 // 2. If blocks are uncompressed, alloc heap bufs
3607 // 3. If blocks are compressed and no compressed block cache, use
3608 // stack buf
3609 if (rep_->table_options.block_cache_compressed == nullptr &&
3610 rep_->blocks_maybe_compressed) {
3611 if (total_len <= kMultiGetReadStackBufSize) {
3612 scratch = stack_buf;
3613 } else {
3614 scratch = new char[total_len];
3615 block_buf.reset(scratch);
3616 }
3617 }
3618 RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
3619 &statuses, &results, scratch, dict);
3620 }
3621 }
3622
3623 DataBlockIter first_biter;
3624 DataBlockIter next_biter;
3625 size_t idx_in_batch = 0;
3626 for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
3627 ++miter) {
3628 Status s;
3629 GetContext* get_context = miter->get_context;
3630 const Slice& key = miter->ikey;
3631 bool matched = false; // if such user key matched a key in SST
3632 bool done = false;
3633 bool first_block = true;
3634 do {
3635 DataBlockIter* biter = nullptr;
3636 bool reusing_block = true;
3637 uint64_t referenced_data_size = 0;
3638 bool does_referenced_key_exist = false;
3639 BlockCacheLookupContext lookup_data_block_context(
3640 TableReaderCaller::kUserMultiGet, tracing_mget_id,
3641 /*get_from_user_specified_snapshot=*/read_options.snapshot !=
3642 nullptr);
3643 if (first_block) {
3644 if (!block_handles[idx_in_batch].IsNull() ||
3645 !results[idx_in_batch].IsEmpty()) {
3646 first_biter.Invalidate(Status::OK());
3647 NewDataBlockIterator<DataBlockIter>(
3648 read_options, results[idx_in_batch], &first_biter,
3649 statuses[idx_in_batch]);
3650 reusing_block = false;
3651 }
3652 biter = &first_biter;
3653 idx_in_batch++;
3654 } else {
3655 IndexValue v = iiter->value();
3656 if (!v.first_internal_key.empty() && !skip_filters &&
3657 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
3658 .Compare(ExtractUserKey(key),
3659 ExtractUserKey(v.first_internal_key)) < 0) {
3660 // The requested key falls between highest key in previous block and
3661 // lowest key in current block.
3662 break;
3663 }
3664
3665 next_biter.Invalidate(Status::OK());
3666 NewDataBlockIterator<DataBlockIter>(
3667 read_options, iiter->value().handle, &next_biter,
3668 BlockType::kData, get_context, &lookup_data_block_context,
3669 Status(), nullptr);
3670 biter = &next_biter;
3671 reusing_block = false;
3672 }
3673
3674 if (read_options.read_tier == kBlockCacheTier &&
3675 biter->status().IsIncomplete()) {
3676 // couldn't get block from block_cache
3677 // Update Saver.state to Found because we are only looking for
3678 // whether we can guarantee the key is not there when "no_io" is set
3679 get_context->MarkKeyMayExist();
3680 break;
3681 }
3682 if (!biter->status().ok()) {
3683 s = biter->status();
3684 break;
3685 }
3686
3687 bool may_exist = biter->SeekForGet(key);
3688 if (!may_exist) {
3689           // HashSeek cannot find the key in this block and the iter is not at
3690           // the end of the block, i.e. the key cannot be in the following blocks
3691 // either. In this case, the seek_key cannot be found, so we break
3692 // from the top level for-loop.
3693 break;
3694 }
3695
3696 // Call the *saver function on each entry/block until it returns false
3697 for (; biter->Valid(); biter->Next()) {
3698 ParsedInternalKey parsed_key;
3699 Cleanable dummy;
3700 Cleanable* value_pinner = nullptr;
3701 if (!ParseInternalKey(biter->key(), &parsed_key)) {
3702 s = Status::Corruption(Slice());
3703 }
3704 if (biter->IsValuePinned()) {
3705 if (reusing_block) {
3706 Cache* block_cache = rep_->table_options.block_cache.get();
3707 assert(biter->cache_handle() != nullptr);
3708 block_cache->Ref(biter->cache_handle());
3709 dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
3710 biter->cache_handle());
3711 value_pinner = &dummy;
3712 } else {
3713 value_pinner = biter;
3714 }
3715 }
3716 if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
3717 value_pinner)) {
3718 if (get_context->State() == GetContext::GetState::kFound) {
3719 does_referenced_key_exist = true;
3720 referenced_data_size =
3721 biter->key().size() + biter->value().size();
3722 }
3723 done = true;
3724 break;
3725 }
3726 s = biter->status();
3727 }
3728 // Write the block cache access.
3729 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
3730           // Avoid making copies of block_key, cf_name, and referenced_key when
3731 // constructing the access record.
3732 Slice referenced_key;
3733 if (does_referenced_key_exist) {
3734 referenced_key = biter->key();
3735 } else {
3736 referenced_key = key;
3737 }
3738 BlockCacheTraceRecord access_record(
3739 rep_->ioptions.env->NowMicros(),
3740 /*block_key=*/"", lookup_data_block_context.block_type,
3741 lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
3742 /*cf_name=*/"", rep_->level_for_tracing(),
3743 rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
3744 lookup_data_block_context.is_cache_hit,
3745 lookup_data_block_context.no_insert,
3746 lookup_data_block_context.get_id,
3747 lookup_data_block_context.get_from_user_specified_snapshot,
3748 /*referenced_key=*/"", referenced_data_size,
3749 lookup_data_block_context.num_keys_in_block,
3750 does_referenced_key_exist);
3751 block_cache_tracer_->WriteBlockAccess(
3752 access_record, lookup_data_block_context.block_key,
3753 rep_->cf_name_for_tracing(), referenced_key);
3754 }
3755 s = biter->status();
3756 if (done) {
3757 // Avoid the extra Next which is expensive in two-level indexes
3758 break;
3759 }
3760 if (first_block) {
3761 iiter->Seek(key);
3762 }
3763 first_block = false;
3764 iiter->Next();
3765 } while (iiter->Valid());
3766
3767 if (matched && filter != nullptr && !filter->IsBlockBased()) {
3768 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
3769 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
3770 rep_->level);
3771 }
3772 if (s.ok()) {
3773 s = iiter->status();
3774 }
3775 *(miter->s) = s;
3776 }
3777 }
3778 }
3779
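// A minimal sketch of the scratch-buffer policy used by MultiGet() above
// (hypothetical helper; kMultiGetReadStackBufSize is the real constant used
// there). When blocks are read compressed into temporary storage, small
// batches reuse a caller-provided stack buffer and larger batches fall back
// to one contiguous heap allocation.
inline char* PickMultiGetScratchExample(size_t total_len, char* stack_buf,
                                        std::unique_ptr<char[]>* heap_buf) {
  if (total_len <= kMultiGetReadStackBufSize) {
    return stack_buf;  // fits on the stack, no allocation needed
  }
  heap_buf->reset(new char[total_len]);  // released when heap_buf goes away
  return heap_buf->get();
}
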
3780 Status BlockBasedTable::Prefetch(const Slice* const begin,
3781 const Slice* const end) {
3782 auto& comparator = rep_->internal_comparator;
3783 UserComparatorWrapper user_comparator(comparator.user_comparator());
3784 // pre-condition
3785 if (begin && end && comparator.Compare(*begin, *end) > 0) {
3786 return Status::InvalidArgument(*begin, *end);
3787 }
3788 BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
3789 IndexBlockIter iiter_on_stack;
3790 auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
3791 &iiter_on_stack, /*get_context=*/nullptr,
3792 &lookup_context);
3793 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3794 if (iiter != &iiter_on_stack) {
3795 iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
3796 }
3797
3798 if (!iiter->status().ok()) {
3799 // error opening index iterator
3800 return iiter->status();
3801 }
3802
3803   // indicates if we are on the last page that needs to be pre-fetched
3804 bool prefetching_boundary_page = false;
3805
3806 for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
3807 iiter->Next()) {
3808 BlockHandle block_handle = iiter->value().handle;
3809 const bool is_user_key = !rep_->index_key_includes_seq;
3810 if (end &&
3811 ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
3812 (is_user_key &&
3813 user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
3814 if (prefetching_boundary_page) {
3815 break;
3816 }
3817
3818 // The index entry represents the last key in the data block.
3819 // We should load this page into memory as well, but no more
3820 prefetching_boundary_page = true;
3821 }
3822
3823 // Load the block specified by the block_handle into the block cache
3824 DataBlockIter biter;
3825
3826 NewDataBlockIterator<DataBlockIter>(
3827 ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
3828 /*get_context=*/nullptr, &lookup_context, Status(),
3829 /*prefetch_buffer=*/nullptr);
3830
3831 if (!biter.status().ok()) {
3832 // there was an unexpected error while pre-fetching
3833 return biter.status();
3834 }
3835 }
3836
3837 return Status::OK();
3838 }
3839
3840 Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
3841 TableReaderCaller caller) {
3842 Status s;
3843 // Check Meta blocks
3844 std::unique_ptr<Block> metaindex;
3845 std::unique_ptr<InternalIterator> metaindex_iter;
3846 s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex,
3847 &metaindex_iter);
3848 if (s.ok()) {
3849 s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
3850 if (!s.ok()) {
3851 return s;
3852 }
3853 } else {
3854 return s;
3855 }
3856 // Check Data blocks
3857 IndexBlockIter iiter_on_stack;
3858 BlockCacheLookupContext context{caller};
3859 InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
3860 read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
3861 /*get_context=*/nullptr, &context);
3862 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
3863 if (iiter != &iiter_on_stack) {
3864 iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
3865 }
3866 if (!iiter->status().ok()) {
3867 // error opening index iterator
3868 return iiter->status();
3869 }
3870 s = VerifyChecksumInBlocks(read_options, iiter);
3871 return s;
3872 }
3873
3874 Status BlockBasedTable::VerifyChecksumInBlocks(
3875 const ReadOptions& read_options,
3876 InternalIteratorBase<IndexValue>* index_iter) {
3877 Status s;
3878   // We are scanning the whole file, so there is no need to exponentially
3879   // increase the buffer size.
3880 size_t readahead_size = (read_options.readahead_size != 0)
3881 ? read_options.readahead_size
3882 : kMaxAutoReadaheadSize;
3883 // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
3884 // needed there.
3885 FilePrefetchBuffer prefetch_buffer(
3886       rep_->file.get(), readahead_size /* readahead_size */,
3887 readahead_size /* max_readahead_size */,
3888 !rep_->ioptions.allow_mmap_reads /* enable */);
3889
3890 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
3891 s = index_iter->status();
3892 if (!s.ok()) {
3893 break;
3894 }
3895 BlockHandle handle = index_iter->value().handle;
3896 BlockContents contents;
3897 BlockFetcher block_fetcher(
3898 rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
3899 &contents, rep_->ioptions, false /* decompress */,
3900 false /*maybe_compressed*/, BlockType::kData,
3901 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3902 s = block_fetcher.ReadBlockContents();
3903 if (!s.ok()) {
3904 break;
3905 }
3906 }
3907 return s;
3908 }
3909
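// A conceptual sketch of the per-block check the BlockFetcher performs above,
// using the crc32c helpers from util/crc32c.h. Simplified: the real block
// trailer also records a checksum type and covers the compression byte, so
// this is illustrative rather than the exact on-disk format.
inline bool VerifyBlockCrc32cExample(const char* data, size_t n,
                                     uint32_t stored_masked_crc) {
  // Stored CRCs are masked to avoid problems with CRCs of CRCs.
  return crc32c::Value(data, n) == crc32c::Unmask(stored_masked_crc);
}
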
3910 BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
3911 const Slice& meta_block_name) {
3912 if (meta_block_name.starts_with(kFilterBlockPrefix) ||
3913 meta_block_name.starts_with(kFullFilterBlockPrefix) ||
3914 meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
3915 return BlockType::kFilter;
3916 }
3917
3918 if (meta_block_name == kPropertiesBlock) {
3919 return BlockType::kProperties;
3920 }
3921
3922 if (meta_block_name == kCompressionDictBlock) {
3923 return BlockType::kCompressionDictionary;
3924 }
3925
3926 if (meta_block_name == kRangeDelBlock) {
3927 return BlockType::kRangeDeletion;
3928 }
3929
3930 if (meta_block_name == kHashIndexPrefixesBlock) {
3931 return BlockType::kHashIndexPrefixes;
3932 }
3933
3934 if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
3935 return BlockType::kHashIndexMetadata;
3936 }
3937
3938 assert(false);
3939 return BlockType::kInvalid;
3940 }
3941
3942 Status BlockBasedTable::VerifyChecksumInMetaBlocks(
3943 InternalIteratorBase<Slice>* index_iter) {
3944 Status s;
3945 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
3946 s = index_iter->status();
3947 if (!s.ok()) {
3948 break;
3949 }
3950 BlockHandle handle;
3951 Slice input = index_iter->value();
3952 s = handle.DecodeFrom(&input);
3953 BlockContents contents;
3954 const Slice meta_block_name = index_iter->key();
3955 BlockFetcher block_fetcher(
3956 rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
3957 ReadOptions(), handle, &contents, rep_->ioptions,
3958 false /* decompress */, false /*maybe_compressed*/,
3959 GetBlockTypeForMetaBlockByName(meta_block_name),
3960 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3961 s = block_fetcher.ReadBlockContents();
3962 if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
3963 TableProperties* table_properties;
3964 s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
3965 index_iter->value(),
3966 &table_properties);
3967 delete table_properties;
3968 }
3969 if (!s.ok()) {
3970 break;
3971 }
3972 }
3973 return s;
3974 }
3975
3976 bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
3977 assert(rep_ != nullptr);
3978
3979 Cache* const cache = rep_->table_options.block_cache.get();
3980 if (cache == nullptr) {
3981 return false;
3982 }
3983
3984 char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
3985 Slice cache_key =
3986 GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
3987 cache_key_storage);
3988
3989 Cache::Handle* const cache_handle = cache->Lookup(cache_key);
3990 if (cache_handle == nullptr) {
3991 return false;
3992 }
3993
3994 cache->Release(cache_handle);
3995
3996 return true;
3997 }
3998
3999 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
4000 const Slice& key) {
4001 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
4002 options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
4003 /*get_context=*/nullptr, /*lookup_context=*/nullptr));
4004 iiter->Seek(key);
4005 assert(iiter->Valid());
4006
4007 return TEST_BlockInCache(iiter->value().handle);
4008 }
4009
4010 // REQUIRES: The following fields of rep_ should have already been populated:
4011 // 1. file
4012 // 2. index_handle,
4013 // 3. options
4014 // 4. internal_comparator
4015 // 5. index_type
4016 Status BlockBasedTable::CreateIndexReader(
4017 FilePrefetchBuffer* prefetch_buffer,
4018 InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
4019 bool pin, BlockCacheLookupContext* lookup_context,
4020 std::unique_ptr<IndexReader>* index_reader) {
4021 // kHashSearch requires non-empty prefix_extractor but bypass checking
4022 // prefix_extractor here since we have no access to MutableCFOptions.
4023 // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator.
4024 // If prefix_extractor does not match prefix_extractor_name from table
4025 // properties, turn off Hash Index by setting total_order_seek to true
4026
4027 switch (rep_->index_type) {
4028 case BlockBasedTableOptions::kTwoLevelIndexSearch: {
4029 return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
4030 prefetch, pin, lookup_context,
4031 index_reader);
4032 }
4033 case BlockBasedTableOptions::kBinarySearch:
4034 case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
4035 return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
4036 prefetch, pin, lookup_context,
4037 index_reader);
4038 }
4039 case BlockBasedTableOptions::kHashSearch: {
4040 std::unique_ptr<Block> metaindex_guard;
4041 std::unique_ptr<InternalIterator> metaindex_iter_guard;
4042 auto meta_index_iter = preloaded_meta_index_iter;
4043 if (meta_index_iter == nullptr) {
4044 auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
4045 &metaindex_iter_guard);
4046 if (!s.ok()) {
4047 // we simply fall back to binary search in case there is any
4048 // problem with prefix hash index loading.
4049 ROCKS_LOG_WARN(rep_->ioptions.info_log,
4050 "Unable to read the metaindex block."
4051 " Fall back to binary search index.");
4052 return BinarySearchIndexReader::Create(this, prefetch_buffer,
4053 use_cache, prefetch, pin,
4054 lookup_context, index_reader);
4055 }
4056 meta_index_iter = metaindex_iter_guard.get();
4057 }
4058
4059 return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
4060 use_cache, prefetch, pin, lookup_context,
4061 index_reader);
4062 }
4063 default: {
4064 std::string error_message =
4065 "Unrecognized index type: " + ToString(rep_->index_type);
4066 return Status::InvalidArgument(error_message.c_str());
4067 }
4068 }
4069 }
4070
4071 uint64_t BlockBasedTable::ApproximateOffsetOf(
4072 const InternalIteratorBase<IndexValue>& index_iter) const {
4073 uint64_t result = 0;
4074 if (index_iter.Valid()) {
4075 BlockHandle handle = index_iter.value().handle;
4076 result = handle.offset();
4077 } else {
4078 // The iterator is past the last key in the file. If table_properties is not
4079 // available, approximate the offset by returning the offset of the
4080 // metaindex block (which is right near the end of the file).
4081 if (rep_->table_properties) {
4082 result = rep_->table_properties->data_size;
4083 }
4084 // table_properties is not present in the table.
4085 if (result == 0) {
4086 result = rep_->footer.metaindex_handle().offset();
4087 }
4088 }
4089
4090 return result;
4091 }
4092
4093 uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
4094 TableReaderCaller caller) {
4095 BlockCacheLookupContext context(caller);
4096 IndexBlockIter iiter_on_stack;
4097 ReadOptions ro;
4098 ro.total_order_seek = true;
4099 auto index_iter =
4100 NewIndexIterator(ro, /*disable_prefix_seek=*/true,
4101 /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
4102 /*lookup_context=*/&context);
4103 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
4104 if (index_iter != &iiter_on_stack) {
4105 iiter_unique_ptr.reset(index_iter);
4106 }
4107
4108 index_iter->Seek(key);
4109 return ApproximateOffsetOf(*index_iter);
4110 }
4111
4112 uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
4113 TableReaderCaller caller) {
4114 assert(rep_->internal_comparator.Compare(start, end) <= 0);
4115
4116 BlockCacheLookupContext context(caller);
4117 IndexBlockIter iiter_on_stack;
4118 ReadOptions ro;
4119 ro.total_order_seek = true;
4120 auto index_iter =
4121 NewIndexIterator(ro, /*disable_prefix_seek=*/true,
4122 /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
4123 /*lookup_context=*/&context);
4124 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
4125 if (index_iter != &iiter_on_stack) {
4126 iiter_unique_ptr.reset(index_iter);
4127 }
4128
4129 index_iter->Seek(start);
4130 uint64_t start_offset = ApproximateOffsetOf(*index_iter);
4131 index_iter->Seek(end);
4132 uint64_t end_offset = ApproximateOffsetOf(*index_iter);
4133
4134 assert(end_offset >= start_offset);
4135 return end_offset - start_offset;
4136 }
4137
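// A self-contained sketch of the estimate computed above, with ints standing
// in for keys (hypothetical helper). Each index entry maps a block's last key
// to that block's file offset; offsets grow with key order, so the size of a
// key range is approximated by the difference of the offsets its endpoints
// map to, with lookups past the last block clamped to the data size.
// REQUIRES: start <= end, mirroring the assertion in ApproximateSize() above.
inline uint64_t ApproximateRangeSizeExample(
    const std::vector<std::pair<int, uint64_t>>& index,  // {last_key, offset}
    uint64_t data_size, int start, int end) {
  auto offset_of = [&](int key) {
    auto it = std::lower_bound(
        index.begin(), index.end(), key,
        [](const std::pair<int, uint64_t>& entry, int k) {
          return entry.first < k;
        });
    return it == index.end() ? data_size : it->second;
  };
  return offset_of(end) - offset_of(start);
}
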
4138 bool BlockBasedTable::TEST_FilterBlockInCache() const {
4139 assert(rep_ != nullptr);
4140 return TEST_BlockInCache(rep_->filter_handle);
4141 }
4142
4143 bool BlockBasedTable::TEST_IndexBlockInCache() const {
4144 assert(rep_ != nullptr);
4145
4146 return TEST_BlockInCache(rep_->footer.index_handle());
4147 }
4148
4149 Status BlockBasedTable::GetKVPairsFromDataBlocks(
4150 std::vector<KVPairBlock>* kv_pair_blocks) {
4151 std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
4152 NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
4153 /*input_iter=*/nullptr, /*get_context=*/nullptr,
4154 /*lookup_contex=*/nullptr));
4155
4156 Status s = blockhandles_iter->status();
4157 if (!s.ok()) {
4158 // Cannot read Index Block
4159 return s;
4160 }
4161
4162 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
4163 blockhandles_iter->Next()) {
4164 s = blockhandles_iter->status();
4165
4166 if (!s.ok()) {
4167 break;
4168 }
4169
4170 std::unique_ptr<InternalIterator> datablock_iter;
4171 datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
4172 ReadOptions(), blockhandles_iter->value().handle,
4173 /*input_iter=*/nullptr, /*type=*/BlockType::kData,
4174 /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
4175 /*prefetch_buffer=*/nullptr));
4176 s = datablock_iter->status();
4177
4178 if (!s.ok()) {
4179 // Error reading the block - Skipped
4180 continue;
4181 }
4182
4183 KVPairBlock kv_pair_block;
4184 for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
4185 datablock_iter->Next()) {
4186 s = datablock_iter->status();
4187 if (!s.ok()) {
4188 // Error reading the block - Skipped
4189 break;
4190 }
4191 const Slice& key = datablock_iter->key();
4192 const Slice& value = datablock_iter->value();
4193 std::string key_copy = std::string(key.data(), key.size());
4194 std::string value_copy = std::string(value.data(), value.size());
4195
4196 kv_pair_block.push_back(
4197 std::make_pair(std::move(key_copy), std::move(value_copy)));
4198 }
4199 kv_pair_blocks->push_back(std::move(kv_pair_block));
4200 }
4201 return Status::OK();
4202 }
4203
4204 Status BlockBasedTable::DumpTable(WritableFile* out_file) {
4205 // Output Footer
4206 out_file->Append(
4207 "Footer Details:\n"
4208 "--------------------------------------\n"
4209 " ");
4210 out_file->Append(rep_->footer.ToString().c_str());
4211 out_file->Append("\n");
4212
4213 // Output MetaIndex
4214 out_file->Append(
4215 "Metaindex Details:\n"
4216 "--------------------------------------\n");
4217 std::unique_ptr<Block> metaindex;
4218 std::unique_ptr<InternalIterator> metaindex_iter;
4219 Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex,
4220 &metaindex_iter);
4221 if (s.ok()) {
4222 for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
4223 metaindex_iter->Next()) {
4224 s = metaindex_iter->status();
4225 if (!s.ok()) {
4226 return s;
4227 }
4228 if (metaindex_iter->key() == rocksdb::kPropertiesBlock) {
4229 out_file->Append(" Properties block handle: ");
4230 out_file->Append(metaindex_iter->value().ToString(true).c_str());
4231 out_file->Append("\n");
4232 } else if (metaindex_iter->key() == rocksdb::kCompressionDictBlock) {
4233 out_file->Append(" Compression dictionary block handle: ");
4234 out_file->Append(metaindex_iter->value().ToString(true).c_str());
4235 out_file->Append("\n");
4236 } else if (strstr(metaindex_iter->key().ToString().c_str(),
4237 "filter.rocksdb.") != nullptr) {
4238 out_file->Append(" Filter block handle: ");
4239 out_file->Append(metaindex_iter->value().ToString(true).c_str());
4240 out_file->Append("\n");
4241 } else if (metaindex_iter->key() == rocksdb::kRangeDelBlock) {
4242 out_file->Append(" Range deletion block handle: ");
4243 out_file->Append(metaindex_iter->value().ToString(true).c_str());
4244 out_file->Append("\n");
4245 }
4246 }
4247 out_file->Append("\n");
4248 } else {
4249 return s;
4250 }
4251
4252 // Output TableProperties
4253 const rocksdb::TableProperties* table_properties;
4254 table_properties = rep_->table_properties.get();
4255
4256 if (table_properties != nullptr) {
4257 out_file->Append(
4258 "Table Properties:\n"
4259 "--------------------------------------\n"
4260 " ");
4261 out_file->Append(table_properties->ToString("\n ", ": ").c_str());
4262 out_file->Append("\n");
4263 }
4264
4265 if (rep_->filter) {
4266 out_file->Append(
4267 "Filter Details:\n"
4268 "--------------------------------------\n"
4269 " ");
4270 out_file->Append(rep_->filter->ToString().c_str());
4271 out_file->Append("\n");
4272 }
4273
4274 // Output Index block
4275 s = DumpIndexBlock(out_file);
4276 if (!s.ok()) {
4277 return s;
4278 }
4279
4280 // Output compression dictionary
4281 if (rep_->uncompression_dict_reader) {
4282 CachableEntry<UncompressionDict> uncompression_dict;
4283 s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
4284 nullptr /* prefetch_buffer */, false /* no_io */,
4285 nullptr /* get_context */, nullptr /* lookup_context */,
4286 &uncompression_dict);
4287 if (!s.ok()) {
4288 return s;
4289 }
4290
4291 assert(uncompression_dict.GetValue());
4292
4293 const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
4294 out_file->Append(
4295 "Compression Dictionary:\n"
4296 "--------------------------------------\n");
4297 out_file->Append(" size (bytes): ");
4298 out_file->Append(rocksdb::ToString(raw_dict.size()));
4299 out_file->Append("\n\n");
4300 out_file->Append(" HEX ");
4301 out_file->Append(raw_dict.ToString(true).c_str());
4302 out_file->Append("\n\n");
4303 }
4304
4305 // Output range deletions block
4306 auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
4307 if (range_del_iter != nullptr) {
4308 range_del_iter->SeekToFirst();
4309 if (range_del_iter->Valid()) {
4310 out_file->Append(
4311 "Range deletions:\n"
4312 "--------------------------------------\n"
4313 " ");
4314 for (; range_del_iter->Valid(); range_del_iter->Next()) {
4315 DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
4316 }
4317 out_file->Append("\n");
4318 }
4319 delete range_del_iter;
4320 }
4321 // Output Data blocks
4322 s = DumpDataBlocks(out_file);
4323
4324 return s;
4325 }
4326
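// Helper for DumpTable: prints each index entry as the user key (hex, then
// ASCII on the following line) mapped to the data block handle it points
// to.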
Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
  out_file->Append(
      "Index Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Cannot read Index Block\n\n");
    return s;
  }

  out_file->Append("  Block key hex dump: Data block handle\n");
  out_file->Append("  Block key ascii\n\n");
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }
    Slice key = blockhandles_iter->key();
    Slice user_key;
    InternalKey ikey;
    if (!rep_->index_key_includes_seq) {
      user_key = key;
    } else {
      ikey.DecodeFrom(key);
      user_key = ikey.user_key();
    }

    out_file->Append("  HEX    ");
    out_file->Append(user_key.ToString(true).c_str());
    out_file->Append(": ");
    out_file->Append(blockhandles_iter->value()
                         .ToString(true, rep_->index_has_first_key)
                         .c_str());
    out_file->Append("\n");

    std::string str_key = user_key.ToString();
    std::string res_key;
    char cspace = ' ';
    for (size_t i = 0; i < str_key.size(); i++) {
      res_key.append(&str_key[i], 1);
      res_key.append(1, cspace);
    }
    out_file->Append("  ASCII  ");
    out_file->Append(res_key.c_str());
    out_file->Append("\n  ------\n");
  }
  out_file->Append("\n");
  return Status::OK();
}

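// Helper for DumpTable: prints every data block in index order followed by
// a summary (count and min/max/avg block size, computed from the index
// handles). Per-block read errors are noted inline and skipped so a single
// bad block does not abort the dump.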
Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Cannot read Index Block\n\n");
    return s;
  }

  uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
  uint64_t datablock_size_max = 0;
  uint64_t datablock_size_sum = 0;

  size_t block_id = 1;
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       block_id++, blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }

    BlockHandle bh = blockhandles_iter->value().handle;
    uint64_t datablock_size = bh.size();
    datablock_size_min = std::min(datablock_size_min, datablock_size);
    datablock_size_max = std::max(datablock_size_max, datablock_size);
    datablock_size_sum += datablock_size;

    out_file->Append("Data Block # ");
    out_file->Append(rocksdb::ToString(block_id));
    out_file->Append(" @ ");
    out_file->Append(bh.ToString(true).c_str());
    out_file->Append("\n");
    out_file->Append("--------------------------------------\n");

    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value().handle,
        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = datablock_iter->status();

    if (!s.ok()) {
      out_file->Append("Error reading the block - Skipped\n\n");
      continue;
    }

    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        out_file->Append("Error reading the block - Skipped\n");
        break;
      }
      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
    }
    out_file->Append("\n");
  }

  uint64_t num_datablocks = block_id - 1;
  if (num_datablocks) {
    double datablock_size_avg =
        static_cast<double>(datablock_size_sum) / num_datablocks;
    out_file->Append("Data Block Summary:\n");
    out_file->Append("--------------------------------------");
    out_file->Append("\n  # data blocks: ");
    out_file->Append(rocksdb::ToString(num_datablocks));
    out_file->Append("\n  min data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_min));
    out_file->Append("\n  max data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_max));
    out_file->Append("\n  avg data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_avg));
    out_file->Append("\n");
  }

  return Status::OK();
}

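// Helper that formats one key/value pair for the dump output: the user key
// and value in hex on one line, then an ASCII rendering (embedded NUL
// bytes printed as "\0", characters separated by spaces) on the next.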
void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
                                   WritableFile* out_file) {
  InternalKey ikey;
  ikey.DecodeFrom(key);

  out_file->Append("  HEX    ");
  out_file->Append(ikey.user_key().ToString(true).c_str());
  out_file->Append(": ");
  out_file->Append(value.ToString(true).c_str());
  out_file->Append("\n");

  std::string str_key = ikey.user_key().ToString();
  std::string str_value = value.ToString();
  std::string res_key, res_value;
  char cspace = ' ';
  for (size_t i = 0; i < str_key.size(); i++) {
    if (str_key[i] == '\0') {
      res_key.append("\\0", 2);
    } else {
      res_key.append(&str_key[i], 1);
    }
    res_key.append(1, cspace);
  }
  for (size_t i = 0; i < str_value.size(); i++) {
    if (str_value[i] == '\0') {
      res_value.append("\\0", 2);
    } else {
      res_value.append(&str_value[i], 1);
    }
    res_value.append(1, cspace);
  }

  out_file->Append("  ASCII  ");
  out_file->Append(res_key.c_str());
  out_file->Append(": ");
  out_file->Append(res_value.c_str());
  out_file->Append("\n  ------\n");
}

}  // namespace rocksdb