1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 // 6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 7 // Use of this source code is governed by a BSD-style license that can be 8 // found in the LICENSE file. See the AUTHORS file for names of contributors. 9 10 #pragma once 11 12 #include <stdint.h> 13 #include <memory> 14 #include <set> 15 #include <string> 16 #include <utility> 17 #include <vector> 18 19 #include "db/range_tombstone_fragmenter.h" 20 #include "file/filename.h" 21 #include "file/random_access_file_reader.h" 22 #include "options/cf_options.h" 23 #include "rocksdb/options.h" 24 #include "rocksdb/persistent_cache.h" 25 #include "rocksdb/statistics.h" 26 #include "rocksdb/status.h" 27 #include "rocksdb/table.h" 28 #include "table/block_based/block.h" 29 #include "table/block_based/block_based_table_factory.h" 30 #include "table/block_based/block_type.h" 31 #include "table/block_based/cachable_entry.h" 32 #include "table/block_based/filter_block.h" 33 #include "table/block_based/uncompression_dict_reader.h" 34 #include "table/format.h" 35 #include "table/get_context.h" 36 #include "table/multiget_context.h" 37 #include "table/persistent_cache_helper.h" 38 #include "table/table_properties_internal.h" 39 #include "table/table_reader.h" 40 #include "table/two_level_iterator.h" 41 #include "trace_replay/block_cache_tracer.h" 42 #include "util/coding.h" 43 #include "util/user_comparator_wrapper.h" 44 45 namespace ROCKSDB_NAMESPACE { 46 47 class Cache; 48 class FilterBlockReader; 49 class BlockBasedFilterBlockReader; 50 class FullFilterBlockReader; 51 class Footer; 52 class InternalKeyComparator; 53 class Iterator; 54 class FSRandomAccessFile; 55 class TableCache; 56 class TableReader; 57 class WritableFile; 58 struct 
BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;

typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;

// Reader class for BlockBasedTable format.
// For the format of BlockBasedTable refer to
// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
// This is the default table type. Data is chunked into fixed size blocks and
// each block in-turn stores entries. When storing data, we can compress and/or
// encode data efficiently within a block, which often results in a much smaller
// data size compared with the raw data size. As for the record retrieval, we'll
// first locate the block where target record may reside, then read the block to
// memory, and finally search that record within the block. Of course, to avoid
// frequent reads of the same block, we introduced the block cache to keep the
// loaded blocks in the memory.
class BlockBasedTable : public TableReader {
 public:
  static const std::string kFilterBlockPrefix;
  static const std::string kFullFilterBlockPrefix;
  static const std::string kPartitionedFilterBlockPrefix;
  // The longest prefix of the cache key used to identify blocks.
  // For Posix files the unique ID is three varints.
  static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;

  // All the below fields control iterator readahead
  static const size_t kInitAutoReadaheadSize = 8 * 1024;
  // Found that 256 KB readahead size provides the best performance, based on
  // experiments, for auto readahead. Experiment data is in PR #3282.
  static const size_t kMaxAutoReadaheadSize;
  static const int kMinNumFileReadsToStartAutoReadahead = 2;

  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
  // retrieving data from the table.
  //
  // If successful, returns ok and sets "*table_reader" to the newly opened
  // table. The client should delete "*table_reader" when no longer needed.
  // If there was an error while initializing the table, sets "*table_reader"
  // to nullptr and returns a non-ok status.
  //
  // @param file must remain live while this Table is in use.
  // @param prefetch_index_and_filter_in_cache can be used to disable
  //    prefetching of index and filter blocks into block cache at startup.
  // @param skip_filters Disables loading/accessing the filter block. Overrides
  //    prefetch_index_and_filter_in_cache, so filter will be skipped if both
  //    are set.
  static Status Open(const ImmutableCFOptions& ioptions,
                     const EnvOptions& env_options,
                     const BlockBasedTableOptions& table_options,
                     const InternalKeyComparator& internal_key_comparator,
                     std::unique_ptr<RandomAccessFileReader>&& file,
                     uint64_t file_size,
                     std::unique_ptr<TableReader>* table_reader,
                     const SliceTransform* prefix_extractor = nullptr,
                     bool prefetch_index_and_filter_in_cache = true,
                     bool skip_filters = false, int level = -1,
                     const bool immortal_table = false,
                     const SequenceNumber largest_seqno = 0,
                     TailPrefetchStats* tail_prefetch_stats = nullptr,
                     BlockCacheTracer* const block_cache_tracer = nullptr);

  // Filter-based check on whether internal_key's prefix may be present in
  // this table. A true return only means "may match"; false is definitive.
  bool PrefixMayMatch(const Slice& internal_key,
                      const ReadOptions& read_options,
                      const SliceTransform* options_prefix_extractor,
                      const bool need_upper_bound_check,
                      BlockCacheLookupContext* lookup_context) const;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  // @param skip_filters Disables loading/accessing the filter block
  // compaction_readahead_size: its value will only be used if caller =
  // kCompaction.
  InternalIterator* NewIterator(const ReadOptions&,
                                const SliceTransform* prefix_extractor,
                                Arena* arena, bool skip_filters,
                                TableReaderCaller caller,
                                size_t compaction_readahead_size = 0) override;

  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options) override;

  // @param skip_filters Disables loading/accessing the filter block
  Status Get(const ReadOptions& readOptions, const Slice& key,
             GetContext* get_context, const SliceTransform* prefix_extractor,
             bool skip_filters = false) override;

  void MultiGet(const ReadOptions& readOptions,
                const MultiGetContext::Range* mget_range,
                const SliceTransform* prefix_extractor,
                bool skip_filters = false) override;

  // Pre-fetch the disk blocks that correspond to the key range specified by
  // (kbegin, kend). The call will return error status in the event of
  // IO or iteration error.
  Status Prefetch(const Slice* begin, const Slice* end) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file). The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key,
                               TableReaderCaller caller) override;

  // Given start and end keys, return the approximate data size in the file
  // between the keys. The returned value is in terms of file bytes, and so
  // includes effects like compression of the underlying data.
  // The start key must not be greater than the end key.
  uint64_t ApproximateSize(const Slice& start, const Slice& end,
                           TableReaderCaller caller) override;

  bool TEST_BlockInCache(const BlockHandle& handle) const;

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  // Set up the table for Compaction. Might change some parameters with
  // posix_fadvise
  void SetupForCompaction() override;

  std::shared_ptr<const TableProperties> GetTableProperties() const override;

  size_t ApproximateMemoryUsage() const override;

  // convert SST file to a human readable form
  Status DumpTable(WritableFile* out_file) override;

  Status VerifyChecksum(const ReadOptions& readOptions,
                        TableReaderCaller caller) override;

  ~BlockBasedTable();

  bool TEST_FilterBlockInCache() const;
  bool TEST_IndexBlockInCache() const;

  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
    virtual ~IndexReader() = default;

    // Create an iterator for index access. If iter is null, then a new object
    // is created on the heap, and the callee will have the ownership.
    // If a non-null iter is passed in, it will be used, and the returned value
    // is either the same as iter or a new on-heap object that
    // wraps the passed iter. In the latter case the return value points
    // to a different object than iter, and the callee has the ownership of the
    // returned object.
    virtual InternalIteratorBase<IndexValue>* NewIterator(
        const ReadOptions& read_options, bool disable_prefix_seek,
        IndexBlockIter* iter, GetContext* get_context,
        BlockCacheLookupContext* lookup_context) = 0;

    // Report an approximation of how much memory has been used other than
    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;
    // Cache the dependencies of the index reader (e.g. the partitions
    // of a partitioned index). No-op by default.
    virtual void CacheDependencies(bool /* pin */) {}
  };

  class IndexReaderCommon;

  static Slice GetCacheKey(const char* cache_key_prefix,
                           size_t cache_key_prefix_size,
                           const BlockHandle& handle, char* cache_key);

  // Retrieve all key value pairs from data blocks in the table.
  // The key retrieved are internal keys.
  Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);

  struct Rep;

  Rep* get_rep() { return rep_; }
  const Rep* get_rep() const { return rep_; }

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(
      const ReadOptions& ro, const BlockHandle& block_handle,
      TBlockIter* input_iter, BlockType block_type, GetContext* get_context,
      BlockCacheLookupContext* lookup_context, Status s,
      FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const;

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
                                   CachableEntry<Block>& block,
                                   TBlockIter* input_iter, Status s) const;

  class PartitionedIndexIteratorState;

  template <typename TBlocklike>
  friend class FilterBlockReaderCommon;

  friend class PartitionIndexReader;

  friend class UncompressionDictReader;

 protected:
  Rep* rep_;
  explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer)
      : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
  // No copying allowed
  explicit BlockBasedTable(const TableReader&) = delete;
  void operator=(const TableReader&) = delete;

 private:
  friend class MockedBlockBasedTable;
  static std::atomic<uint64_t> next_cache_key_id_;
  BlockCacheTracer* const block_cache_tracer_;

  // Tick cache hit/miss/insertion statistics and per-Get counters.
  void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
                             size_t usage) const;
  void UpdateCacheMissMetrics(BlockType block_type,
                              GetContext* get_context) const;
  void UpdateCacheInsertionMetrics(BlockType block_type,
                                   GetContext* get_context, size_t usage) const;
  Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
                                   BlockType block_type,
                                   GetContext* get_context) const;

  // Either Block::NewDataIterator() or Block::NewIndexIterator().
  template <typename TBlockIter>
  static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
                                       TBlockIter* input_iter,
                                       bool block_contents_pinned);

  // If block cache enabled (compressed or uncompressed), looks for the block
  // identified by handle in (1) uncompressed cache, (2) compressed cache, and
  // then (3) file. If found, inserts into the cache(s) that were searched
  // unsuccessfully (e.g., if found in file, will add to both uncompressed and
  // compressed caches if they're enabled).
  //
  // @param block_entry value is set to the uncompressed block if found. If
  //    in uncompressed block cache, also sets cache_handle to reference that
  //    block.
  template <typename TBlocklike>
  Status MaybeReadBlockAndLoadToCache(
      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
      CachableEntry<TBlocklike>* block_entry, BlockType block_type,
      GetContext* get_context, BlockCacheLookupContext* lookup_context,
      BlockContents* contents) const;

  // Similar to the above, with one crucial difference: it will retrieve the
  // block from the file even if there are no caches configured (assuming the
  // read options allow I/O).
  template <typename TBlocklike>
  Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
                       const ReadOptions& ro, const BlockHandle& handle,
                       const UncompressionDict& uncompression_dict,
                       CachableEntry<TBlocklike>* block_entry,
                       BlockType block_type, GetContext* get_context,
                       BlockCacheLookupContext* lookup_context,
                       bool for_compaction, bool use_cache) const;

  // Batched block retrieval used by MultiGet; fills per-handle statuses and
  // results in parallel arrays.
  void RetrieveMultipleBlocks(
      const ReadOptions& options, const MultiGetRange* batch,
      const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
      autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
      autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
          results,
      char* scratch, const UncompressionDict& uncompression_dict) const;

  // Get the iterator from the index reader.
  //
  // If input_iter is not set, return a new Iterator.
  // If input_iter is set, try to update it and return it as Iterator.
  // However note that in some cases the returned iterator may be different
  // from input_iter. In such case the returned iterator should be freed.
  //
  // Note: ErrorIterator with Status::Incomplete shall be returned if all the
  // following conditions are met:
  //  1. We enabled table_options.cache_index_and_filter_blocks.
  //  2. index is not present in block cache.
  //  3. We disallowed any io to be performed, that is, read_options ==
  //     kBlockCacheTier
  InternalIteratorBase<IndexValue>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check,
      IndexBlockIter* input_iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) const;

  // Read block cache from block caches (if set): block_cache and
  // block_cache_compressed.
  // On success, Status::OK will be returned and @block will be populated with
  // pointer to the block as well as its block handle.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status GetDataBlockFromCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed,
      const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
      const UncompressionDict& uncompression_dict, BlockType block_type,
      GetContext* get_context) const;

  // Put a raw block (maybe compressed) to the corresponding block caches.
  // This method will perform decompression against raw_block if needed and
  // then populate the block caches.
  // On success, Status::OK will be returned; also @block will be populated
  // with uncompressed block and its cache handle.
  //
  // Allocated memory managed by raw_block_contents will be transferred to
  // PutDataBlockToCache(). After the call, the object will be invalid.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status PutDataBlockToCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed,
      CachableEntry<TBlocklike>* cached_block,
      BlockContents* raw_block_contents, CompressionType raw_block_comp_type,
      const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
      MemoryAllocator* memory_allocator, BlockType block_type,
      GetContext* get_context) const;

  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
  // after a call to Seek(key), until handle_result returns false.
  // May not make such a call if filter policy says that key is not present.
  friend class TableCache;
  friend class BlockBasedTableBuilder;

  // Create an index reader based on the index type stored in the table.
  // Optionally, user can pass a preloaded meta_index_iter for the index that
  // need to access extra meta blocks for index construction. This parameter
  // helps avoid re-reading meta index block if caller already created one.
  Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* preloaded_meta_index_iter,
                           bool use_cache, bool prefetch, bool pin,
                           BlockCacheLookupContext* lookup_context,
                           std::unique_ptr<IndexReader>* index_reader);

  // Full-filter (whole key) check for a single key; false means the key is
  // definitely not present.
  bool FullFilterKeyMayMatch(const ReadOptions& read_options,
                             FilterBlockReader* filter, const Slice& user_key,
                             const bool no_io,
                             const SliceTransform* prefix_extractor,
                             GetContext* get_context,
                             BlockCacheLookupContext* lookup_context) const;

  // Batched variant of the above for MultiGet; prunes keys from `range` that
  // the filter rules out.
  void FullFilterKeysMayMatch(const ReadOptions& read_options,
                              FilterBlockReader* filter, MultiGetRange* range,
                              const bool no_io,
                              const SliceTransform* prefix_extractor,
                              BlockCacheLookupContext* lookup_context) const;

  static Status PrefetchTail(
      RandomAccessFileReader* file, uint64_t file_size,
      TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
      const bool preload_all,
      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
  Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer,
                            std::unique_ptr<Block>* metaindex_block,
                            std::unique_ptr<InternalIterator>* iter);
  Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer,
                                          const Slice& handle_value,
                                          TableProperties** table_properties);
  Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer,
                             InternalIterator* meta_iter,
                             const SequenceNumber largest_seqno);
  Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* meta_iter,
                           const InternalKeyComparator& internal_comparator,
                           BlockCacheLookupContext* lookup_context);
  Status PrefetchIndexAndFilterBlocks(
      FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
      BlockBasedTable* new_table, bool prefetch_all,
      const BlockBasedTableOptions& table_options, const int level,
      BlockCacheLookupContext* lookup_context);

  static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);

  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(const ReadOptions& read_options,
                                InternalIteratorBase<IndexValue>* index_iter);

  // Create the filter from the filter block.
  std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
      FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
      bool pin, BlockCacheLookupContext* lookup_context);

  static void SetupCacheKeyPrefix(Rep* rep);

  // Generate a cache key prefix from the file
  static void GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
                                  char* buffer, size_t* size);
  static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer,
                                  size_t* size);

  // Given an iterator return its offset in file.
  uint64_t ApproximateOffsetOf(
      const InternalIteratorBase<IndexValue>& index_iter) const;

  // Helper functions for DumpTable()
  Status DumpIndexBlock(WritableFile* out_file);
  Status DumpDataBlocks(WritableFile* out_file);
  void DumpKeyValue(const Slice& key, const Slice& value,
                    WritableFile* out_file);

  // A cumulative data block file read in MultiGet lower than this size will
  // use a stack buffer
  static constexpr size_t kMultiGetReadStackBufSize = 8192;

  friend class PartitionedFilterBlockReader;
  friend class PartitionedFilterBlockTest;
  friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
};

// Maintaining state of a two-level iteration on a partitioned index structure.
class BlockBasedTable::PartitionedIndexIteratorState
    : public TwoLevelIteratorState {
 public:
  PartitionedIndexIteratorState(
      const BlockBasedTable* table,
      std::unordered_map<uint64_t, CachableEntry<Block>>* block_map);
  InternalIteratorBase<IndexValue>* NewSecondaryIterator(
      const BlockHandle& index_value) override;

 private:
  // Don't own table_
  const BlockBasedTable* table_;
  // Maps block offsets to already-loaded index partition blocks; not owned.
  std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
};

// Stores all the properties associated with a BlockBasedTable.
// These are immutable.
struct BlockBasedTable::Rep {
  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters,
      int _level, const bool _immortal_table)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        // skip_filters disables all filter usage by dropping the policy.
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
        hash_index_allow_collision(false),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        global_seqno(kDisableGlobalSequenceNumber),
        level(_level),
        immortal_table(_immortal_table) {}

  const ImmutableCFOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  // Cache key prefixes uniquely identify this file within each cache.
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;

  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  BlockHandle compression_dict_handle;

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
  bool hash_index_allow_collision;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  // null if no prefix_extractor is passed in when opening the table reader.
  std::unique_ptr<SliceTransform> internal_prefix_transform;
  std::shared_ptr<const SliceTransform> table_prefix_extractor;

  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all Keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is
  // disabled and every key has its own seqno.
  SequenceNumber global_seqno;

  // the level when the table is opened, could potentially change when trivial
  // move is involved
  int level;

  // If false, blocks in this file are definitely all uncompressed. Knowing
  // this before reading individual blocks enables certain optimizations.
  bool blocks_maybe_compressed = true;

  // If true, data blocks in this file are definitely ZSTD compressed. If false
  // they might not be. When false we skip creating a ZSTD digested
  // uncompression dictionary. Even if we get a false negative, things should
  // still work, just not as quickly.
  bool blocks_definitely_zstd_compressed = false;

  // These describe how index is encoded.
  bool index_has_first_key = false;
  bool index_key_includes_seq = true;
  bool index_value_is_full = true;

  const bool immortal_table;

  // Filter and compression-dictionary blocks never carry the global seqno.
  SequenceNumber get_global_seqno(BlockType block_type) const {
    return (block_type == BlockType::kFilter ||
            block_type == BlockType::kCompressionDictionary)
               ? kDisableGlobalSequenceNumber
               : global_seqno;
  }

  // Tracing helpers: fall back to "unknown" sentinels when table properties
  // or the file handle are not available.
  uint64_t cf_id_for_tracing() const {
    return table_properties
               ? table_properties->column_family_id
               : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
                     kUnknownColumnFamily;
  }

  Slice cf_name_for_tracing() const {
    return table_properties ? table_properties->column_family_name
                            : BlockCacheTraceHelper::kUnknownColumnFamilyName;
  }

  uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; }

  uint64_t sst_number_for_tracing() const {
    return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
  }

  void CreateFilePrefetchBuffer(
      size_t readahead_size, size_t max_readahead_size,
      std::unique_ptr<FilePrefetchBuffer>* fpb) const {
    // Prefetching is disabled when the file is mmap-ed, since reads then go
    // through the mapping rather than explicit I/O.
    fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size,
                                      max_readahead_size,
                                      !ioptions.allow_mmap_reads /* enable */));
  }
};

// Iterates over the contents of BlockBasedTable.
template <class TBlockIter, typename TValue = Slice>
class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
  // compaction_readahead_size: its value will only be used if for_compaction =
  // true
 public:
  BlockBasedTableIterator(const BlockBasedTable* table,
                          const ReadOptions& read_options,
                          const InternalKeyComparator& icomp,
                          InternalIteratorBase<IndexValue>* index_iter,
                          bool check_filter, bool need_upper_bound_check,
                          const SliceTransform* prefix_extractor,
                          BlockType block_type, TableReaderCaller caller,
                          size_t compaction_readahead_size = 0)
      : table_(table),
        read_options_(read_options),
        icomp_(icomp),
        user_comparator_(icomp.user_comparator()),
        index_iter_(index_iter),
        pinned_iters_mgr_(nullptr),
        block_iter_points_to_real_block_(false),
        check_filter_(check_filter),
        need_upper_bound_check_(need_upper_bound_check),
        prefix_extractor_(prefix_extractor),
        block_type_(block_type),
        lookup_context_(caller),
        compaction_readahead_size_(compaction_readahead_size) {}

  // Owns index_iter_ (raw pointer passed into the constructor).
  ~BlockBasedTableIterator() { delete index_iter_; }

  void Seek(const Slice& target) override;
  void SeekForPrev(const Slice& target) override;
  void SeekToFirst() override;
  void SeekToLast() override;
  void Next() final override;
  bool NextAndGetResult(IterateResult* result) override;
  void Prev() override;
  bool Valid() const override {
    return !is_out_of_bound_ &&
           (is_at_first_key_from_index_ ||
            (block_iter_points_to_real_block_ && block_iter_.Valid()));
  }
  Slice key() const override {
    assert(Valid());
    if (is_at_first_key_from_index_) {
      // Block not loaded yet; serve the key stored in the index entry.
      return index_iter_->value().first_internal_key;
    } else {
      return block_iter_.key();
    }
  }
  Slice user_key() const override {
    assert(Valid());
    if (is_at_first_key_from_index_) {
      return ExtractUserKey(index_iter_->value().first_internal_key);
    } else {
      return block_iter_.user_key();
    }
  }
  TValue value() const override {
    assert(Valid());

    // Load current block if not loaded.
    if (is_at_first_key_from_index_ &&
        !const_cast<BlockBasedTableIterator*>(this)
             ->MaterializeCurrentBlock()) {
      // Oops, index is not consistent with block contents, but we have
      // no good way to report error at this point. Let's return empty value.
      return TValue();
    }

    return block_iter_.value();
  }
  Status status() const override {
    // Prefix index set status to NotFound when the prefix does not exist
    if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
      return index_iter_->status();
    } else if (block_iter_points_to_real_block_) {
      return block_iter_.status();
    } else {
      return Status::OK();
    }
  }

  // Whether iterator invalidated for being out of bound.
  bool IsOutOfBound() override { return is_out_of_bound_; }

  inline bool MayBeOutOfUpperBound() override {
    assert(Valid());
    return !data_block_within_upper_bound_;
  }

  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  bool IsKeyPinned() const override {
    // Our key comes either from block_iter_'s current key
    // or index_iter_'s current *value*.
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) ||
            (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned()));
  }
  bool IsValuePinned() const override {
    // Load current block if not loaded.
    if (is_at_first_key_from_index_) {
      const_cast<BlockBasedTableIterator*>(this)->MaterializeCurrentBlock();
    }
    // BlockIter::IsValuePinned() is always true. No need to check
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           block_iter_points_to_real_block_;
  }

  void ResetDataIter() {
    if (block_iter_points_to_real_block_) {
      if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
        block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
      }
      block_iter_.Invalidate(Status::OK());
      block_iter_points_to_real_block_ = false;
    }
  }

  void SavePrevIndexValue() {
    if (block_iter_points_to_real_block_) {
      // Reseek. If they end up with the same data block, we shouldn't re-fetch
      // the same data block.
      prev_block_offset_ = index_iter_->value().handle.offset();
    }
  }

 private:
  enum class IterDirection {
    kForward,
    kBackward,
  };

  const BlockBasedTable* table_;
  const ReadOptions read_options_;
  const InternalKeyComparator& icomp_;
  UserComparatorWrapper user_comparator_;
  InternalIteratorBase<IndexValue>* index_iter_;
  PinnedIteratorsManager* pinned_iters_mgr_;
  TBlockIter block_iter_;

  // True if block_iter_ is initialized and points to the same block
  // as index iterator.
  bool block_iter_points_to_real_block_;
  // See InternalIteratorBase::IsOutOfBound().
  bool is_out_of_bound_ = false;
  // Whether the current data block is fully within the iterate upper bound.
  bool data_block_within_upper_bound_ = false;
  // True if we're standing at the first key of a block, and we haven't loaded
  // that block yet. A call to value() will trigger loading the block.
  bool is_at_first_key_from_index_ = false;
  bool check_filter_;
  // TODO(Zhongyi): pick a better name
  bool need_upper_bound_check_;
  const SliceTransform* prefix_extractor_;
  BlockType block_type_;
  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
  BlockCacheLookupContext lookup_context_;
  // Readahead size used in compaction, its value is used only if
  // lookup_context_.caller = kCompaction.
  size_t compaction_readahead_size_;

  // Auto-readahead state; see BlockBasedTable::kInitAutoReadaheadSize.
  size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
  size_t readahead_limit_ = 0;
  int64_t num_file_reads_ = 0;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;

  // If `target` is null, seek to first.
  void SeekImpl(const Slice* target);

  void InitDataBlock();
  bool MaterializeCurrentBlock();
  void FindKeyForward();
  void FindBlockForward();
  void FindKeyBackward();
  void CheckOutOfBound();

  // Check if data block is fully within iterate_upper_bound.
  //
  // Note MyRocks may update iterate bounds between seek. To workaround it,
  // we need to check and update data_block_within_upper_bound_ accordingly.
  void CheckDataBlockWithinUpperBound();

  bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
    if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
      // Upper bound check isn't sufficient for backward direction to
      // guarantee the same result as total order, so disable prefix
      // check.
      return true;
    }
    if (check_filter_ &&
        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
                                need_upper_bound_check_, &lookup_context_)) {
      // TODO remember the iterator is invalidated because of prefix
      // match. This can avoid the upper level file iterator to falsely
      // believe the position is the end of the SST file and move to
      // the first key of the next file.
      ResetDataIter();
      return false;
    }
    return true;
  }
};

}  // namespace ROCKSDB_NAMESPACE