//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include "db/range_tombstone_fragmenter.h"
#include "file/filename.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_type.h"
#include "table/block_based/cachable_entry.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/uncompression_dict_reader.h"
#include "table/table_properties_internal.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"

#include "trace_replay/block_cache_tracer.h"

namespace ROCKSDB_NAMESPACE {

class Cache;
class FilterBlockReader;
class BlockBasedFilterBlockReader;
class FullFilterBlockReader;
class Footer;
class InternalKeyComparator;
class Iterator;
class FSRandomAccessFile;
class TableCache;
class TableReader;
class WritableFile;
struct BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;

typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;

// Reader class for the BlockBasedTable format.
// For the format of BlockBasedTable refer to
// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
// This is the default table type. Data is chunked into fixed-size blocks and
// each block in turn stores entries. When storing data, we can compress and/or
// encode data efficiently within a block, which often results in a much
// smaller data size compared with the raw data size. For record retrieval, we
// first locate the block where the target record may reside, then read the
// block into memory, and finally search for the record within the block. To
// avoid frequent reads of the same block, a block cache keeps loaded blocks in
// memory.
class BlockBasedTable : public TableReader {
 public:
  static const std::string kFilterBlockPrefix;
  static const std::string kFullFilterBlockPrefix;
  static const std::string kPartitionedFilterBlockPrefix;
  // The longest prefix of the cache key used to identify blocks.
  // For Posix files the unique ID is three varints.
  static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;

  // All the fields below control iterator readahead.
  static const size_t kInitAutoReadaheadSize = 8 * 1024;
  static const int kMinNumFileReadsToStartAutoReadahead = 2;
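  // Illustrative usage sketch (not part of this header; assumes an
  // `ioptions`, `env_options`, `table_options`, comparator `icmp`, and an
  // already-opened `file_reader` of length `file_size` are in scope):
  //
  //   std::unique_ptr<TableReader> table;
  //   Status s = BlockBasedTable::Open(ReadOptions(), ioptions, env_options,
  //                                    table_options, icmp,
  //                                    std::move(file_reader), file_size,
  //                                    &table);
  //   if (s.ok()) {
  //     std::unique_ptr<InternalIterator> it(table->NewIterator(
  //         ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
  //         /*skip_filters=*/false, TableReaderCaller::kUncategorized));
  //     for (it->SeekToFirst(); it->Valid(); it->Next()) {
  //       // it->key() / it->value() hold the internal key and value.
  //     }
  //   }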
  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
  // retrieving data from the table.
  //
  // If successful, returns ok and sets "*table_reader" to the newly opened
  // table. The client should delete "*table_reader" when no longer needed.
  // If there was an error while initializing the table, sets "*table_reader"
  // to nullptr and returns a non-ok status.
  //
  // @param file must remain live while this Table is in use.
  // @param prefetch_index_and_filter_in_cache can be used to disable
  //    prefetching of index and filter blocks into block cache at startup.
  // @param skip_filters Disables loading/accessing the filter block. Overrides
  //    prefetch_index_and_filter_in_cache, so the filter will be skipped if
  //    both are set.
  // @param force_direct_prefetch if true, always prefetch into a RocksDB
  //    buffer, rather than calling RandomAccessFile::Prefetch().
  static Status Open(const ReadOptions& ro, const ImmutableOptions& ioptions,
                     const EnvOptions& env_options,
                     const BlockBasedTableOptions& table_options,
                     const InternalKeyComparator& internal_key_comparator,
                     std::unique_ptr<RandomAccessFileReader>&& file,
                     uint64_t file_size,
                     std::unique_ptr<TableReader>* table_reader,
                     const SliceTransform* prefix_extractor = nullptr,
                     bool prefetch_index_and_filter_in_cache = true,
                     bool skip_filters = false, int level = -1,
                     const bool immortal_table = false,
                     const SequenceNumber largest_seqno = 0,
                     bool force_direct_prefetch = false,
                     TailPrefetchStats* tail_prefetch_stats = nullptr,
                     BlockCacheTracer* const block_cache_tracer = nullptr,
                     size_t max_file_size_for_l0_meta_pin = 0);

  bool PrefixMayMatch(const Slice& internal_key,
                      const ReadOptions& read_options,
                      const SliceTransform* options_prefix_extractor,
                      const bool need_upper_bound_check,
                      BlockCacheLookupContext* lookup_context) const;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  // @param read_options Must outlive the returned iterator.
  // @param skip_filters Disables loading/accessing the filter block.
  // @param compaction_readahead_size its value will only be used if caller =
  //    kCompaction.
  InternalIterator* NewIterator(const ReadOptions&,
                                const SliceTransform* prefix_extractor,
                                Arena* arena, bool skip_filters,
                                TableReaderCaller caller,
                                size_t compaction_readahead_size = 0,
                                bool allow_unprepared_value = false) override;

  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options) override;

  // @param skip_filters Disables loading/accessing the filter block.
  Status Get(const ReadOptions& readOptions, const Slice& key,
             GetContext* get_context, const SliceTransform* prefix_extractor,
             bool skip_filters = false) override;

  void MultiGet(const ReadOptions& readOptions,
                const MultiGetContext::Range* mget_range,
                const SliceTransform* prefix_extractor,
                bool skip_filters = false) override;

  // Pre-fetch the disk blocks that correspond to the key range specified by
  // (begin, end). The call will return an error status in the event of an
  // IO or iteration error.
  Status Prefetch(const Slice* begin, const Slice* end) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file). The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key,
                               TableReaderCaller caller) override;
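  // Illustrative sketch (assumes a `table` as in the Open() example above and
  // a hypothetical `last_internal_key` that sorts at the end of the file):
  //
  //   // For a ~100 MB file, `off` lands near the file length; differences
  //   // reflect compression of the underlying on-disk blocks.
  //   uint64_t off = table->ApproximateOffsetOf(
  //       last_internal_key, TableReaderCaller::kUserApproximateSize);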
  // Given start and end keys, return the approximate data size in the file
  // between the keys. The returned value is in terms of file bytes, and so
  // includes effects like compression of the underlying data.
  // The start key must not be greater than the end key.
  uint64_t ApproximateSize(const Slice& start, const Slice& end,
                           TableReaderCaller caller) override;

  bool TEST_BlockInCache(const BlockHandle& handle) const;

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  // Set up the table for Compaction. Might change some parameters with
  // posix_fadvise.
  void SetupForCompaction() override;

  std::shared_ptr<const TableProperties> GetTableProperties() const override;

  size_t ApproximateMemoryUsage() const override;

  // Convert the SST file to a human-readable form.
  Status DumpTable(WritableFile* out_file) override;

  Status VerifyChecksum(const ReadOptions& readOptions,
                        TableReaderCaller caller) override;

  ~BlockBasedTable();

  bool TEST_FilterBlockInCache() const;
  bool TEST_IndexBlockInCache() const;

  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
    virtual ~IndexReader() = default;

    // Create an iterator for index access. If iter is null, a new object is
    // created on the heap, and the caller takes ownership of it.
    // If a non-null iter is passed in, it will be used, and the returned value
    // is either the same as iter or a new on-heap object that
    // wraps the passed iter. In the latter case the return value points
    // to a different object than iter, and the caller takes ownership of the
    // returned object.
    virtual InternalIteratorBase<IndexValue>* NewIterator(
        const ReadOptions& read_options, bool disable_prefix_seek,
        IndexBlockIter* iter, GetContext* get_context,
        BlockCacheLookupContext* lookup_context) = 0;

    // Report an approximation of how much memory has been used other than
    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;
    // Cache the dependencies of the index reader (e.g. the partitions
    // of a partitioned index).
    virtual Status CacheDependencies(const ReadOptions& /*ro*/,
                                     bool /* pin */) {
      return Status::OK();
    }
  };

  class IndexReaderCommon;

  static Slice GetCacheKey(const char* cache_key_prefix,
                           size_t cache_key_prefix_size,
                           const BlockHandle& handle, char* cache_key);
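  // A minimal sketch of how a block cache key can be assembled from the
  // pieces above (an assumption for illustration; the authoritative encoding
  // lives in the .cc file): the file-level prefix followed by the
  // varint-encoded block offset.
  //
  //   char buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  //   memcpy(buf, cache_key_prefix, cache_key_prefix_size);
  //   char* end =
  //       EncodeVarint64(buf + cache_key_prefix_size, handle.offset());
  //   Slice key(buf, static_cast<size_t>(end - buf));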
  // Retrieve all key-value pairs from data blocks in the table.
  // The keys retrieved are internal keys.
  Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);

  struct Rep;

  Rep* get_rep() { return rep_; }
  const Rep* get_rep() const { return rep_; }

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(
      const ReadOptions& ro, const BlockHandle& block_handle,
      TBlockIter* input_iter, BlockType block_type, GetContext* get_context,
      BlockCacheLookupContext* lookup_context, Status s,
      FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const;

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
                                   CachableEntry<Block>& block,
                                   TBlockIter* input_iter, Status s) const;

  class PartitionedIndexIteratorState;

  template <typename TBlocklike>
  friend class FilterBlockReaderCommon;

  friend class PartitionIndexReader;

  friend class UncompressionDictReader;

 protected:
  Rep* rep_;
  explicit BlockBasedTable(Rep* rep,
                           BlockCacheTracer* const block_cache_tracer)
      : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
  // No copying allowed
  explicit BlockBasedTable(const TableReader&) = delete;
  void operator=(const TableReader&) = delete;

 private:
  friend class MockedBlockBasedTable;
  friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test;
  static std::atomic<uint64_t> next_cache_key_id_;
  BlockCacheTracer* const block_cache_tracer_;

  void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
                             size_t usage) const;
  void UpdateCacheMissMetrics(BlockType block_type,
                              GetContext* get_context) const;
  void UpdateCacheInsertionMetrics(BlockType block_type,
                                   GetContext* get_context, size_t usage,
                                   bool redundant) const;
  Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
                                   BlockType block_type,
                                   GetContext* get_context,
                                   const Cache::CacheItemHelper* cache_helper,
                                   const Cache::CreateCallback& create_cb,
                                   Cache::Priority priority) const;

  // Either Block::NewDataIterator() or Block::NewIndexIterator().
  template <typename TBlockIter>
  static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
                                       BlockType block_type,
                                       TBlockIter* input_iter,
                                       bool block_contents_pinned);
  // If the block cache is enabled (compressed or uncompressed), looks for the
  // block identified by handle in (1) the uncompressed cache, (2) the
  // compressed cache, and then (3) the file. If found, inserts it into the
  // cache(s) that were searched unsuccessfully (e.g., if found in the file, it
  // will be added to both the uncompressed and compressed caches if they are
  // enabled).
  //
  // @param block_entry value is set to the uncompressed block if found. If
  //    found in the uncompressed block cache, also sets cache_handle to
  //    reference that block.
  template <typename TBlocklike>
  Status MaybeReadBlockAndLoadToCache(
      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
      CachableEntry<TBlocklike>* block_entry, BlockType block_type,
      GetContext* get_context, BlockCacheLookupContext* lookup_context,
      BlockContents* contents) const;

  // Similar to the above, with one crucial difference: it will retrieve the
  // block from the file even if there are no caches configured (assuming the
  // read options allow I/O).
  template <typename TBlocklike>
  Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
                       const ReadOptions& ro, const BlockHandle& handle,
                       const UncompressionDict& uncompression_dict,
                       CachableEntry<TBlocklike>* block_entry,
                       BlockType block_type, GetContext* get_context,
                       BlockCacheLookupContext* lookup_context,
                       bool for_compaction, bool use_cache) const;

  void RetrieveMultipleBlocks(
      const ReadOptions& options, const MultiGetRange* batch,
      const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
      autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
      autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
          results,
      char* scratch, const UncompressionDict& uncompression_dict) const;

  // Get the iterator from the index reader.
  //
  // If input_iter is not set, return a new Iterator.
  // If input_iter is set, try to update it and return it as Iterator.
  // However, note that in some cases the returned iterator may be different
  // from input_iter. In such cases the returned iterator should be freed.
  //
  // Note: an ErrorIterator with Status::Incomplete shall be returned if all of
  // the following conditions are met:
  //  1. We enabled table_options.cache_index_and_filter_blocks.
  //  2. The index is not present in the block cache.
  //  3. We disallowed any IO to be performed, that is, read_options ==
  //     kBlockCacheTier.
  InternalIteratorBase<IndexValue>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check,
      IndexBlockIter* input_iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) const;

  // Read a block from the block caches, if they are set: block_cache and
  // block_cache_compressed.
  // On success, Status::OK will be returned and @block will be populated with
  // a pointer to the block as well as its block handle.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status GetDataBlockFromCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed,
      const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
      const UncompressionDict& uncompression_dict, BlockType block_type,
      GetContext* get_context) const;
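  // A minimal sketch of the lookup order the methods above implement
  // (pseudocode in comments; the branch structure follows the doc comments,
  // not the literal private implementation):
  //
  //   if (found in block_cache)           -> return the uncompressed block
  //   else if (found in compressed cache) -> decompress, insert into
  //                                          block_cache, return it
  //   else if (IO is allowed)             -> read from file, decompress,
  //                                          insert into the enabled caches
  //   else                                -> Status::Incomplete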
  // Put a raw block (maybe compressed) into the corresponding block caches.
  // This method will perform decompression against raw_block if needed and
  // then populate the block caches.
  // On success, Status::OK will be returned; also @block will be populated
  // with the uncompressed block and its cache handle.
  //
  // Allocated memory managed by raw_block_contents will be transferred to
  // PutDataBlockToCache(). After the call, the object will be invalid.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status PutDataBlockToCache(const Slice& block_cache_key,
                             const Slice& compressed_block_cache_key,
                             Cache* block_cache, Cache* block_cache_compressed,
                             CachableEntry<TBlocklike>* cached_block,
                             BlockContents* raw_block_contents,
                             CompressionType raw_block_comp_type,
                             const UncompressionDict& uncompression_dict,
                             MemoryAllocator* memory_allocator,
                             BlockType block_type,
                             GetContext* get_context) const;

  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
  // after a call to Seek(key), until handle_result returns false.
  // May not make such a call if the filter policy says that the key is not
  // present.
  friend class TableCache;
  friend class BlockBasedTableBuilder;

  // Create an index reader based on the index type stored in the table.
  // Optionally, the user can pass a preloaded meta_index_iter for indexes that
  // need to access extra meta blocks for index construction. This parameter
  // helps avoid re-reading the meta index block if the caller has already
  // created one.
  Status CreateIndexReader(const ReadOptions& ro,
                           FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* preloaded_meta_index_iter,
                           bool use_cache, bool prefetch, bool pin,
                           BlockCacheLookupContext* lookup_context,
                           std::unique_ptr<IndexReader>* index_reader);

  bool FullFilterKeyMayMatch(const ReadOptions& read_options,
                             FilterBlockReader* filter, const Slice& user_key,
                             const bool no_io,
                             const SliceTransform* prefix_extractor,
                             GetContext* get_context,
                             BlockCacheLookupContext* lookup_context) const;

  void FullFilterKeysMayMatch(const ReadOptions& read_options,
                              FilterBlockReader* filter, MultiGetRange* range,
                              const bool no_io,
                              const SliceTransform* prefix_extractor,
                              BlockCacheLookupContext* lookup_context) const;
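  // A rough sketch of how a point lookup threads through the helpers above
  // (simplified; the real Get() in the .cc file also handles merge operands,
  // range tombstones, and tracing):
  //
  //   if (!FullFilterKeyMayMatch(...)) return Status::OK();  // filtered out
  //   auto* iiter = NewIndexIterator(...);
  //   iiter->Seek(key);                       // locate the block handle
  //   NewDataBlockIterator(...)->Seek(key);   // search within the block
  //   // Matching entries are fed to the GetContext.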
  // If force_direct_prefetch is true, always prefetch into a RocksDB buffer,
  // rather than calling RandomAccessFile::Prefetch().
  static Status PrefetchTail(
      const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
      bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
      const bool prefetch_all, const bool preload_all,
      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
  Status ReadMetaIndexBlock(const ReadOptions& ro,
                            FilePrefetchBuffer* prefetch_buffer,
                            std::unique_ptr<Block>* metaindex_block,
                            std::unique_ptr<InternalIterator>* iter);
  Status TryReadPropertiesWithGlobalSeqno(const ReadOptions& ro,
                                          FilePrefetchBuffer* prefetch_buffer,
                                          const Slice& handle_value,
                                          TableProperties** table_properties);
  Status ReadPropertiesBlock(const ReadOptions& ro,
                             FilePrefetchBuffer* prefetch_buffer,
                             InternalIterator* meta_iter,
                             const SequenceNumber largest_seqno);
  Status ReadRangeDelBlock(const ReadOptions& ro,
                           FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* meta_iter,
                           const InternalKeyComparator& internal_comparator,
                           BlockCacheLookupContext* lookup_context);
  Status PrefetchIndexAndFilterBlocks(
      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
      InternalIterator* meta_iter, BlockBasedTable* new_table,
      bool prefetch_all, const BlockBasedTableOptions& table_options,
      const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin,
      BlockCacheLookupContext* lookup_context);

  static BlockType GetBlockTypeForMetaBlockByName(
      const Slice& meta_block_name);

  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(const ReadOptions& read_options,
                                InternalIteratorBase<IndexValue>* index_iter);

  // Create the filter from the filter block.
  std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
      bool use_cache, bool prefetch, bool pin,
      BlockCacheLookupContext* lookup_context);

  static void SetupCacheKeyPrefix(Rep* rep);

  // Generate a cache key prefix from the file.
  template <typename TCache, typename TFile>
  static void GenerateCachePrefix(TCache* cc, TFile* file, char* buffer,
                                  size_t* size) {
    // Generate an id from the file.
    *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

    // If the prefix wasn't generated or was too long,
    // create one from the cache.
    if (cc != nullptr && *size == 0) {
      char* end = EncodeVarint64(buffer, cc->NewId());
      *size = static_cast<size_t>(end - buffer);
    }
  }

  // Size of all data blocks, maybe approximate.
  uint64_t GetApproximateDataSize();
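  // Illustrative use of GenerateCachePrefix() above (caller names such as
  // `block_cache` and `file` are assumptions for the example):
  //
  //   char prefix[kMaxCacheKeyPrefixSize];
  //   size_t prefix_size = 0;
  //   GenerateCachePrefix(block_cache.get(), file.get(), prefix,
  //                       &prefix_size);
  //   // If file->GetUniqueId() yielded nothing, `prefix` now holds the
  //   // varint encoding of a fresh block_cache->NewId(); e.g. id 42 encodes
  //   // to the single byte 0x2a, still distinguishing this file's blocks.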
  // Given an iterator, return its offset in the data block section of file.
  uint64_t ApproximateDataOffsetOf(
      const InternalIteratorBase<IndexValue>& index_iter,
      uint64_t data_size) const;

  // Helper functions for DumpTable()
  Status DumpIndexBlock(std::ostream& out_stream);
  Status DumpDataBlocks(std::ostream& out_stream);
  void DumpKeyValue(const Slice& key, const Slice& value,
                    std::ostream& out_stream);

  // A cumulative data block file read in MultiGet lower than this size will
  // use a stack buffer.
  static constexpr size_t kMultiGetReadStackBufSize = 8192;

  friend class PartitionedFilterBlockReader;
  friend class PartitionedFilterBlockTest;
  friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
};

// Maintains the state of a two-level iteration on a partitioned index
// structure.
class BlockBasedTable::PartitionedIndexIteratorState
    : public TwoLevelIteratorState {
 public:
  PartitionedIndexIteratorState(
      const BlockBasedTable* table,
      std::unordered_map<uint64_t, CachableEntry<Block>>* block_map);
  InternalIteratorBase<IndexValue>* NewSecondaryIterator(
      const BlockHandle& index_value) override;

 private:
  // Doesn't own table_
  const BlockBasedTable* table_;
  std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
};
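// A minimal sketch of what NewSecondaryIterator() above typically does (an
// assumption for illustration; the authoritative logic lives in the .cc
// file):
//
//   auto entry = block_map_->find(index_value.offset());
//   if (entry != block_map_->end()) {
//     // The partition block is pinned in block_map_; iterate it directly.
//     return entry->second.GetValue()->NewIndexIterator(...);
//   }
//   // Otherwise fall back for this partition (it was not pinned).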
// Stores all the properties associated with a BlockBasedTable.
// These are immutable.
struct BlockBasedTable::Rep {
  Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters,
      uint64_t _file_size, int _level, const bool _immortal_table)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr
                                   : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
        hash_index_allow_collision(false),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        global_seqno(kDisableGlobalSequenceNumber),
        file_size(_file_size),
        level(_level),
        immortal_table(_immortal_table) {}
  ~Rep() { status.PermitUncheckedError(); }
  const ImmutableOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;

  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  BlockHandle compression_dict_handle;

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
  bool hash_index_allow_collision;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal keys in the table module,
  // since the table module should not depend on the db module. However, to
  // make things easier and compatible with existing code, we introduce a
  // wrapper that allows a block to extract a prefix without knowing whether a
  // key is internal or not.
  // null if no prefix_extractor is passed in when opening the table reader.
  std::unique_ptr<SliceTransform> internal_prefix_transform;
  std::shared_ptr<const SliceTransform> table_prefix_extractor;

  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is
  // disabled and every key has its own seqno.
  SequenceNumber global_seqno;

  // Size of the table file on disk
  uint64_t file_size;

  // The level when the table is opened; could potentially change when a
  // trivial move is involved.
  int level;

  // If false, blocks in this file are definitely all uncompressed. Knowing
  // this before reading individual blocks enables certain optimizations.
  bool blocks_maybe_compressed = true;

  // If true, data blocks in this file are definitely ZSTD compressed. If false
  // they might not be. When false we skip creating a ZSTD digested
  // uncompression dictionary. Even if we get a false negative, things should
  // still work, just not as quickly.
  bool blocks_definitely_zstd_compressed = false;
  // These describe how the index is encoded.
  bool index_has_first_key = false;
  bool index_key_includes_seq = true;
  bool index_value_is_full = true;

  const bool immortal_table;

  SequenceNumber get_global_seqno(BlockType block_type) const {
    return (block_type == BlockType::kFilter ||
            block_type == BlockType::kCompressionDictionary)
               ? kDisableGlobalSequenceNumber
               : global_seqno;
  }

  uint64_t cf_id_for_tracing() const {
    return table_properties
               ? table_properties->column_family_id
               : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
                     kUnknownColumnFamily;
  }

  Slice cf_name_for_tracing() const {
    return table_properties ? table_properties->column_family_name
                            : BlockCacheTraceHelper::kUnknownColumnFamilyName;
  }

  uint32_t level_for_tracing() const {
    return level >= 0 ? level : UINT32_MAX;
  }

  uint64_t sst_number_for_tracing() const {
    return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
  }

  void CreateFilePrefetchBuffer(size_t readahead_size,
                                size_t max_readahead_size,
                                std::unique_ptr<FilePrefetchBuffer>* fpb,
                                bool implicit_auto_readahead) const {
    fpb->reset(new FilePrefetchBuffer(
        file.get(), readahead_size, max_readahead_size,
        !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */,
        implicit_auto_readahead));
  }

  void CreateFilePrefetchBufferIfNotExists(
      size_t readahead_size, size_t max_readahead_size,
      std::unique_ptr<FilePrefetchBuffer>* fpb,
      bool implicit_auto_readahead) const {
    if (!(*fpb)) {
      CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb,
                               implicit_auto_readahead);
    }
  }
};
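// Illustrative sketch of pairing the adapter below with a std::ostream
// (assumes a valid `WritableFile* out_file`, e.g. the one passed to
// DumpTable()):
//
//   WritableFileStringStreamAdapter buf(out_file);
//   std::ostream out(&buf);
//   out << "hello" << std::endl;  // each write forwards to Append()
//   // Check out.good() afterwards: a failed Append() surfaces as EOF here.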
// This is an adapter class for `WritableFile` to be used for `std::ostream`.
// The adapter wraps a `WritableFile`, which can be passed to a `std::ostream`
// constructor for storing streaming data.
// Note:
//  * This adapter doesn't provide any buffering: each write is forwarded to
//    `WritableFile->Append()` directly.
//  * For a failed write, the user needs to check the status via
//    `ostream.good()`.
class WritableFileStringStreamAdapter : public std::stringbuf {
 public:
  explicit WritableFileStringStreamAdapter(WritableFile* writable_file)
      : file_(writable_file) {}

  // Override overflow() to handle `sputc()`. There are cases that will not go
  // through `xsputn()`, e.g. when `std::endl` or an unsigned long long is
  // written by `os.put()` directly, which calls `sputc()`. By internal
  // implementation:
  //    int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) {  // put a character
  //      return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch)
  //                            : overflow(_Traits::to_int_type(_Ch));
  //    }
  // As we explicitly disabled buffering (_Pnavail() is always 0), every write
  // not captured by xsputn() becomes an overflow here.
  int overflow(int ch = EOF) override {
    if (ch != EOF) {
      Status s = file_->Append(Slice((char*)&ch, 1));
      if (s.ok()) {
        return ch;
      }
    }
    return EOF;
  }

  std::streamsize xsputn(char const* p, std::streamsize n) override {
    Status s = file_->Append(Slice(p, n));
    if (!s.ok()) {
      return 0;
    }
    return n;
  }

 private:
  WritableFile* file_;
};

}  // namespace ROCKSDB_NAMESPACE