//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include "db/range_tombstone_fragmenter.h"
#include "file/filename.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_type.h"
#include "table/block_based/cachable_entry.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/uncompression_dict_reader.h"
#include "table/table_properties_internal.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"

#include "trace_replay/block_cache_tracer.h"

namespace ROCKSDB_NAMESPACE {

class Cache;
class FilterBlockReader;
class BlockBasedFilterBlockReader;
class FullFilterBlockReader;
class Footer;
class InternalKeyComparator;
class Iterator;
class FSRandomAccessFile;
class TableCache;
class TableReader;
class WritableFile;
struct BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;

typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;

// Reader class for BlockBasedTable format.
// For the format of BlockBasedTable refer to
// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
// This is the default table type. Data is chunked into fixed-size blocks and
// each block in turn stores entries. When storing data, we can compress
// and/or encode data efficiently within a block, which often results in a
// much smaller data size than the raw data size. To retrieve a record, we
// first locate the block where the target record may reside, then read the
// block into memory, and finally search for the record within the block. To
// avoid frequent reads of the same block, we introduced the block cache,
// which keeps loaded blocks in memory.
class BlockBasedTable : public TableReader {
 public:
  static const std::string kFilterBlockPrefix;
  static const std::string kFullFilterBlockPrefix;
  static const std::string kPartitionedFilterBlockPrefix;
  // The longest prefix of the cache key used to identify blocks.
  // For Posix files the unique ID is three varints.
  static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
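  // (Worked arithmetic, assuming kMaxVarint64Length == 10 as defined in
  // util/coding.h: three varints of at most 10 bytes each, plus one byte,
  // caps the prefix at 31 bytes.)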

  // All the fields below control iterator readahead
  static const size_t kInitAutoReadaheadSize = 8 * 1024;
  static const int kMinNumFileReadsToStartAutoReadahead = 2;
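
  // Illustrative sketch of how these constants are assumed to interact (a
  // hedged description of the implicit auto-readahead policy, not a
  // definition of it): after kMinNumFileReadsToStartAutoReadahead sequential
  // reads, readahead starts at kInitAutoReadaheadSize and roughly doubles on
  // each further sequential read, capped by
  // BlockBasedTableOptions::max_auto_readahead_size:
  //
  //   size_t readahead = kInitAutoReadaheadSize;  // 8 KB
  //   while (reads_are_sequential) {
  //     readahead = std::min(readahead * 2, max_auto_readahead_size);
  //   }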

  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
  // retrieving data from the table.
  //
  // If successful, returns ok and sets "*table_reader" to the newly opened
  // table.  The client should delete "*table_reader" when no longer needed.
  // If there was an error while initializing the table, sets "*table_reader"
  // to nullptr and returns a non-ok status.
  //
  // @param file must remain live while this Table is in use.
  // @param prefetch_index_and_filter_in_cache can be used to disable
  //    prefetching of index and filter blocks into block cache at startup
  // @param skip_filters Disables loading/accessing the filter block. Overrides
  //    prefetch_index_and_filter_in_cache, so the filter will be skipped if
  //    both are set.
  // @param force_direct_prefetch if true, always prefetch into RocksDB's
  //    buffer rather than calling RandomAccessFile::Prefetch().
  static Status Open(const ReadOptions& ro, const ImmutableOptions& ioptions,
                     const EnvOptions& env_options,
                     const BlockBasedTableOptions& table_options,
                     const InternalKeyComparator& internal_key_comparator,
                     std::unique_ptr<RandomAccessFileReader>&& file,
                     uint64_t file_size,
                     std::unique_ptr<TableReader>* table_reader,
                     const SliceTransform* prefix_extractor = nullptr,
                     bool prefetch_index_and_filter_in_cache = true,
                     bool skip_filters = false, int level = -1,
                     const bool immortal_table = false,
                     const SequenceNumber largest_seqno = 0,
                     bool force_direct_prefetch = false,
                     TailPrefetchStats* tail_prefetch_stats = nullptr,
                     BlockCacheTracer* const block_cache_tracer = nullptr,
                     size_t max_file_size_for_l0_meta_pin = 0);
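
  // A minimal usage sketch (illustrative only; `ioptions`, `env_options`,
  // `table_options`, `icmp`, `file`, and `file_size` are assumed to have been
  // set up by the caller, and error handling is elided):
  //
  //   std::unique_ptr<TableReader> table_reader;
  //   Status s = BlockBasedTable::Open(
  //       ReadOptions(), ioptions, env_options, table_options, icmp,
  //       std::move(file), file_size, &table_reader);
  //   if (s.ok()) {
  //     // table_reader now owns the open table and releases it on reset.
  //   }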

  bool PrefixMayMatch(const Slice& internal_key,
                      const ReadOptions& read_options,
                      const SliceTransform* options_prefix_extractor,
                      const bool need_upper_bound_check,
                      BlockCacheLookupContext* lookup_context) const;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  // @param read_options Must outlive the returned iterator.
  // @param skip_filters Disables loading/accessing the filter block
  // @param compaction_readahead_size Used only when caller == kCompaction.
  InternalIterator* NewIterator(const ReadOptions&,
                                const SliceTransform* prefix_extractor,
                                Arena* arena, bool skip_filters,
                                TableReaderCaller caller,
                                size_t compaction_readahead_size = 0,
                                bool allow_unprepared_value = false) override;
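
  // Illustrative usage sketch (assumes an open `table` and a `read_options`
  // that outlives the iterator; with a null arena the iterator is
  // heap-allocated, so the caller owns and frees it):
  //
  //   std::unique_ptr<InternalIterator> iter(table->NewIterator(
  //       read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
  //       /*skip_filters=*/false, TableReaderCaller::kUserIterator));
  //   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  //     // iter->key() is an internal key; iter->value() is the value.
  //   }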

  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options) override;

  // @param skip_filters Disables loading/accessing the filter block
  Status Get(const ReadOptions& readOptions, const Slice& key,
             GetContext* get_context, const SliceTransform* prefix_extractor,
             bool skip_filters = false) override;

  void MultiGet(const ReadOptions& readOptions,
                const MultiGetContext::Range* mget_range,
                const SliceTransform* prefix_extractor,
                bool skip_filters = false) override;

  // Pre-fetch the disk blocks that correspond to the key range specified by
  // (begin, end). The call will return an error status in the event of an
  // IO or iteration error.
  Status Prefetch(const Slice* begin, const Slice* end) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file). The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key,
                               TableReaderCaller caller) override;

  // Given start and end keys, return the approximate data size in the file
  // between the keys. The returned value is in terms of file bytes, and so
  // includes effects like compression of the underlying data.
  // The start key must not be greater than the end key.
  uint64_t ApproximateSize(const Slice& start, const Slice& end,
                           TableReaderCaller caller) override;
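
  // Illustrative sketch (hypothetical locals; both arguments are internal
  // keys): estimate the on-disk bytes covered by a key range.
  //
  //   uint64_t bytes = table->ApproximateSize(
  //       start_key, end_key, TableReaderCaller::kUserApproximateSize);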

  bool TEST_BlockInCache(const BlockHandle& handle) const;

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  // Set up the table for compaction. May change some parameters with
  // posix_fadvise.
  void SetupForCompaction() override;

  std::shared_ptr<const TableProperties> GetTableProperties() const override;

  size_t ApproximateMemoryUsage() const override;

  // Convert the SST file to a human-readable form.
  Status DumpTable(WritableFile* out_file) override;

  Status VerifyChecksum(const ReadOptions& readOptions,
                        TableReaderCaller caller) override;

  ~BlockBasedTable();

  bool TEST_FilterBlockInCache() const;
  bool TEST_IndexBlockInCache() const;

  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
    virtual ~IndexReader() = default;

    // Create an iterator for index access. If iter is null, a new object
    // is created on the heap, and the caller takes ownership of it.
    // If a non-null iter is passed in, it will be used, and the returned value
    // is either the same as iter or a new on-heap object that
    // wraps the passed iter. In the latter case the return value points
    // to a different object than iter, and the caller owns the
    // returned object.
    virtual InternalIteratorBase<IndexValue>* NewIterator(
        const ReadOptions& read_options, bool disable_prefix_seek,
        IndexBlockIter* iter, GetContext* get_context,
        BlockCacheLookupContext* lookup_context) = 0;

    // Report an approximation of how much memory has been used other than
    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;
    // Cache the dependencies of the index reader (e.g. the partitions
    // of a partitioned index).
    virtual Status CacheDependencies(const ReadOptions& /*ro*/,
                                     bool /* pin */) {
      return Status::OK();
    }
  };

  class IndexReaderCommon;

  static Slice GetCacheKey(const char* cache_key_prefix,
                           size_t cache_key_prefix_size,
                           const BlockHandle& handle, char* cache_key);
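
  // Illustrative sketch of the expected key layout (an assumption about the
  // implementation: the file's cache key prefix followed by the
  // varint-encoded block offset; `rep` and `handle` are hypothetical locals):
  //
  //   char buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  //   Slice key = GetCacheKey(rep->cache_key_prefix,
  //                           rep->cache_key_prefix_size, handle, buf);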

  // Retrieve all key-value pairs from data blocks in the table.
  // The keys retrieved are internal keys.
  Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);

  struct Rep;

  Rep* get_rep() { return rep_; }
  const Rep* get_rep() const { return rep_; }

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(
      const ReadOptions& ro, const BlockHandle& block_handle,
      TBlockIter* input_iter, BlockType block_type, GetContext* get_context,
      BlockCacheLookupContext* lookup_context, Status s,
      FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const;

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
                                   CachableEntry<Block>& block,
                                   TBlockIter* input_iter, Status s) const;

  class PartitionedIndexIteratorState;

  template <typename TBlocklike>
  friend class FilterBlockReaderCommon;

  friend class PartitionIndexReader;

  friend class UncompressionDictReader;

 protected:
  Rep* rep_;
  explicit BlockBasedTable(Rep* rep,
                           BlockCacheTracer* const block_cache_tracer)
      : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
  // No copying allowed
  BlockBasedTable(const BlockBasedTable&) = delete;
  void operator=(const BlockBasedTable&) = delete;

 private:
  friend class MockedBlockBasedTable;
  friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test;
  static std::atomic<uint64_t> next_cache_key_id_;
  BlockCacheTracer* const block_cache_tracer_;

  void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
                             size_t usage) const;
  void UpdateCacheMissMetrics(BlockType block_type,
                              GetContext* get_context) const;
  void UpdateCacheInsertionMetrics(BlockType block_type,
                                   GetContext* get_context, size_t usage,
                                   bool redundant) const;
  Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
                                   BlockType block_type,
                                   GetContext* get_context,
                                   const Cache::CacheItemHelper* cache_helper,
                                   const Cache::CreateCallback& create_cb,
                                   Cache::Priority priority) const;

  // Either Block::NewDataIterator() or Block::NewIndexIterator().
  template <typename TBlockIter>
  static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
                                       BlockType block_type,
                                       TBlockIter* input_iter,
                                       bool block_contents_pinned);

  // If block cache enabled (compressed or uncompressed), looks for the block
  // identified by handle in (1) uncompressed cache, (2) compressed cache, and
  // then (3) file. If found, inserts into the cache(s) that were searched
  // unsuccessfully (e.g., if found in file, will add to both uncompressed and
  // compressed caches if they're enabled).
  //
  // @param block_entry value is set to the uncompressed block if found. If
  //    in uncompressed block cache, also sets cache_handle to reference that
  //    block.
  template <typename TBlocklike>
  Status MaybeReadBlockAndLoadToCache(
      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
      CachableEntry<TBlocklike>* block_entry, BlockType block_type,
      GetContext* get_context, BlockCacheLookupContext* lookup_context,
      BlockContents* contents) const;

  // Similar to the above, with one crucial difference: it will retrieve the
  // block from the file even if there are no caches configured (assuming the
  // read options allow I/O).
  template <typename TBlocklike>
  Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
                       const ReadOptions& ro, const BlockHandle& handle,
                       const UncompressionDict& uncompression_dict,
                       CachableEntry<TBlocklike>* block_entry,
                       BlockType block_type, GetContext* get_context,
                       BlockCacheLookupContext* lookup_context,
                       bool for_compaction, bool use_cache) const;

  void RetrieveMultipleBlocks(
      const ReadOptions& options, const MultiGetRange* batch,
      const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
      autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
      autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
          results,
      char* scratch, const UncompressionDict& uncompression_dict) const;

  // Get the iterator from the index reader.
  //
  // If input_iter is not set, return a new Iterator.
  // If input_iter is set, try to update it and return it as Iterator.
  // However note that in some cases the returned iterator may be different
  // from input_iter. In such cases the returned iterator should be freed.
  //
  // Note: an ErrorIterator with Status::Incomplete shall be returned if all
  // of the following conditions are met:
  //  1. We enabled table_options.cache_index_and_filter_blocks.
  //  2. The index is not present in the block cache.
  //  3. We disallowed any IO from being performed, that is,
  //     read_options.read_tier == kBlockCacheTier.
  InternalIteratorBase<IndexValue>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check,
      IndexBlockIter* input_iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) const;

  // Read a block from the block caches (if set): block_cache and
  // block_cache_compressed.
  // On success, Status::OK will be returned and @block will be populated with
  // a pointer to the block as well as its block handle.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status GetDataBlockFromCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed,
      const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
      const UncompressionDict& uncompression_dict, BlockType block_type,
      GetContext* get_context) const;

  // Put a raw block (maybe compressed) into the corresponding block caches.
  // This method will perform decompression against raw_block_contents if
  // needed and then populate the block caches.
  // On success, Status::OK will be returned; also @cached_block will be
  // populated with the uncompressed block and its cache handle.
  //
  // Allocated memory managed by raw_block_contents will be transferred to
  // PutDataBlockToCache(). After the call, the object will be invalid.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status PutDataBlockToCache(const Slice& block_cache_key,
                             const Slice& compressed_block_cache_key,
                             Cache* block_cache, Cache* block_cache_compressed,
                             CachableEntry<TBlocklike>* cached_block,
                             BlockContents* raw_block_contents,
                             CompressionType raw_block_comp_type,
                             const UncompressionDict& uncompression_dict,
                             MemoryAllocator* memory_allocator,
                             BlockType block_type,
                             GetContext* get_context) const;

  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
  // after a call to Seek(key), until handle_result returns false.
  // May not make such a call if filter policy says that key is not present.
  friend class TableCache;
  friend class BlockBasedTableBuilder;

  // Create an index reader based on the index type stored in the table.
  // Optionally, the caller can pass a preloaded meta_index_iter for indexes
  // that need to access extra meta blocks during construction. This parameter
  // helps avoid re-reading the meta index block if the caller has already
  // created one.
  Status CreateIndexReader(const ReadOptions& ro,
                           FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* preloaded_meta_index_iter,
                           bool use_cache, bool prefetch, bool pin,
                           BlockCacheLookupContext* lookup_context,
                           std::unique_ptr<IndexReader>* index_reader);

  bool FullFilterKeyMayMatch(const ReadOptions& read_options,
                             FilterBlockReader* filter, const Slice& user_key,
                             const bool no_io,
                             const SliceTransform* prefix_extractor,
                             GetContext* get_context,
                             BlockCacheLookupContext* lookup_context) const;

  void FullFilterKeysMayMatch(const ReadOptions& read_options,
                              FilterBlockReader* filter, MultiGetRange* range,
                              const bool no_io,
                              const SliceTransform* prefix_extractor,
                              BlockCacheLookupContext* lookup_context) const;

  // If force_direct_prefetch is true, always prefetch into RocksDB's
  // buffer rather than calling RandomAccessFile::Prefetch().
  static Status PrefetchTail(
      const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
      bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
      const bool prefetch_all, const bool preload_all,
      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
  Status ReadMetaIndexBlock(const ReadOptions& ro,
                            FilePrefetchBuffer* prefetch_buffer,
                            std::unique_ptr<Block>* metaindex_block,
                            std::unique_ptr<InternalIterator>* iter);
  Status TryReadPropertiesWithGlobalSeqno(const ReadOptions& ro,
                                          FilePrefetchBuffer* prefetch_buffer,
                                          const Slice& handle_value,
                                          TableProperties** table_properties);
  Status ReadPropertiesBlock(const ReadOptions& ro,
                             FilePrefetchBuffer* prefetch_buffer,
                             InternalIterator* meta_iter,
                             const SequenceNumber largest_seqno);
  Status ReadRangeDelBlock(const ReadOptions& ro,
                           FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* meta_iter,
                           const InternalKeyComparator& internal_comparator,
                           BlockCacheLookupContext* lookup_context);
  Status PrefetchIndexAndFilterBlocks(
      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
      InternalIterator* meta_iter, BlockBasedTable* new_table,
      bool prefetch_all, const BlockBasedTableOptions& table_options,
      const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin,
      BlockCacheLookupContext* lookup_context);

  static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);

  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(const ReadOptions& read_options,
                                InternalIteratorBase<IndexValue>* index_iter);

  // Create the filter from the filter block.
  std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
      bool use_cache, bool prefetch, bool pin,
      BlockCacheLookupContext* lookup_context);

  static void SetupCacheKeyPrefix(Rep* rep);

  // Generate a cache key prefix from the file
  template <typename TCache, typename TFile>
  static void GenerateCachePrefix(TCache* cc, TFile* file, char* buffer,
                                  size_t* size) {
    // generate an id from the file
    *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

    // If the prefix wasn't generated or was too long,
    // create one from the cache.
    if (cc != nullptr && *size == 0) {
      char* end = EncodeVarint64(buffer, cc->NewId());
      *size = static_cast<size_t>(end - buffer);
    }
  }
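
  // Illustrative call (hypothetical locals; `block_cache` is a Cache* and
  // `file` a RandomAccessFile*): derive this file's cache key prefix, falling
  // back to a cache-assigned id when the file exposes no unique id.
  //
  //   GenerateCachePrefix(block_cache, file, rep->cache_key_prefix,
  //                       &rep->cache_key_prefix_size);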

  // Size of all data blocks, maybe approximate
  uint64_t GetApproximateDataSize();

  // Given an index iterator, return its approximate offset in the data block
  // section of the file.
  uint64_t ApproximateDataOffsetOf(
      const InternalIteratorBase<IndexValue>& index_iter,
      uint64_t data_size) const;

  // Helper functions for DumpTable()
  Status DumpIndexBlock(std::ostream& out_stream);
  Status DumpDataBlocks(std::ostream& out_stream);
  void DumpKeyValue(const Slice& key, const Slice& value,
                    std::ostream& out_stream);

  // A cumulative data block file read in MultiGet lower than this size will
  // use a stack buffer
  static constexpr size_t kMultiGetReadStackBufSize = 8192;

  friend class PartitionedFilterBlockReader;
  friend class PartitionedFilterBlockTest;
  friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
};

// Maintains the state of a two-level iteration over a partitioned index
// structure.
class BlockBasedTable::PartitionedIndexIteratorState
    : public TwoLevelIteratorState {
 public:
  PartitionedIndexIteratorState(
      const BlockBasedTable* table,
      std::unordered_map<uint64_t, CachableEntry<Block>>* block_map);
  InternalIteratorBase<IndexValue>* NewSecondaryIterator(
      const BlockHandle& index_value) override;

 private:
  // Does not own table_
  const BlockBasedTable* table_;
  std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
};

// Stores all the properties associated with a BlockBasedTable.
// These are immutable.
struct BlockBasedTable::Rep {
  Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters,
      uint64_t _file_size, int _level, const bool _immortal_table)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
        hash_index_allow_collision(false),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        global_seqno(kDisableGlobalSequenceNumber),
        file_size(_file_size),
        level(_level),
        immortal_table(_immortal_table) {}
  ~Rep() { status.PermitUncheckedError(); }
  const ImmutableOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;

  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  BlockHandle compression_dict_handle;

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
  bool hash_index_allow_collision;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  // null if no prefix_extractor is passed in when opening the table reader.
  std::unique_ptr<SliceTransform> internal_prefix_transform;
  std::shared_ptr<const SliceTransform> table_prefix_extractor;

  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all keys in this file will have the same
  // seqno, with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is
  // disabled and every key has its own seqno.
  SequenceNumber global_seqno;

  // Size of the table file on disk
  uint64_t file_size;

  // The level at which the table was opened; it could potentially change when
  // a trivial move is involved.
  int level;

  // If false, blocks in this file are definitely all uncompressed. Knowing
  // this before reading individual blocks enables certain optimizations.
  bool blocks_maybe_compressed = true;

  // If true, data blocks in this file are definitely ZSTD compressed. If false
  // they might not be. When false we skip creating a ZSTD digested
  // uncompression dictionary. Even if we get a false negative, things should
  // still work, just not as quickly.
  bool blocks_definitely_zstd_compressed = false;

  // These describe how the index is encoded.
  bool index_has_first_key = false;
  bool index_key_includes_seq = true;
  bool index_value_is_full = true;

  const bool immortal_table;

  SequenceNumber get_global_seqno(BlockType block_type) const {
    return (block_type == BlockType::kFilter ||
            block_type == BlockType::kCompressionDictionary)
               ? kDisableGlobalSequenceNumber
               : global_seqno;
  }

  uint64_t cf_id_for_tracing() const {
    return table_properties
               ? table_properties->column_family_id
               : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
                     kUnknownColumnFamily;
  }

  Slice cf_name_for_tracing() const {
    return table_properties ? table_properties->column_family_name
                            : BlockCacheTraceHelper::kUnknownColumnFamilyName;
  }

  uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; }

  uint64_t sst_number_for_tracing() const {
    return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
  }

  void CreateFilePrefetchBuffer(size_t readahead_size,
                                size_t max_readahead_size,
                                std::unique_ptr<FilePrefetchBuffer>* fpb,
                                bool implicit_auto_readahead) const {
    fpb->reset(new FilePrefetchBuffer(
        file.get(), readahead_size, max_readahead_size,
        !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */,
        implicit_auto_readahead));
  }

  void CreateFilePrefetchBufferIfNotExists(
      size_t readahead_size, size_t max_readahead_size,
      std::unique_ptr<FilePrefetchBuffer>* fpb,
      bool implicit_auto_readahead) const {
    if (!(*fpb)) {
      CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb,
                               implicit_auto_readahead);
    }
  }
};

// This is an adapter class for `WritableFile` to be used for `std::ostream`.
// The adapter wraps a `WritableFile`, which can be passed to a `std::ostream`
// constructor for storing streaming data.
// Note:
//  * This adapter doesn't provide any buffering; each write is forwarded to
//    `WritableFile->Append()` directly.
//  * For a failed write, the user needs to check the status via
//    `ostream.good()`.
class WritableFileStringStreamAdapter : public std::stringbuf {
 public:
  explicit WritableFileStringStreamAdapter(WritableFile* writable_file)
      : file_(writable_file) {}

  // Override overflow() to handle `sputc()`. There are cases that will not go
  // through `xsputn()`, e.g. when `std::endl` or an unsigned long long is
  // written by `os.put()` directly, which calls `sputc()`. By internal
  // implementation:
  //    int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) {  // put a character
  //        return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) :
  //        overflow(_Traits::to_int_type(_Ch));
  //    }
  // As we explicitly disabled buffering (_Pnavail() is always 0), every write
  // not captured by xsputn() becomes an overflow here.
  int overflow(int ch = EOF) override {
    if (ch != EOF) {
      // Narrow to a char first so we write the intended byte regardless of
      // the platform's int byte order.
      char c = static_cast<char>(ch);
      Status s = file_->Append(Slice(&c, 1));
      if (s.ok()) {
        return ch;
      }
    }
    return EOF;
  }

  std::streamsize xsputn(char const* p, std::streamsize n) override {
    Status s = file_->Append(Slice(p, static_cast<size_t>(n)));
    if (!s.ok()) {
      return 0;
    }
    return n;
  }

 private:
  WritableFile* file_;
};
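
// Illustrative use, mirroring how DumpTable() might drive this adapter
// (locals are hypothetical; error handling elided):
//
//   WritableFileStringStreamAdapter out_buf(out_file);
//   std::ostream out_stream(&out_buf);
//   out_stream << "Table dump:\n";
//   if (!out_stream.good()) {
//     // A WritableFile::Append() call failed.
//   }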

}  // namespace ROCKSDB_NAMESPACE