//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <stdint.h>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "db/range_tombstone_fragmenter.h"
#include "file/filename.h"
#include "file/random_access_file_reader.h"
#include "options/cf_options.h"
#include "rocksdb/options.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_type.h"
#include "table/block_based/cachable_entry.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/uncompression_dict_reader.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/table_properties_internal.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"
#include "trace_replay/block_cache_tracer.h"
#include "util/coding.h"
#include "util/user_comparator_wrapper.h"

namespace ROCKSDB_NAMESPACE {

class Cache;
class FilterBlockReader;
class BlockBasedFilterBlockReader;
class FullFilterBlockReader;
class Footer;
class InternalKeyComparator;
class Iterator;
class FSRandomAccessFile;
class TableCache;
class TableReader;
class WritableFile;
struct BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;

typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;

// Reader class for the BlockBasedTable format.
// For the format of BlockBasedTable refer to
// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
// This is the default table type. Data is chunked into fixed-size blocks and
// each block, in turn, stores entries. When storing data, we can compress
// and/or encode data efficiently within a block, which often results in a
// much smaller data size compared with the raw data size. As for record
// retrieval, we first locate the block where the target record may reside,
// then read the block into memory, and finally search for the record within
// the block. Of course, to avoid frequent reads of the same block, we
// introduced the block cache to keep the loaded blocks in memory.
class BlockBasedTable : public TableReader {
 public:
  static const std::string kFilterBlockPrefix;
  static const std::string kFullFilterBlockPrefix;
  static const std::string kPartitionedFilterBlockPrefix;
  // The longest prefix of the cache key used to identify blocks.
  // For Posix files the unique ID is three varints.
  static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
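  // (With kMaxVarint64Length == 10, as defined in util/coding.h, this works
  // out to at most 3 * 10 + 1 = 31 bytes of prefix.)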

  // All the below fields control iterator readahead
  static const size_t kInitAutoReadaheadSize = 8 * 1024;
  // Experiments found that a 256 KB readahead size provides the best
  // performance for auto readahead. The experiment data is in PR #3282.
  static const size_t kMaxAutoReadaheadSize;
  static const int kMinNumFileReadsToStartAutoReadahead = 2;

  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
  // retrieving data from the table.
  //
  // If successful, returns ok and sets "*table_reader" to the newly opened
  // table.  The client should delete "*table_reader" when no longer needed.
  // If there was an error while initializing the table, sets "*table_reader"
  // to nullptr and returns a non-ok status.
  //
  // @param file must remain live while this Table is in use.
  // @param prefetch_index_and_filter_in_cache can be used to disable
  //    prefetching of index and filter blocks into block cache at startup
  // @param skip_filters Disables loading/accessing the filter block. Overrides
  //    prefetch_index_and_filter_in_cache, so the filter will be skipped if
  //    both are set.
  static Status Open(const ImmutableCFOptions& ioptions,
                     const EnvOptions& env_options,
                     const BlockBasedTableOptions& table_options,
                     const InternalKeyComparator& internal_key_comparator,
                     std::unique_ptr<RandomAccessFileReader>&& file,
                     uint64_t file_size,
                     std::unique_ptr<TableReader>* table_reader,
                     const SliceTransform* prefix_extractor = nullptr,
                     bool prefetch_index_and_filter_in_cache = true,
                     bool skip_filters = false, int level = -1,
                     const bool immortal_table = false,
                     const SequenceNumber largest_seqno = 0,
                     TailPrefetchStats* tail_prefetch_stats = nullptr,
                     BlockCacheTracer* const block_cache_tracer = nullptr);
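  // Example usage of Open() (an illustrative sketch; `ioptions`,
  // `env_options`, `table_options`, `icomp`, `file`, and `file_size` are
  // assumed to have been prepared by the caller, much as TableCache does
  // internally):
  //
  //   std::unique_ptr<TableReader> table_reader;
  //   Status s = BlockBasedTable::Open(ioptions, env_options, table_options,
  //                                    icomp, std::move(file), file_size,
  //                                    &table_reader);
  //   if (s.ok()) { /* table_reader is ready for Get()/NewIterator() */ }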

  bool PrefixMayMatch(const Slice& internal_key,
                      const ReadOptions& read_options,
                      const SliceTransform* options_prefix_extractor,
                      const bool need_upper_bound_check,
                      BlockCacheLookupContext* lookup_context) const;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  // @param skip_filters Disables loading/accessing the filter block
  // compaction_readahead_size: its value will only be used if caller =
  // kCompaction.
  InternalIterator* NewIterator(const ReadOptions&,
                                const SliceTransform* prefix_extractor,
                                Arena* arena, bool skip_filters,
                                TableReaderCaller caller,
                                size_t compaction_readahead_size = 0) override;
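  // Example usage (an illustrative sketch; `table` is assumed to be an open
  // BlockBasedTable, and the iterator is heap-allocated since `arena` is
  // nullptr):
  //
  //   std::unique_ptr<InternalIterator> it(table->NewIterator(
  //       ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
  //       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
  //     // it->key() / it->value() yield internal key-value pairs.
  //   }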

  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options) override;

  // @param skip_filters Disables loading/accessing the filter block
  Status Get(const ReadOptions& readOptions, const Slice& key,
             GetContext* get_context, const SliceTransform* prefix_extractor,
             bool skip_filters = false) override;
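  // Example usage (an illustrative sketch; `table` is assumed open and
  // `get_context` prepared by the caller, as TableCache::Get() does in
  // practice; `ikey` is an internal key):
  //
  //   Status s = table->Get(ReadOptions(), ikey, get_context,
  //                         /*prefix_extractor=*/nullptr);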

  void MultiGet(const ReadOptions& readOptions,
                const MultiGetContext::Range* mget_range,
                const SliceTransform* prefix_extractor,
                bool skip_filters = false) override;

  // Pre-fetch the disk blocks that correspond to the key range specified by
  // (begin, end). The call will return an error status in the event of an
  // IO or iteration error.
  Status Prefetch(const Slice* begin, const Slice* end) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file). The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key,
                               TableReaderCaller caller) override;

  // Given start and end keys, return the approximate data size in the file
  // between the keys. The returned value is in terms of file bytes, and so
  // includes effects like compression of the underlying data.
  // The start key must not be greater than the end key.
  uint64_t ApproximateSize(const Slice& start, const Slice& end,
                           TableReaderCaller caller) override;
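  // Example usage (an illustrative sketch; `start_key` and `end_key` are
  // assumed to be internal keys with start_key <= end_key):
  //
  //   uint64_t bytes = table->ApproximateSize(
  //       start_key, end_key, TableReaderCaller::kUserApproximateSize);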

  bool TEST_BlockInCache(const BlockHandle& handle) const;

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  // Set up the table for compaction. Might change some parameters with
  // posix_fadvise.
  void SetupForCompaction() override;

  std::shared_ptr<const TableProperties> GetTableProperties() const override;

  size_t ApproximateMemoryUsage() const override;

  // Convert the SST file to a human-readable form.
  Status DumpTable(WritableFile* out_file) override;

  Status VerifyChecksum(const ReadOptions& readOptions,
                        TableReaderCaller caller) override;
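  // Example usage (an illustrative sketch):
  //
  //   Status s = table->VerifyChecksum(
  //       ReadOptions(), TableReaderCaller::kUserVerifyChecksum);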

  ~BlockBasedTable();

  bool TEST_FilterBlockInCache() const;
  bool TEST_IndexBlockInCache() const;

  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
    virtual ~IndexReader() = default;

    // Create an iterator for index access. If iter is null, a new object
    // is created on the heap, and the caller takes ownership of it.
    // If a non-null iter is passed in, it will be used, and the returned
    // value is either the same as iter or a new on-heap object that
    // wraps the passed iter. In the latter case the return value points
    // to a different object than iter, and the caller owns the
    // returned object.
    virtual InternalIteratorBase<IndexValue>* NewIterator(
        const ReadOptions& read_options, bool disable_prefix_seek,
        IndexBlockIter* iter, GetContext* get_context,
        BlockCacheLookupContext* lookup_context) = 0;

    // Report an approximation of how much memory has been used other than
    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;
    // Cache the dependencies of the index reader (e.g. the partitions
    // of a partitioned index).
    virtual void CacheDependencies(bool /* pin */) {}
  };

  class IndexReaderCommon;

  static Slice GetCacheKey(const char* cache_key_prefix,
                           size_t cache_key_prefix_size,
                           const BlockHandle& handle, char* cache_key);
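  // Example usage (an illustrative sketch mirroring how the implementation
  // builds block cache keys; `rep` and `handle` are assumed available):
  //
  //   char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  //   Slice key = GetCacheKey(rep->cache_key_prefix,
  //                           rep->cache_key_prefix_size, handle, cache_key);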

  // Retrieve all key value pairs from data blocks in the table.
  // The keys retrieved are internal keys.
  Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);
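  // Example usage (an illustrative sketch; dumps the entries of every data
  // block in the table):
  //
  //   std::vector<KVPairBlock> kv_pair_blocks;
  //   Status s = table->GetKVPairsFromDataBlocks(&kv_pair_blocks);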

  struct Rep;

  Rep* get_rep() { return rep_; }
  const Rep* get_rep() const { return rep_; }

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(
      const ReadOptions& ro, const BlockHandle& block_handle,
      TBlockIter* input_iter, BlockType block_type, GetContext* get_context,
      BlockCacheLookupContext* lookup_context, Status s,
      FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const;

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
                                   CachableEntry<Block>& block,
                                   TBlockIter* input_iter, Status s) const;

  class PartitionedIndexIteratorState;

  template <typename TBlocklike>
  friend class FilterBlockReaderCommon;

  friend class PartitionIndexReader;

  friend class UncompressionDictReader;

 protected:
  Rep* rep_;
  explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer)
      : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
  // No copying allowed
  explicit BlockBasedTable(const TableReader&) = delete;
  void operator=(const TableReader&) = delete;

 private:
  friend class MockedBlockBasedTable;
  static std::atomic<uint64_t> next_cache_key_id_;
  BlockCacheTracer* const block_cache_tracer_;

  void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
                             size_t usage) const;
  void UpdateCacheMissMetrics(BlockType block_type,
                              GetContext* get_context) const;
  void UpdateCacheInsertionMetrics(BlockType block_type,
                                   GetContext* get_context, size_t usage) const;
  Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
                                   BlockType block_type,
                                   GetContext* get_context) const;

  // Either Block::NewDataIterator() or Block::NewIndexIterator().
  template <typename TBlockIter>
  static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
                                       TBlockIter* input_iter,
                                       bool block_contents_pinned);

  // If block cache enabled (compressed or uncompressed), looks for the block
  // identified by handle in (1) uncompressed cache, (2) compressed cache, and
  // then (3) file. If found, inserts into the cache(s) that were searched
  // unsuccessfully (e.g., if found in file, will add to both uncompressed and
  // compressed caches if they're enabled).
  //
  // @param block_entry value is set to the uncompressed block if found. If
  //    in uncompressed block cache, also sets cache_handle to reference that
  //    block.
  template <typename TBlocklike>
  Status MaybeReadBlockAndLoadToCache(
      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
      CachableEntry<TBlocklike>* block_entry, BlockType block_type,
      GetContext* get_context, BlockCacheLookupContext* lookup_context,
      BlockContents* contents) const;

  // Similar to the above, with one crucial difference: it will retrieve the
  // block from the file even if there are no caches configured (assuming the
  // read options allow I/O).
  template <typename TBlocklike>
  Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
                       const ReadOptions& ro, const BlockHandle& handle,
                       const UncompressionDict& uncompression_dict,
                       CachableEntry<TBlocklike>* block_entry,
                       BlockType block_type, GetContext* get_context,
                       BlockCacheLookupContext* lookup_context,
                       bool for_compaction, bool use_cache) const;

  void RetrieveMultipleBlocks(
      const ReadOptions& options, const MultiGetRange* batch,
      const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
      autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
      autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
          results,
      char* scratch, const UncompressionDict& uncompression_dict) const;

  // Get the iterator from the index reader.
  //
  // If input_iter is not set, return a new Iterator.
  // If input_iter is set, try to update it and return it as Iterator.
  // However, note that in some cases the returned iterator may be different
  // from input_iter. In such a case the returned iterator should be freed.
  //
  // Note: an ErrorIterator with Status::Incomplete shall be returned if all
  // the following conditions are met:
  //  1. We enabled table_options.cache_index_and_filter_blocks.
  //  2. The index is not present in the block cache.
  //  3. We disallowed any IO to be performed, that is,
  //     read_options.read_tier == kBlockCacheTier.
  InternalIteratorBase<IndexValue>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check,
      IndexBlockIter* input_iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) const;

  // Look up the block in the block caches, if they are set: block_cache and
  // block_cache_compressed.
  // On success, Status::OK will be returned and @block will be populated with
  // a pointer to the block as well as its block handle.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status GetDataBlockFromCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed,
      const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
      const UncompressionDict& uncompression_dict, BlockType block_type,
      GetContext* get_context) const;

  // Put a raw block (maybe compressed) into the corresponding block caches.
  // This method will perform decompression against raw_block if needed and
  // then populate the block caches.
  // On success, Status::OK will be returned; also @block will be populated
  // with the uncompressed block and its cache handle.
  //
  // Allocated memory managed by raw_block_contents will be transferred to
  // PutDataBlockToCache(). After the call, the object will be invalid.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status PutDataBlockToCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed,
      CachableEntry<TBlocklike>* cached_block,
      BlockContents* raw_block_contents, CompressionType raw_block_comp_type,
      const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
      MemoryAllocator* memory_allocator, BlockType block_type,
      GetContext* get_context) const;

  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry
  // found after a call to Seek(key), until handle_result returns false.
  // May not make such a call if the filter policy says the key is not present.
  friend class TableCache;
  friend class BlockBasedTableBuilder;

  // Create an index reader based on the index type stored in the table.
  // Optionally, the user can pass a preloaded meta_index_iter if the index
  // needs to access extra meta blocks for construction. This parameter helps
  // avoid re-reading the meta index block if the caller has already created
  // one.
  Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* preloaded_meta_index_iter,
                           bool use_cache, bool prefetch, bool pin,
                           BlockCacheLookupContext* lookup_context,
                           std::unique_ptr<IndexReader>* index_reader);

  bool FullFilterKeyMayMatch(const ReadOptions& read_options,
                             FilterBlockReader* filter, const Slice& user_key,
                             const bool no_io,
                             const SliceTransform* prefix_extractor,
                             GetContext* get_context,
                             BlockCacheLookupContext* lookup_context) const;

  void FullFilterKeysMayMatch(const ReadOptions& read_options,
                              FilterBlockReader* filter, MultiGetRange* range,
                              const bool no_io,
                              const SliceTransform* prefix_extractor,
                              BlockCacheLookupContext* lookup_context) const;

  static Status PrefetchTail(
      RandomAccessFileReader* file, uint64_t file_size,
      TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
      const bool preload_all,
      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
  Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer,
                            std::unique_ptr<Block>* metaindex_block,
                            std::unique_ptr<InternalIterator>* iter);
  Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer,
                                          const Slice& handle_value,
                                          TableProperties** table_properties);
  Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer,
                             InternalIterator* meta_iter,
                             const SequenceNumber largest_seqno);
  Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* meta_iter,
                           const InternalKeyComparator& internal_comparator,
                           BlockCacheLookupContext* lookup_context);
  Status PrefetchIndexAndFilterBlocks(
      FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
      BlockBasedTable* new_table, bool prefetch_all,
      const BlockBasedTableOptions& table_options, const int level,
      BlockCacheLookupContext* lookup_context);

  static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);

  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(const ReadOptions& read_options,
                                InternalIteratorBase<IndexValue>* index_iter);

  // Create the filter from the filter block.
  std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
      FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
      bool pin, BlockCacheLookupContext* lookup_context);

  static void SetupCacheKeyPrefix(Rep* rep);

  // Generate a cache key prefix from the file.
  static void GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
                                  char* buffer, size_t* size);
  static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer,
                                  size_t* size);

  // Given an iterator, return its offset in the file.
  uint64_t ApproximateOffsetOf(
      const InternalIteratorBase<IndexValue>& index_iter) const;

  // Helper functions for DumpTable()
  Status DumpIndexBlock(WritableFile* out_file);
  Status DumpDataBlocks(WritableFile* out_file);
  void DumpKeyValue(const Slice& key, const Slice& value,
                    WritableFile* out_file);

  // A cumulative data block file read in MultiGet smaller than this size will
  // use a stack buffer.
  static constexpr size_t kMultiGetReadStackBufSize = 8192;

  friend class PartitionedFilterBlockReader;
  friend class PartitionedFilterBlockTest;
  friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
};

// Maintains the state of a two-level iteration on a partitioned index
// structure.
class BlockBasedTable::PartitionedIndexIteratorState
    : public TwoLevelIteratorState {
 public:
  PartitionedIndexIteratorState(
      const BlockBasedTable* table,
      std::unordered_map<uint64_t, CachableEntry<Block>>* block_map);
  InternalIteratorBase<IndexValue>* NewSecondaryIterator(
      const BlockHandle& index_value) override;

 private:
  // Doesn't own table_
  const BlockBasedTable* table_;
  std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
};

// Stores all the properties associated with a BlockBasedTable.
// These are immutable.
struct BlockBasedTable::Rep {
  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters,
      int _level, const bool _immortal_table)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
        hash_index_allow_collision(false),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        global_seqno(kDisableGlobalSequenceNumber),
        level(_level),
        immortal_table(_immortal_table) {}

  const ImmutableCFOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;

  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  BlockHandle compression_dict_handle;

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
  bool hash_index_allow_collision;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  // null if no prefix_extractor is passed in when opening the table reader.
  std::unique_ptr<SliceTransform> internal_prefix_transform;
  std::shared_ptr<const SliceTransform> table_prefix_extractor;

  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is
  // disabled and every key has its own seqno.
  SequenceNumber global_seqno;

  // The level at which the table is opened; could potentially change when a
  // trivial move is involved.
  int level;

  // If false, blocks in this file are definitely all uncompressed. Knowing
  // this before reading individual blocks enables certain optimizations.
  bool blocks_maybe_compressed = true;

  // If true, data blocks in this file are definitely ZSTD compressed. If
  // false they might not be. When false we skip creating a ZSTD digested
  // uncompression dictionary. Even if we get a false negative, things should
  // still work, just not as quickly.
  bool blocks_definitely_zstd_compressed = false;

  // These describe how the index is encoded.
  bool index_has_first_key = false;
  bool index_key_includes_seq = true;
  bool index_value_is_full = true;

  const bool immortal_table;

  SequenceNumber get_global_seqno(BlockType block_type) const {
    return (block_type == BlockType::kFilter ||
            block_type == BlockType::kCompressionDictionary)
               ? kDisableGlobalSequenceNumber
               : global_seqno;
  }

  uint64_t cf_id_for_tracing() const {
    return table_properties
               ? table_properties->column_family_id
               : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
                     kUnknownColumnFamily;
  }

  Slice cf_name_for_tracing() const {
    return table_properties ? table_properties->column_family_name
                            : BlockCacheTraceHelper::kUnknownColumnFamilyName;
  }

  uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; }

  uint64_t sst_number_for_tracing() const {
    return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
  }
  void CreateFilePrefetchBuffer(
      size_t readahead_size, size_t max_readahead_size,
      std::unique_ptr<FilePrefetchBuffer>* fpb) const {
    fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size,
                                      max_readahead_size,
                                      !ioptions.allow_mmap_reads /* enable */));
  }
};

// Iterates over the contents of BlockBasedTable.
template <class TBlockIter, typename TValue = Slice>
class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
  // compaction_readahead_size: its value will only be used if for_compaction =
  // true
 public:
  BlockBasedTableIterator(const BlockBasedTable* table,
                          const ReadOptions& read_options,
                          const InternalKeyComparator& icomp,
                          InternalIteratorBase<IndexValue>* index_iter,
                          bool check_filter, bool need_upper_bound_check,
                          const SliceTransform* prefix_extractor,
                          BlockType block_type, TableReaderCaller caller,
                          size_t compaction_readahead_size = 0)
      : table_(table),
        read_options_(read_options),
        icomp_(icomp),
        user_comparator_(icomp.user_comparator()),
        index_iter_(index_iter),
        pinned_iters_mgr_(nullptr),
        block_iter_points_to_real_block_(false),
        check_filter_(check_filter),
        need_upper_bound_check_(need_upper_bound_check),
        prefix_extractor_(prefix_extractor),
        block_type_(block_type),
        lookup_context_(caller),
        compaction_readahead_size_(compaction_readahead_size) {}

  ~BlockBasedTableIterator() { delete index_iter_; }

  void Seek(const Slice& target) override;
  void SeekForPrev(const Slice& target) override;
  void SeekToFirst() override;
  void SeekToLast() override;
  void Next() final override;
  bool NextAndGetResult(IterateResult* result) override;
  void Prev() override;
  bool Valid() const override {
    return !is_out_of_bound_ &&
           (is_at_first_key_from_index_ ||
            (block_iter_points_to_real_block_ && block_iter_.Valid()));
  }
  Slice key() const override {
    assert(Valid());
    if (is_at_first_key_from_index_) {
      return index_iter_->value().first_internal_key;
    } else {
      return block_iter_.key();
    }
  }
  Slice user_key() const override {
    assert(Valid());
    if (is_at_first_key_from_index_) {
      return ExtractUserKey(index_iter_->value().first_internal_key);
    } else {
      return block_iter_.user_key();
    }
  }
  TValue value() const override {
    assert(Valid());

    // Load the current block if it is not loaded yet.
    if (is_at_first_key_from_index_ &&
        !const_cast<BlockBasedTableIterator*>(this)
             ->MaterializeCurrentBlock()) {
      // Oops, the index is not consistent with the block contents, but we
      // have no good way to report an error at this point. Let's return an
      // empty value.
      return TValue();
    }

    return block_iter_.value();
  }
  Status status() const override {
    // The prefix index sets status to NotFound when the prefix does not
    // exist.
    if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
      return index_iter_->status();
    } else if (block_iter_points_to_real_block_) {
      return block_iter_.status();
    } else {
      return Status::OK();
    }
  }

  // Whether the iterator was invalidated for being out of bound.
  bool IsOutOfBound() override { return is_out_of_bound_; }

  inline bool MayBeOutOfUpperBound() override {
    assert(Valid());
    return !data_block_within_upper_bound_;
  }

  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  bool IsKeyPinned() const override {
    // Our key comes either from block_iter_'s current key
    // or index_iter_'s current *value*.
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) ||
            (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned()));
  }
  bool IsValuePinned() const override {
    // Load the current block if it is not loaded yet.
    if (is_at_first_key_from_index_) {
      const_cast<BlockBasedTableIterator*>(this)->MaterializeCurrentBlock();
    }
    // BlockIter::IsValuePinned() is always true. No need to check it.
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           block_iter_points_to_real_block_;
  }

  void ResetDataIter() {
    if (block_iter_points_to_real_block_) {
      if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
        block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
      }
      block_iter_.Invalidate(Status::OK());
      block_iter_points_to_real_block_ = false;
    }
  }

  void SavePrevIndexValue() {
    if (block_iter_points_to_real_block_) {
      // Reseek. If the reseek ends up in the same data block, we shouldn't
      // re-fetch it.
      prev_block_offset_ = index_iter_->value().handle.offset();
    }
  }

 private:
  enum class IterDirection {
    kForward,
    kBackward,
  };

  const BlockBasedTable* table_;
  const ReadOptions read_options_;
  const InternalKeyComparator& icomp_;
  UserComparatorWrapper user_comparator_;
  InternalIteratorBase<IndexValue>* index_iter_;
  PinnedIteratorsManager* pinned_iters_mgr_;
  TBlockIter block_iter_;

  // True if block_iter_ is initialized and points to the same block
  // as the index iterator.
  bool block_iter_points_to_real_block_;
  // See InternalIteratorBase::IsOutOfBound().
  bool is_out_of_bound_ = false;
  // Whether the current data block is fully within the iterate upper bound.
  bool data_block_within_upper_bound_ = false;
  // True if we're standing at the first key of a block, and we haven't loaded
  // that block yet. A call to value() will trigger loading the block.
  bool is_at_first_key_from_index_ = false;
  bool check_filter_;
  // TODO(Zhongyi): pick a better name
  bool need_upper_bound_check_;
  const SliceTransform* prefix_extractor_;
  BlockType block_type_;
  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
  BlockCacheLookupContext lookup_context_;
  // Readahead size used in compaction; its value is used only if
  // lookup_context_.caller = kCompaction.
  size_t compaction_readahead_size_;

  size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
  size_t readahead_limit_ = 0;
  int64_t num_file_reads_ = 0;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;

  // If `target` is null, seek to first.
  void SeekImpl(const Slice* target);

  void InitDataBlock();
  bool MaterializeCurrentBlock();
  void FindKeyForward();
  void FindBlockForward();
  void FindKeyBackward();
  void CheckOutOfBound();

  // Check if the data block is fully within iterate_upper_bound.
  //
  // Note MyRocks may update the iterate bounds between seeks. To work around
  // this, we need to check and update data_block_within_upper_bound_
  // accordingly.
  void CheckDataBlockWithinUpperBound();

  bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
    if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
      // The upper bound check isn't sufficient for the backward direction to
      // guarantee the same result as total order, so disable the prefix
      // check.
      return true;
    }
    if (check_filter_ &&
        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
                                need_upper_bound_check_, &lookup_context_)) {
      // TODO remember that the iterator was invalidated because of a prefix
      // mismatch. This can prevent the upper-level file iterator from falsely
      // believing that the position is the end of the SST file and moving to
      // the first key of the next file.
      ResetDataIter();
      return false;
    }
    return true;
  }
};

}  // namespace ROCKSDB_NAMESPACE