//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "port/malloc.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_hash_index.h"
#include "table/format.h"
#include "table/internal_iterator.h"
#include "test_util/sync_point.h"
#include "util/random.h"

namespace ROCKSDB_NAMESPACE {

struct BlockContents;
class Comparator;
template <class TValue>
class BlockIter;
class DataBlockIter;
class IndexBlockIter;
class BlockPrefixIndex;

// BlockReadAmpBitmap is a bitmap that maps the ROCKSDB_NAMESPACE::Block data
// bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of
// bytes in the Block we update the bitmap and increment
// READ_AMP_ESTIMATE_USEFUL_BYTES.
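//
// A worked example (numbers are illustrative): with block_size = 4096 and
// bytes_per_bit = 32, the bitmap holds 128 bits (4096 / 32) packed into four
// uint32_t entries. The first Mark() whose range starts in a not-yet-marked
// 32-byte stripe increments READ_AMP_ESTIMATE_USEFUL_BYTES by the
// stripe-aligned size of the marked range.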
class BlockReadAmpBitmap {
 public:
  explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
                              Statistics* statistics)
      : bitmap_(nullptr),
        bytes_per_bit_pow_(0),
        statistics_(statistics),
        rnd_(Random::GetTLSInstance()->Uniform(
            static_cast<int>(bytes_per_bit))) {
    TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
    assert(block_size > 0 && bytes_per_bit > 0);

    // compute bytes_per_bit_pow_ = floor(log2(bytes_per_bit))
    while (bytes_per_bit >>= 1) {
      bytes_per_bit_pow_++;
    }

    // num_bits_needed = ceil(block_size / bytes_per_bit)
    size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
    assert(num_bits_needed > 0);

    // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
    size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;

    // Create bitmap and set all the bits to 0
    bitmap_ = new std::atomic<uint32_t>[bitmap_size]();

    RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
  }

  ~BlockReadAmpBitmap() { delete[] bitmap_; }

  void Mark(uint32_t start_offset, uint32_t end_offset) {
    assert(end_offset >= start_offset);
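    // rnd_ (a random offset in [0, bytes_per_bit), chosen once in the
    // constructor) dithers where the bit grid starts so that the rounding
    // below does not systematically bias the useful-bytes estimate.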
    // Index of first bit in mask
    uint32_t start_bit =
        (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
        bytes_per_bit_pow_;
    // Index of last bit in mask + 1
    uint32_t exclusive_end_bit =
        (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
    if (start_bit >= exclusive_end_bit) {
      return;
    }
    assert(exclusive_end_bit > 0);

    if (GetAndSet(start_bit) == 0) {
      uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
                                  << bytes_per_bit_pow_;
      RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
                 new_useful_bytes);
    }
  }

  Statistics* GetStatistics() {
    return statistics_.load(std::memory_order_relaxed);
  }

  void SetStatistics(Statistics* stats) { statistics_.store(stats); }

  uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }

  size_t ApproximateMemoryUsage() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    return malloc_usable_size((void*)this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return sizeof(*this);
  }

 private:
  // Get the current value of bit at `bit_idx` and set it to 1
  inline bool GetAndSet(uint32_t bit_idx) {
    const uint32_t byte_idx = bit_idx / kBitsPerEntry;
    const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);

    return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
           bit_mask;
  }

  const uint32_t kBytesPerEntry = sizeof(uint32_t);   // 4 bytes
  const uint32_t kBitsPerEntry = kBytesPerEntry * 8;  // 32 bits

  // Bitmap used to record the bytes that we read; atomic entries protect
  // against multiple threads updating the same bit
  std::atomic<uint32_t>* bitmap_;
  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Using a power of 2 turns
  // multiplication and division into shifts
  uint8_t bytes_per_bit_pow_;
  // Pointer to the DB Statistics object. Since this bitmap may outlive the
  // DB, this pointer may be invalid, but the DB will update it to a valid
  // pointer via SetStatistics() before calling Mark()
  std::atomic<Statistics*> statistics_;
  uint32_t rnd_;
};

// This Block class is not for any old block: it is designed to hold only
// uncompressed blocks containing sorted key-value pairs. It is thus
// suitable for storing uncompressed data blocks, index blocks (including
// partitions), range deletion blocks, properties blocks, metaindex blocks,
// as well as the top level of the partitioned filter structure (which is
// actually an index of the filter partitions). It is NOT suitable for
// compressed blocks in general, filter blocks/partitions, or compression
// dictionaries (since the latter do not contain sorted key-value pairs).
// Use BlockContents directly for those.
//
// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
// for details of the format and the various block types.
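//
// A minimal usage sketch (hedged: this is how a metaindex or properties
// block, written with a raw bytewise comparator, could be scanned; `contents`
// is assumed to already hold the uncompressed block bytes):
//
//   Block block(std::move(contents), kDisableGlobalSequenceNumber);
//   std::unique_ptr<DataBlockIter> iter(block.NewDataIterator(
//       BytewiseComparator(), BytewiseComparator()));
//   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
//     // consume iter->key() / iter->value()
//   }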
class Block {
 public:
  // Initialize the block with the specified contents.
  explicit Block(BlockContents&& contents, SequenceNumber _global_seqno,
                 size_t read_amp_bytes_per_bit = 0,
                 Statistics* statistics = nullptr);
  // No copying allowed
  Block(const Block&) = delete;
  void operator=(const Block&) = delete;

  ~Block();

  size_t size() const { return size_; }
  const char* data() const { return data_; }
  // The additional memory space taken by the block data.
  size_t usable_size() const { return contents_.usable_size(); }
  uint32_t NumRestarts() const;
  bool own_bytes() const { return contents_.own_bytes(); }

  BlockBasedTableOptions::DataBlockIndexType IndexType() const;

  // If comparator is InternalKeyComparator, user_comparator is its user
  // comparator; they are equal otherwise.
  //
  // If iter is null, returns a new iterator.
  // If iter is not null, updates it in place and returns it as an Iterator*.
  //
  // Updates read_amp_bitmap_ if it is not nullptr.
  //
  // If `block_contents_pinned` is true, the caller guarantees that when
  // the cleanup functions are transferred from the iterator to other
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
  // valid. Either the iterator holds a cache handle or ownership of some
  // resource and releases it in a release function, or the caller is sure
  // that the data will not go away (for example, it comes from an mmapped
  // file that will not be closed).
  //
  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
  // the iterator will simply be set as "invalid", rather than returning
  // the key that is just past the target key.
  DataBlockIter* NewDataIterator(const Comparator* comparator,
                                 const Comparator* user_comparator,
                                 DataBlockIter* iter = nullptr,
                                 Statistics* stats = nullptr,
                                 bool block_contents_pinned = false);

  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  //
  // If `prefix_index` is not nullptr this block will do hash lookup for the
  // key prefix. If total_order_seek is true, prefix_index_ is ignored.
  //
  // `have_first_key` controls whether IndexValue will contain
  // first_internal_key. It affects the data serialization format, so the same
  // value of have_first_key must be used when writing and reading the index.
  // It is determined by the IndexType property of the table.
  IndexBlockIter* NewIndexIterator(const Comparator* comparator,
                                   const Comparator* user_comparator,
                                   IndexBlockIter* iter, Statistics* stats,
                                   bool total_order_seek, bool have_first_key,
                                   bool key_includes_seq, bool value_is_full,
                                   bool block_contents_pinned = false,
                                   BlockPrefixIndex* prefix_index = nullptr);
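
  // A hedged call sketch (argument values are illustrative; `icmp` is the
  // InternalKeyComparator the index was written with):
  //
  //   IndexBlockIter it;
  //   block.NewIndexIterator(icmp, icmp->user_comparator(), &it,
  //                          /*stats=*/nullptr, /*total_order_seek=*/true,
  //                          /*have_first_key=*/false,
  //                          /*key_includes_seq=*/true,
  //                          /*value_is_full=*/true);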

  // Report an approximation of how much memory has been used.
  size_t ApproximateMemoryUsage() const;

  SequenceNumber global_seqno() const { return global_seqno_; }

 private:
  BlockContents contents_;
  const char* data_;         // contents_.data.data()
  size_t size_;              // contents_.data.size()
  uint32_t restart_offset_;  // Offset in data_ of restart array
  uint32_t num_restarts_;
  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
  // All keys in the block will have seqno = global_seqno_, regardless of
  // the encoded value (kDisableGlobalSequenceNumber means disabled)
  const SequenceNumber global_seqno_;

  DataBlockHashIndex data_block_hash_index_;
};

template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
 public:
  void InitializeBase(const Comparator* comparator, const char* data,
                      uint32_t restarts, uint32_t num_restarts,
                      SequenceNumber global_seqno, bool block_contents_pinned) {
    assert(data_ == nullptr);  // Ensure it is called only once
    assert(num_restarts > 0);  // Ensure the param is valid

    comparator_ = comparator;
    data_ = data;
    restarts_ = restarts;
    num_restarts_ = num_restarts;
    current_ = restarts_;
    restart_index_ = num_restarts_;
    global_seqno_ = global_seqno;
    block_contents_pinned_ = block_contents_pinned;
    cache_handle_ = nullptr;
  }

  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
  // nothing. Calls cleanup functions.
  void InvalidateBase(Status s) {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));

    data_ = nullptr;
    current_ = restarts_;
    status_ = s;

    // Call cleanup callbacks.
    Cleanable::Reset();
  }

  bool Valid() const override { return current_ < restarts_; }
  Status status() const override { return status_; }
  Slice key() const override {
    assert(Valid());
    return key_.GetKey();
  }

#ifndef NDEBUG
  ~BlockIter() override {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
  }
  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif

  bool IsKeyPinned() const override {
    return block_contents_pinned_ && key_pinned_;
  }

  bool IsValuePinned() const override { return block_contents_pinned_; }

  size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }

  uint32_t ValueOffset() const {
    return static_cast<uint32_t>(value_.data() - data_);
  }

  void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }

  Cache::Handle* cache_handle() { return cache_handle_; }

 protected:
  // Note: The type could be changed to InternalKeyComparator, but doing so
  // caused an unexpected performance drop.
  const Comparator* comparator_;
  const char* data_;       // underlying block contents
  uint32_t num_restarts_;  // Number of uint32_t entries in restart array

  // Index of restart block in which current_ or current_-1 falls
  uint32_t restart_index_;
  uint32_t restarts_;  // Offset of restart array (list of fixed32)
  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
  uint32_t current_;
  IterKey key_;
  Slice value_;
  Status status_;
  bool key_pinned_;
  // Whether the block data is guaranteed to outlive this iterator, and
  // as long as the cleanup functions are transferred to another class,
  // e.g. PinnableSlice, the pointer to the bytes will still be valid.
  bool block_contents_pinned_;
  SequenceNumber global_seqno_;

 private:
  // Store the cache handle, if the block is cached. We need this since the
  // only other place the handle is stored is as an argument to the Cleanable
  // function callback, which is hard to retrieve. When multiple value
  // PinnableSlices reference the block, they need the cache handle in order
  // to bump up the ref count
  Cache::Handle* cache_handle_;

 public:
  // Return the offset in data_ just past the end of the current entry.
  inline uint32_t NextEntryOffset() const {
    // NOTE: We don't support blocks bigger than 2GB
    return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
  }

  uint32_t GetRestartPoint(uint32_t index) {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }
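  // (Illustrative: with restarts_ = 120 and num_restarts_ = 3, the fixed32
  // restart offsets live at data_[120], data_[124], and data_[128].)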

  void SeekToRestartPoint(uint32_t index) {
    key_.Clear();
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();

    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }

  void CorruptionError();

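  // Binary-searches the restart array between `left` and `right` for the
  // restart interval that may contain `target`, storing the chosen restart
  // index in *index. DecodeKeyFunc controls how keys at restart points are
  // decoded.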
  template <typename DecodeKeyFunc>
  inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
                         uint32_t* index, const Comparator* comp);
};

class DataBlockIter final : public BlockIter<Slice> {
 public:
  DataBlockIter()
      : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
  DataBlockIter(const Comparator* comparator, const Comparator* user_comparator,
                const char* data, uint32_t restarts, uint32_t num_restarts,
                SequenceNumber global_seqno,
                BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
                DataBlockHashIndex* data_block_hash_index)
      : DataBlockIter() {
    Initialize(comparator, user_comparator, data, restarts, num_restarts,
               global_seqno, read_amp_bitmap, block_contents_pinned,
               data_block_hash_index);
  }
  void Initialize(const Comparator* comparator,
                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno,
                  BlockReadAmpBitmap* read_amp_bitmap,
                  bool block_contents_pinned,
                  DataBlockHashIndex* data_block_hash_index) {
    InitializeBase(comparator, data, restarts, num_restarts, global_seqno,
                   block_contents_pinned);
    user_comparator_ = user_comparator;
    key_.SetIsUserKey(false);
    read_amp_bitmap_ = read_amp_bitmap;
    last_bitmap_offset_ = current_ + 1;
    data_block_hash_index_ = data_block_hash_index;
  }

  Slice value() const override {
    assert(Valid());
    if (read_amp_bitmap_ && current_ < restarts_ &&
        current_ != last_bitmap_offset_) {
      read_amp_bitmap_->Mark(current_ /* current entry offset */,
                             NextEntryOffset() - 1);
      last_bitmap_offset_ = current_;
    }
    return value_;
  }

  void Seek(const Slice& target) override;

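  // Point-lookup entry. Without a DataBlockHashIndex this falls back to a
  // plain Seek() and returns true; with one, it delegates to
  // SeekForGetImpl(), which may answer the lookup via the hash index instead
  // of a binary search.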
  inline bool SeekForGet(const Slice& target) {
    if (!data_block_hash_index_) {
      Seek(target);
      return true;
    }

    return SeekForGetImpl(target);
  }

  void SeekForPrev(const Slice& target) override;

  void Prev() override;

  void Next() final override;

  // Try to advance to the next entry in the block. If there is data
  // corruption or an error, report it to the caller instead of aborting the
  // process. May incur higher CPU overhead because we need to perform checks
  // on every entry.
  void NextOrReport();

  void SeekToFirst() override;

  // Try to seek to the first entry in the block. If there is data corruption
  // or an error, report it to the caller instead of aborting the process. May
  // incur higher CPU overhead because we need to perform checks on every
  // entry.
  void SeekToFirstOrReport();

  void SeekToLast() override;

  void Invalidate(Status s) {
    InvalidateBase(s);
    // Clear prev entries cache.
    prev_entries_keys_buff_.clear();
    prev_entries_.clear();
    prev_entries_idx_ = -1;
  }

 private:
  // read-amp bitmap
  BlockReadAmpBitmap* read_amp_bitmap_;
  // last `current_` value we reported to the read-amp bitmap
  mutable uint32_t last_bitmap_offset_;
  struct CachedPrevEntry {
    explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
                             size_t _key_offset, size_t _key_size, Slice _value)
        : offset(_offset),
          key_ptr(_key_ptr),
          key_offset(_key_offset),
          key_size(_key_size),
          value(_value) {}

    // offset of entry in block
    uint32_t offset;
    // Pointer to key data in block (nullptr if key is delta-encoded)
    const char* key_ptr;
    // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
    size_t key_offset;
    // size of key
    size_t key_size;
    // value slice pointing to data in block
    Slice value;
  };
  std::string prev_entries_keys_buff_;
  std::vector<CachedPrevEntry> prev_entries_;
  int32_t prev_entries_idx_ = -1;

  DataBlockHashIndex* data_block_hash_index_;
  const Comparator* user_comparator_;

  template <typename DecodeEntryFunc>
  inline bool ParseNextDataKey(const char* limit = nullptr);

  inline int Compare(const IterKey& ikey, const Slice& b) const {
    return comparator_->Compare(ikey.GetInternalKey(), b);
  }

  bool SeekForGetImpl(const Slice& target);
};

class IndexBlockIter final : public BlockIter<IndexValue> {
 public:
  IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}

  Slice key() const override {
    assert(Valid());
    return key_.GetKey();
  }
  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  void Initialize(const Comparator* comparator,
                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
                  bool have_first_key, bool key_includes_seq,
                  bool value_is_full, bool block_contents_pinned) {
    InitializeBase(key_includes_seq ? comparator : user_comparator, data,
                   restarts, num_restarts, kDisableGlobalSequenceNumber,
                   block_contents_pinned);
    key_includes_seq_ = key_includes_seq;
    key_.SetIsUserKey(!key_includes_seq_);
    prefix_index_ = prefix_index;
    value_delta_encoded_ = !value_is_full;
    have_first_key_ = have_first_key;
    if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
      global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
    } else {
      global_seqno_state_.reset();
    }
  }

  Slice user_key() const override {
    if (key_includes_seq_) {
      return ExtractUserKey(key());
    }
    return key();
  }

  IndexValue value() const override {
    assert(Valid());
    if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
      return decoded_value_;
    } else {
      IndexValue entry;
      Slice v = value_;
      Status decode_s __attribute__((__unused__)) =
          entry.DecodeFrom(&v, have_first_key_, nullptr);
      assert(decode_s.ok());
      return entry;
    }
  }

  // IndexBlockIter follows a different contract for prefix iterators
  // from data iterators.
  // If the prefix of the seek key `target` exists in the file, Seek() must
  // return the same result as a total order seek.
  // If the prefix of `target` doesn't exist in the file, it can either
  // return the result of a total order seek, or set both Valid() = false
  // and status() = NotFound().
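  // (Illustrative consequence: if Seek() leaves the iterator with
  // Valid() == false and status() == NotFound(), the caller may conclude
  // that no key in the file shares the target's prefix.)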
  void Seek(const Slice& target) override;

  void SeekForPrev(const Slice&) override {
    assert(false);
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::InvalidArgument(
        "RocksDB internal error: should never call SeekForPrev() on index "
        "blocks");
    key_.Clear();
    value_.clear();
  }

  void Prev() override;

  void Next() override;

  void SeekToFirst() override;

  void SeekToLast() override;

  void Invalidate(Status s) { InvalidateBase(s); }

  bool IsValuePinned() const override {
    return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
  }

 private:
  // Key is in InternalKey format
  bool key_includes_seq_;
  bool value_delta_encoded_;
  bool have_first_key_;  // value includes first_internal_key
  BlockPrefixIndex* prefix_index_;
  // Whether the value is delta encoded. In that case the value is assumed to
  // be a BlockHandle. The first value in each restart interval is the full
  // encoded BlockHandle; the rest of the values in the interval store only
  // the size part of the BlockHandle. The offset of a delta encoded
  // BlockHandle is computed by adding the sizes of the previous delta encoded
  // values in the same restart interval to the offset of the first value in
  // that restart interval.
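  // (Illustrative, ignoring any per-record trailer bytes: if the first value
  // in an interval decodes to handle (offset=0, size=100) and the next entry
  // stores only size=200, that entry's handle is reconstructed as
  // (offset=100, size=200).)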
  IndexValue decoded_value_;

  // When sequence number overwriting is enabled, this struct contains the
  // seqno to overwrite with, and the current first_internal_key with
  // overwritten seqno. This is rarely used, so we put it behind a pointer and
  // only allocate when needed.
  struct GlobalSeqnoState {
    // First internal key according to current index entry, but with sequence
    // number overwritten to global_seqno.
    IterKey first_internal_key;
    SequenceNumber global_seqno;

    explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
  };

  std::unique_ptr<GlobalSeqnoState> global_seqno_state_;

  // Set *prefix_may_exist to false if no key can possibly share the same
  // prefix as `target`. If not set, the result position should be the same
  // as a total order Seek.
  bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist);
  // Set *prefix_may_exist to false if no key can possibly share the same
  // prefix as `target`. If not set, the result position should be the same
  // as a total order seek.
  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                            uint32_t left, uint32_t right, uint32_t* index,
                            bool* prefix_may_exist);
  inline int CompareBlockKey(uint32_t block_index, const Slice& target);

  inline int Compare(const Slice& a, const Slice& b) const {
    return comparator_->Compare(a, b);
  }

  inline int Compare(const IterKey& ikey, const Slice& b) const {
    return comparator_->Compare(ikey.GetKey(), b);
  }

  inline bool ParseNextIndexKey();

  // When value_delta_encoded_ is enabled, decodes the value, which is assumed
  // to be a BlockHandle, and puts it in decoded_value_
  inline void DecodeCurrentValue(uint32_t shared);
};

}  // namespace ROCKSDB_NAMESPACE