//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "port/malloc.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_hash_index.h"
#include "table/format.h"
#include "table/internal_iterator.h"
#include "test_util/sync_point.h"
#include "util/random.h"

namespace ROCKSDB_NAMESPACE {

struct BlockContents;
class Comparator;
template <class TValue>
class BlockIter;
class DataBlockIter;
class IndexBlockIter;
class BlockPrefixIndex;

// BlockReadAmpBitmap maps the ROCKSDB_NAMESPACE::Block data bytes to a
// bitmap with ratio bytes_per_bit. Whenever we access a range of bytes in
// the Block we update the bitmap and increment
// READ_AMP_ESTIMATE_USEFUL_BYTES.
class BlockReadAmpBitmap {
 public:
  explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
                              Statistics* statistics)
      : bitmap_(nullptr),
        bytes_per_bit_pow_(0),
        statistics_(statistics),
        rnd_(Random::GetTLSInstance()->Uniform(
            static_cast<int>(bytes_per_bit))) {
    TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
    assert(block_size > 0 && bytes_per_bit > 0);

    // bytes_per_bit_pow_ = floor(log2(bytes_per_bit)), so that
    // (1 << bytes_per_bit_pow_) can stand in for bytes_per_bit below
    while (bytes_per_bit >>= 1) {
      bytes_per_bit_pow_++;
    }

    // num_bits_needed = ceil(block_size / bytes_per_bit)
    size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
    assert(num_bits_needed > 0);

    // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
    size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;

    // Create bitmap and set all the bits to 0
    bitmap_ = new std::atomic<uint32_t>[bitmap_size]();

    RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
  }
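
  // Worked example for the sizing above (illustrative): with
  // block_size = 4096 and bytes_per_bit = 32, bytes_per_bit_pow_ ends up
  // as 5, num_bits_needed = ((4096 - 1) >> 5) + 1 = 128, and
  // bitmap_size = (128 - 1) / 32 + 1 = 4, so the bitmap is backed by four
  // zero-initialized uint32_t entries.
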
  ~BlockReadAmpBitmap() { delete[] bitmap_; }

  void Mark(uint32_t start_offset, uint32_t end_offset) {
    assert(end_offset >= start_offset);
    // Index of first bit in mask
    uint32_t start_bit =
        (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
        bytes_per_bit_pow_;
    // Index of last bit in mask + 1
    uint32_t exclusive_end_bit =
        (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
    if (start_bit >= exclusive_end_bit) {
      return;
    }
    assert(exclusive_end_bit > 0);

    if (GetAndSet(start_bit) == 0) {
      uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
                                  << bytes_per_bit_pow_;
      RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
                 new_useful_bytes);
    }
  }

  Statistics* GetStatistics() {
    return statistics_.load(std::memory_order_relaxed);
  }

  void SetStatistics(Statistics* stats) { statistics_.store(stats); }

  uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }

  size_t ApproximateMemoryUsage() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    return malloc_usable_size((void*)this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return sizeof(*this);
  }

 private:
  // Get the current value of bit at `bit_idx` and set it to 1
  inline bool GetAndSet(uint32_t bit_idx) {
    const uint32_t byte_idx = bit_idx / kBitsPerEntry;
    const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);

    return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
           bit_mask;
  }

  const uint32_t kBytesPerEntry = sizeof(uint32_t);   // 4 bytes
  const uint32_t kBitsPerEntry = kBytesPerEntry * 8;  // 32 bits

  // Bitmap used to record the bytes that we read; atomics protect against
  // multiple threads updating the same bit
  std::atomic<uint32_t>* bitmap_;
  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use a power of 2 to optimize
  // multiplication and division
  uint8_t bytes_per_bit_pow_;
  // Pointer to the DB Statistics object. Since this bitmap may outlive the
  // DB, this pointer may become invalid, but the DB will update it to a
  // valid pointer via SetStatistics() before calling Mark()
  std::atomic<Statistics*> statistics_;
  uint32_t rnd_;
};
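
// Illustrative sketch of how the bitmap is typically driven (see
// DataBlockIter::value() below, which calls Mark() on the byte range of the
// entry it is about to return; `stats` here stands for whatever Statistics*
// the caller owns and may be nullptr):
//
//   BlockReadAmpBitmap bitmap(/*block_size=*/4096, /*bytes_per_bit=*/32,
//                             stats);
//   // The entry being read occupies bytes [128, 192) of the block:
//   bitmap.Mark(/*start_offset=*/128, /*end_offset=*/191);
//
// Mark() only adds to READ_AMP_ESTIMATE_USEFUL_BYTES when the range's first
// bit was not already set.
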
// This Block class is not for any old block: it is designed to hold only
// uncompressed blocks containing sorted key-value pairs. It is thus
// suitable for storing uncompressed data blocks, index blocks (including
// partitions), range deletion blocks, properties blocks, metaindex blocks,
// as well as the top level of the partitioned filter structure (which is
// actually an index of the filter partitions). It is NOT suitable for
// compressed blocks in general, filter blocks/partitions, or compression
// dictionaries (since the latter do not contain sorted key-value pairs).
// Use BlockContents directly for those.
//
// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
// for details of the format and the various block types.
class Block {
 public:
  // Initialize the block with the specified contents.
  explicit Block(BlockContents&& contents, SequenceNumber _global_seqno,
                 size_t read_amp_bytes_per_bit = 0,
                 Statistics* statistics = nullptr);
  // No copying allowed
  Block(const Block&) = delete;
  void operator=(const Block&) = delete;

  ~Block();

  size_t size() const { return size_; }
  const char* data() const { return data_; }
  // The additional memory space taken by the block data.
  size_t usable_size() const { return contents_.usable_size(); }
  uint32_t NumRestarts() const;
  bool own_bytes() const { return contents_.own_bytes(); }

  BlockBasedTableOptions::DataBlockIndexType IndexType() const;

  // If comparator is InternalKeyComparator, user_comparator is its user
  // comparator; they are equal otherwise.
  //
  // If iter is null, return a new Iterator.
  // If iter is not null, update it in place and return it as Iterator*.
  //
  // Updates read_amp_bitmap_ if it is not nullptr.
  //
  // If `block_contents_pinned` is true, the caller guarantees that when
  // the cleanup functions are transferred from the iterator to other
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
  // valid. Either the iterator holds a cache handle or ownership of some
  // resource and releases it in a release function, or the caller is sure
  // that the data will not go away (for example, it comes from an mmapped
  // file which will not be closed).
  //
  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
  // the iterator will simply be set as "invalid", rather than returning
  // the key that is just past the target key.
  DataBlockIter* NewDataIterator(const Comparator* comparator,
                                 const Comparator* user_comparator,
                                 DataBlockIter* iter = nullptr,
                                 Statistics* stats = nullptr,
                                 bool block_contents_pinned = false);

  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  //
  // If `prefix_index` is not nullptr this block will do hash lookup for the
  // key prefix. If total_order_seek is true, prefix_index_ is ignored.
  //
  // `have_first_key` controls whether IndexValue will contain
  // first_internal_key. It affects the data serialization format, so the
  // same value of have_first_key must be used when writing and reading the
  // index. It is determined by the IndexType property of the table.
  IndexBlockIter* NewIndexIterator(const Comparator* comparator,
                                   const Comparator* user_comparator,
                                   IndexBlockIter* iter, Statistics* stats,
                                   bool total_order_seek, bool have_first_key,
                                   bool key_includes_seq, bool value_is_full,
                                   bool block_contents_pinned = false,
                                   BlockPrefixIndex* prefix_index = nullptr);

  // Report an approximation of how much memory has been used.
  size_t ApproximateMemoryUsage() const;

  SequenceNumber global_seqno() const { return global_seqno_; }

 private:
  BlockContents contents_;
  const char* data_;         // contents_.data.data()
  size_t size_;              // contents_.data.size()
  uint32_t restart_offset_;  // Offset in data_ of restart array
  uint32_t num_restarts_;
  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
  // All keys in the block will have seqno = global_seqno_, regardless of
  // the encoded value (kDisableGlobalSequenceNumber means disabled)
  const SequenceNumber global_seqno_;

  DataBlockHashIndex data_block_hash_index_;
};
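
// Usage sketch (illustrative only; assumes `contents` holds an uncompressed
// data block and `icmp` is the table's InternalKeyComparator, and omits
// error handling):
//
//   Block block(std::move(contents), kDisableGlobalSequenceNumber);
//   DataBlockIter* iter =
//       block.NewDataIterator(&icmp, icmp.user_comparator());
//   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
//     // iter->key() is an internal key; iter->value() is the stored value.
//   }
//   delete iter;  // heap-allocated because no reusable iterator was passed
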
template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
 public:
  void InitializeBase(const Comparator* comparator, const char* data,
                      uint32_t restarts, uint32_t num_restarts,
                      SequenceNumber global_seqno,
                      bool block_contents_pinned) {
    assert(data_ == nullptr);  // Ensure it is called only once
    assert(num_restarts > 0);  // Ensure the param is valid

    comparator_ = comparator;
    data_ = data;
    restarts_ = restarts;
    num_restarts_ = num_restarts;
    current_ = restarts_;
    restart_index_ = num_restarts_;
    global_seqno_ = global_seqno;
    block_contents_pinned_ = block_contents_pinned;
    cache_handle_ = nullptr;
  }

  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc
  // do nothing. Calls cleanup functions.
  void InvalidateBase(Status s) {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));

    data_ = nullptr;
    current_ = restarts_;
    status_ = s;

    // Call cleanup callbacks.
    Cleanable::Reset();
  }

  bool Valid() const override { return current_ < restarts_; }
  Status status() const override { return status_; }
  Slice key() const override {
    assert(Valid());
    return key_.GetKey();
  }

#ifndef NDEBUG
  ~BlockIter() override {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
  }
  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif

  bool IsKeyPinned() const override {
    return block_contents_pinned_ && key_pinned_;
  }

  bool IsValuePinned() const override { return block_contents_pinned_; }

  size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }

  uint32_t ValueOffset() const {
    return static_cast<uint32_t>(value_.data() - data_);
  }

  void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }

  Cache::Handle* cache_handle() { return cache_handle_; }

 protected:
  // Note: The type could be changed to InternalKeyComparator, but we
  // observed a weird performance drop when doing so.
  const Comparator* comparator_;
  const char* data_;       // underlying block contents
  uint32_t num_restarts_;  // Number of uint32_t entries in restart array

  // Index of restart block in which current_ or current_-1 falls
  uint32_t restart_index_;
  uint32_t restarts_;  // Offset of restart array (list of fixed32)
  // current_ is the offset in data_ of the current entry; >= restarts_ if
  // !Valid
  uint32_t current_;
  IterKey key_;
  Slice value_;
  Status status_;
  bool key_pinned_;
  // Whether the block data is guaranteed to outlive this iterator, and
  // as long as the cleanup functions are transferred to another class,
  // e.g. PinnableSlice, the pointer to the bytes will still be valid.
  bool block_contents_pinned_;
  SequenceNumber global_seqno_;

 private:
  // Store the cache handle, if the block is cached. We need this since the
  // only other place the handle is stored is as an argument to the Cleanable
  // function callback, which is hard to retrieve. When multiple value
  // PinnableSlices reference the block, they need the cache handle in order
  // to bump up the ref count.
  Cache::Handle* cache_handle_;

 public:
  // Return the offset in data_ just past the end of the current entry.
  inline uint32_t NextEntryOffset() const {
    // NOTE: We don't support blocks bigger than 2GB
    return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
  }

  uint32_t GetRestartPoint(uint32_t index) {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }

  void SeekToRestartPoint(uint32_t index) {
    key_.Clear();
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();

    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }

  void CorruptionError();

  template <typename DecodeKeyFunc>
  inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
                         uint32_t* index, const Comparator* comp);
};
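
// Illustrative layout of the restart array consulted by GetRestartPoint()
// above: restarts_ is the byte offset in data_ where the array of fixed32
// restart offsets begins. With restarts_ = 1000 and num_restarts_ = 3, the
// three entries occupy bytes [1000, 1012) of data_, and GetRestartPoint(1)
// decodes the fixed32 stored at data_ + 1004, which is the offset of the
// first key in the second restart interval.
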
class DataBlockIter final : public BlockIter<Slice> {
 public:
  DataBlockIter()
      : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
  DataBlockIter(const Comparator* comparator,
                const Comparator* user_comparator, const char* data,
                uint32_t restarts, uint32_t num_restarts,
                SequenceNumber global_seqno,
                BlockReadAmpBitmap* read_amp_bitmap,
                bool block_contents_pinned,
                DataBlockHashIndex* data_block_hash_index)
      : DataBlockIter() {
    Initialize(comparator, user_comparator, data, restarts, num_restarts,
               global_seqno, read_amp_bitmap, block_contents_pinned,
               data_block_hash_index);
  }
  void Initialize(const Comparator* comparator,
                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno,
                  BlockReadAmpBitmap* read_amp_bitmap,
                  bool block_contents_pinned,
                  DataBlockHashIndex* data_block_hash_index) {
    InitializeBase(comparator, data, restarts, num_restarts, global_seqno,
                   block_contents_pinned);
    user_comparator_ = user_comparator;
    key_.SetIsUserKey(false);
    read_amp_bitmap_ = read_amp_bitmap;
    last_bitmap_offset_ = current_ + 1;
    data_block_hash_index_ = data_block_hash_index;
  }

  Slice value() const override {
    assert(Valid());
    if (read_amp_bitmap_ && current_ < restarts_ &&
        current_ != last_bitmap_offset_) {
      read_amp_bitmap_->Mark(current_ /* current entry offset */,
                             NextEntryOffset() - 1);
      last_bitmap_offset_ = current_;
    }
    return value_;
  }

  void Seek(const Slice& target) override;

  inline bool SeekForGet(const Slice& target) {
    if (!data_block_hash_index_) {
      Seek(target);
      return true;
    }

    return SeekForGetImpl(target);
  }

  void SeekForPrev(const Slice& target) override;

  void Prev() override;

  void Next() final override;

  // Try to advance to the next entry in the block. If there is data
  // corruption or error, report it to the caller instead of aborting the
  // process. May incur higher CPU overhead because we need to perform a
  // check on every entry.
  void NextOrReport();

  void SeekToFirst() override;

  // Try to seek to the first entry in the block. If there is data corruption
  // or error, report it to the caller instead of aborting the process. May
  // incur higher CPU overhead because we need to perform a check on every
  // entry.
  void SeekToFirstOrReport();

  void SeekToLast() override;

  void Invalidate(Status s) {
    InvalidateBase(s);
    // Clear prev entries cache.
    prev_entries_keys_buff_.clear();
    prev_entries_.clear();
    prev_entries_idx_ = -1;
  }

 private:
  // read-amp bitmap
  BlockReadAmpBitmap* read_amp_bitmap_;
  // last `current_` value we report to the read-amp bitmap
  mutable uint32_t last_bitmap_offset_;
  struct CachedPrevEntry {
    explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
                             size_t _key_offset, size_t _key_size,
                             Slice _value)
        : offset(_offset),
          key_ptr(_key_ptr),
          key_offset(_key_offset),
          key_size(_key_size),
          value(_value) {}

    // offset of entry in block
    uint32_t offset;
    // Pointer to key data in block (nullptr if key is delta-encoded)
    const char* key_ptr;
    // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
    size_t key_offset;
    // size of key
    size_t key_size;
    // value slice pointing to data in block
    Slice value;
  };
  std::string prev_entries_keys_buff_;
  std::vector<CachedPrevEntry> prev_entries_;
  int32_t prev_entries_idx_ = -1;

  DataBlockHashIndex* data_block_hash_index_;
  const Comparator* user_comparator_;

  template <typename DecodeEntryFunc>
  inline bool ParseNextDataKey(const char* limit = nullptr);

  inline int Compare(const IterKey& ikey, const Slice& b) const {
    return comparator_->Compare(ikey.GetInternalKey(), b);
  }

  bool SeekForGetImpl(const Slice& target);
};
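
// Point-lookup sketch (illustrative; `block`, `icmp`, and
// `lookup_internal_key` are placeholder names, and the exact contract of
// SeekForGet()'s return value is an assumption here, defined by
// SeekForGetImpl()'s implementation):
//
//   DataBlockIter iter;
//   block.NewDataIterator(&icmp, icmp.user_comparator(), &iter);
//   if (!iter.SeekForGet(lookup_internal_key)) {
//     // The data block hash index concluded the key cannot be in this
//     // block, so the point lookup can stop early.
//   } else if (iter.Valid()) {
//     // iter.key()/iter.value() hold the candidate entry for the lookup.
//   }
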
class IndexBlockIter final : public BlockIter<IndexValue> {
 public:
  IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}

  Slice key() const override {
    assert(Valid());
    return key_.GetKey();
  }
  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  void Initialize(const Comparator* comparator,
                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
                  bool have_first_key, bool key_includes_seq,
                  bool value_is_full, bool block_contents_pinned) {
    InitializeBase(key_includes_seq ? comparator : user_comparator, data,
                   restarts, num_restarts, kDisableGlobalSequenceNumber,
                   block_contents_pinned);
    key_includes_seq_ = key_includes_seq;
    key_.SetIsUserKey(!key_includes_seq_);
    prefix_index_ = prefix_index;
    value_delta_encoded_ = !value_is_full;
    have_first_key_ = have_first_key;
    if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
      global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
    } else {
      global_seqno_state_.reset();
    }
  }

  Slice user_key() const override {
    if (key_includes_seq_) {
      return ExtractUserKey(key());
    }
    return key();
  }

  IndexValue value() const override {
    assert(Valid());
    if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
      return decoded_value_;
    } else {
      IndexValue entry;
      Slice v = value_;
      Status decode_s __attribute__((__unused__)) =
          entry.DecodeFrom(&v, have_first_key_, nullptr);
      assert(decode_s.ok());
      return entry;
    }
  }

  // IndexBlockIter follows a different contract for prefix iterators than
  // data iterators do.
  // If the prefix of the seek key `target` exists in the file, it must
  // return the same result as a total order seek.
  // If the prefix of `target` doesn't exist in the file, it can either
  // return the result of a total order seek, or set both Valid() = false
  // and status() = NotFound().
  void Seek(const Slice& target) override;

  void SeekForPrev(const Slice&) override {
    assert(false);
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::InvalidArgument(
        "RocksDB internal error: should never call SeekForPrev() on index "
        "blocks");
    key_.Clear();
    value_.clear();
  }

  void Prev() override;

  void Next() override;

  void SeekToFirst() override;

  void SeekToLast() override;

  void Invalidate(Status s) { InvalidateBase(s); }

  bool IsValuePinned() const override {
    return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
  }

 private:
  // Key is in InternalKey format
  bool key_includes_seq_;
  bool value_delta_encoded_;
  bool have_first_key_;  // value includes first_internal_key
  BlockPrefixIndex* prefix_index_;
  // When value_delta_encoded_ is true, the value is assumed to be a
  // BlockHandle. The first value in each restart interval is the full
  // encoded BlockHandle; the rest of the values in the interval only store
  // the size part of the BlockHandle. The offset of a delta encoded
  // BlockHandle is computed by adding the sizes of the previous delta
  // encoded values in the same restart interval to the offset of the first
  // value in that interval. decoded_value_ caches the value decoded from
  // the current entry.
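  // Worked example of the delta encoding described above (illustrative):
  // suppose one restart interval points at data blocks with handles
  // (offset=0, size=100), (offset=100, size=50), (offset=150, size=80). The
  // first index value stores the full handle (0, 100); the next two store
  // only the sizes 50 and 80, and their offsets are reconstructed as
  // 0 + 100 = 100 and 100 + 50 = 150.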
  IndexValue decoded_value_;

  // When sequence number overwriting is enabled, this struct contains the
  // seqno to overwrite with, and the current first_internal_key with the
  // overwritten seqno. This is rarely used, so we put it behind a pointer
  // and only allocate it when needed.
  struct GlobalSeqnoState {
    // First internal key according to current index entry, but with sequence
    // number overwritten to global_seqno.
    IterKey first_internal_key;
    SequenceNumber global_seqno;

    explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
  };

  std::unique_ptr<GlobalSeqnoState> global_seqno_state_;
  // Set *prefix_may_exist to false if no key can possibly share the same
  // prefix as `target`. If not set, the result position should be the same
  // as a total order Seek.
  bool PrefixSeek(const Slice& target, uint32_t* index,
                  bool* prefix_may_exist);
  // Set *prefix_may_exist to false if no key can possibly share the same
  // prefix as `target`. If not set, the result position should be the same
  // as a total order seek.
  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                            uint32_t left, uint32_t right, uint32_t* index,
                            bool* prefix_may_exist);
  inline int CompareBlockKey(uint32_t block_index, const Slice& target);

  inline int Compare(const Slice& a, const Slice& b) const {
    return comparator_->Compare(a, b);
  }

  inline int Compare(const IterKey& ikey, const Slice& b) const {
    return comparator_->Compare(ikey.GetKey(), b);
  }
  inline bool ParseNextIndexKey();

  // When value_delta_encoded_ is enabled, this decodes the value, which is
  // assumed to be a BlockHandle, and puts it into decoded_value_.
  inline void DecodeCurrentValue(uint32_t shared);
};

}  // namespace ROCKSDB_NAMESPACE