// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block_based/block.h"

#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_footer.h"
#include "table/format.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
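//
// For illustration, an entry whose three lengths all take the one-byte fast
// path (each < 128), e.g. shared = 2, non_shared = 3, value_length = 4, is
// laid out as:
//
//   [0x02][0x03][0x04][3-byte key delta][4-byte value]
//
// Lengths of 128 or more fall back to varint32 encoding. (This is a sketch
// implied by the decoding logic below, not a normative format spec.)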
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};

// Helper routine: similar to DecodeEntry but does not have assertions.
// Instead, returns nullptr so that caller can detect and report failure.
struct CheckAndDecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
      return nullptr;
    }
    return p;
  }
};

struct DecodeKey {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    uint32_t value_length;
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
  }
};

// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
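//
// For example (a sketch, mirroring the fast path above): a v4 key with
// shared = 5 and non_shared = 2 is encoded as [0x05][0x02][2-byte key
// delta], with no value-length byte at all.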
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) return nullptr;
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both values are encoded in one byte each
      p += 2;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    }
    return p;
  }
};

void DataBlockIter::NextImpl() { ParseNextDataKey<DecodeEntry>(); }

void DataBlockIter::NextOrReportImpl() {
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }

void IndexBlockIter::PrevImpl() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  // Loop until end of current entry hits the start of original entry
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
  }
}

// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
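// so that repeated Prev() calls within one restart interval can be served
// from the cache instead of re-parsing forward from the restart point each
// time. A sketch of the access pattern this optimizes:
//
//   iter->SeekToLast();
//   while (iter->Valid()) {
//     // ... use iter->key() / iter->value() ...
//     iter->Prev();  // later calls in the same interval hit prev_entries_
//   }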
void DataBlockIter::PrevImpl() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    bool raw_key_cached;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      raw_key_cached = false;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      raw_key_cached = true;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
    // not necessity. It is convenient since this class treats keys as pinned
    // when `raw_key_` points to an outside buffer. So we cannot allow
    // `raw_key_` to point into the Prev cache as it is a transient outside
    // buffer (i.e., keys in it are not actually pinned).
    raw_key_.SetKey(current_key, raw_key_cached /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = raw_key_.GetKey();

    if (raw_key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

void DataBlockIter::SeekImpl(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}

// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex, this function behaves identically to Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a greater user_key than seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_keys are less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
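//
// A worked example (illustrative keys only): if the block contains
// [..., axy@30] and target = axy@60, the linear scan below stops at axy@30
// (same user_key, seqno 30 <= 60), the iter is valid, and TRUE is returned.
// If instead the scan stops at a key with a different (hence greater)
// user_key, FALSE is returned: the key cannot be in this block, nor in the
// next one.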
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice target_user_key = ExtractUserKey(target);
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry =
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back
    SeekImpl(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:      [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumption about a boundary key)
    // Block N+1:    [axy@10, ... ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or at the end of the block if no key is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundary.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linearly seeking a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) ||
        CompareCurrentKey(target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reached the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (ucmp().Compare(raw_key_.GetUserKey(), target_user_key) != 0) {
    // the key is not in this block and cannot be at the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    SeekImpl(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}

void IndexBlockIter::SeekImpl(const Slice& target) {
  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  status_ = Status::OK();
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = false;
  if (prefix_index_) {
    bool prefix_may_exist = true;
    ok = PrefixSeek(target, &index, &prefix_may_exist);
    if (!prefix_may_exist) {
      // This is to let the caller distinguish between a non-existing prefix,
      // and a key that is larger than the last key, both of which set
      // Valid() to false.
      current_ = restarts_;
      status_ = Status::NotFound();
    }
    // restart interval must be one when hash search is enabled so the binary
    // search simply lands at the right place.
    skip_linear_scan = true;
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  }

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}

void DataBlockIter::SeekForPrevImpl(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);

  if (!Valid()) {
    SeekToLastImpl();
  } else {
    while (Valid() && CompareCurrentKey(seek_key) > 0) {
      PrevImpl();
    }
  }
}

void DataBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::SeekToFirstOrReportImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(0);
  ParseNextIndexKey();
}

void DataBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void IndexBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

template <class TValue>
void BlockIter<TValue>::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  raw_key_.Clear();
  value_.clear();
}

template <typename DecodeEntryFunc>
bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  if (!limit) {
    limit = data_ + restarts_;  // Restarts come right after data
  }

  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || raw_key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with prev key then we don't need
      // to decode it and can use its address in the block directly.
      raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
    } else {
      // This key shares `shared` bytes with prev key; we need to decode it
      raw_key_.TrimAppend(shared, p, non_shared);
    }

#ifndef NDEBUG
    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion,
      // kTypeDeletionWithTimestamp, or kTypeRangeDeletion.
      uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
      SequenceNumber seqno;
      ValueType value_type;
      UnPackSequenceAndType(packed, &seqno, &value_type);
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeDeletionWithTimestamp ||
             value_type == ValueType::kTypeRangeDeletion);
      assert(seqno == 0);
    }
#endif  // NDEBUG

    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}

bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  if (p == nullptr || raw_key_.Size() < shared) {
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key doesn't share any bytes with prev key then we don't need
    // to decode it and can use its address in the block directly.
    raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
  } else {
    // This key shares `shared` bytes with prev key; we need to decode it
    raw_key_.TrimAppend(shared, p, non_shared);
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
    DecodeCurrentValue(shared);
  }
  return true;
}

// The format:
// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where k is key, v is value, and its encoding is in parentheses.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
// Otherwise the format is delta-size = the size of the current block handle
// minus the size of the last block handle.
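//
// For example (a sketch; the exact offset/trailer arithmetic lives in the
// IndexValue/BlockHandle decoding, not here): if a restart entry stores the
// full handle (offset=0, size=4096) and the following data block has the
// same size, the next entry stores only delta-size = 0; the new offset is
// recomputed from the previous handle rather than stored.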
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  Slice v(value_.data(), data_ + restarts_ - value_.data());
  // Delta encoding is used if `shared` != 0.
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
      &v, have_first_key_,
      (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
  assert(decode_s.ok());
  value_ = Slice(value_.data(), v.data() - value_.data());

  if (global_seqno_state_ != nullptr) {
    // Overwrite sequence number the same way as in DataBlockIter.

    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
                                      /* copy */ true);

    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);

    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
    assert(value_type == ValueType::kTypeValue ||
           value_type == ValueType::kTypeMerge ||
           value_type == ValueType::kTypeDeletion ||
           value_type == ValueType::kTypeRangeDeletion);

    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
                                         value_type);
    decoded_value_.first_internal_key = first_internal_key.GetKey();
  }
}

template <class TValue>
void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
                                               uint32_t index,
                                               bool skip_linear_scan) {
  // SeekToRestartPoint() only does the lookup in the restart block. We need
  // to follow it up with NextImpl() to position the iterator at the restart
  // key.
  SeekToRestartPoint(index);
  NextImpl();

  if (!skip_linear_scan) {
    // Linear search (within restart block) for first key >= target
    uint32_t max_offset;
    if (index + 1 < num_restarts_) {
      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
      // the next restart key is strictly greater than `target`, we can
      // terminate upon reaching it without any additional key comparison.
      max_offset = GetRestartPoint(index + 1);
    } else {
      // We are in the last restart interval. The while-loop will terminate by
      // `Valid()` returning false upon advancing past the block's last key.
      max_offset = port::kMaxUint32;
    }
    while (true) {
      NextImpl();
      if (!Valid()) {
        break;
      }
      if (current_ == max_offset) {
        assert(CompareCurrentKey(target) > 0);
        break;
      } else if (CompareCurrentKey(target) >= 0) {
        break;
      }
    }
  }
}

// Binary searches in restart array to find the starting restart point for the
// linear scan, and stores it in `*index`. Assumes restart array does not
// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
// is strictly greater than `target` or does not exist (this can be used to
// elide a comparison when linear scan reaches all the way to the next restart
// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
// `*index`th restart key is the final result so that key does not need to be
// compared again later.
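//
// For example (illustrative keys only): with restart keys [b, d, f] and
// target "e", the search settles on the restart key "d" (`*index` = 1,
// `*skip_linear_scan` = false); the caller then scans the "d" interval
// linearly and may stop at "f" without comparing it, since "f" is
// guaranteed to be strictly greater than "e".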
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
                                   bool* skip_linear_scan) {
  if (restarts_ == 0) {
    // SST files dedicated to range tombstones are written with index blocks
    // that have no keys while also having `num_restarts_ == 1`. This would
    // cause a problem for `BinarySeek()` as it'd try to access the first key
    // which does not exist. We identify such blocks by the offset at which
    // their restarts are stored, and return false to prevent any attempted
    // key accesses.
    return false;
  }

  *skip_linear_scan = false;
  // Loop invariants:
  // - Restart key at index `left` is less than or equal to the target key. The
  //   sentinel index `-1` is considered to have a key that is less than all
  //   keys.
  // - Any restart keys after index `right` are strictly greater than the
  //   target key.
  int64_t left = -1, right = num_restarts_ - 1;
  while (left != right) {
    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
    int64_t mid = left + (right - left + 1) / 2;
    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    raw_key_.SetKey(mid_key, false /* copy */);
    int cmp = CompareCurrentKey(target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      *skip_linear_scan = true;
      left = right = mid;
    }
  }

  if (left == -1) {
    // All keys in the block were strictly greater than `target`. So the very
    // first key in the block is the final seek result.
    *skip_linear_scan = true;
    *index = 0;
  } else {
    *index = static_cast<uint32_t>(left);
  }
  return true;
}

// Compare target key and the block key of the block of `block_index`.
// Returns 1 on error (so the target compares as smaller); callers are
// expected to check status_ afterwards.
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared;
  const char* key_ptr =
      value_delta_encoded_
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
                          &non_shared)
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
                        &non_shared);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  raw_key_.SetKey(block_key, false /* copy */);
  return CompareCurrentKey(target);
}

// Binary search in block_ids to find the first block
// with a key >= target
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index,
                                          bool* prefix_may_exist) {
  assert(left <= right);
  assert(index);
  assert(prefix_may_exist);
  *prefix_may_exist = true;
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "mid" is >= "target". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between the block of `left` and `left-1`.
    // we can further distinguish the case of the key being in the block from
    // the key not existing, by comparing the target key against the key of
    // the previous block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      *prefix_may_exist = false;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);

    // If the next block key is larger than the seek key, it is possible that
    // no key shares the prefix with `target`, or that all keys with the same
    // prefix as `target` are smaller than the seek key. In the latter case,
    // we are required to set the position to the total order seek result,
    // and then either:
    // (1) `target` falls into the range of the next block. In this case,
    //     we can place the iterator on the next block, or
    // (2) `target` is larger than all block keys. In this case we can
    //     keep the iterator invalidated without setting `prefix_may_exist`
    //     to false.
    // We might sometimes end up with setting the total order position
    // while there is no key sharing the prefix with `target`, but it
    // still follows the contract.
    uint32_t right_index = block_ids[right];
    assert(right_index + 1 <= num_restarts_);
    if (right_index + 1 < num_restarts_) {
      if (CompareBlockKey(right_index + 1, target) >= 0) {
        *index = right_index + 1;
        return true;
      } else {
        // We have to set the flag here because we are not positioning
        // the iterator to the total order position.
        *prefix_may_exist = false;
      }
    }

    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
                                bool* prefix_may_exist) {
  assert(index);
  assert(prefix_may_exist);
  assert(prefix_index_);
  *prefix_may_exist = true;
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    *prefix_may_exist = false;
    return false;
  } else {
    assert(block_ids);
    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
                                prefix_may_exist);
  }
}

uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // In BlockBuilder, we have ensured a block with HashIndex is less than
    // kMaxBlockSizeSupportedByHashIndex (64KiB).
    //
    // Therefore, if we encounter a block with a size > 64KiB, the block
    // cannot have HashIndex. So the footer will be directly interpreted as
    // num_restarts.
    //
    // This check is for backward compatibility: it ensures a legacy block
    // with a very large num_restarts, i.e. >= 0x80000000, is interpreted
    // correctly as having no HashIndex even though the MSB of num_restarts
    // is set.
    return num_restarts;
  }
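  // For example (assuming the MSB carries the index type, as the unpacking
  // below implies): a footer of 0x80000005 would decode to
  // kDataBlockBinaryAndHash with 5 restarts, while 0x00000005 is a plain
  // binary-search block with 5 restarts.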
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return num_restarts;
}

BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // The check is for the same reason as that in NumRestarts()
    return BlockBasedTableOptions::kDataBlockBinarySearch;
  }
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return index_type;
}

Block::~Block() {
  // This sync point can be re-enabled if RocksDB can control the
  // initialization order of any/all static options created by the user.
  // TEST_SYNC_POINT("Block::~Block");
}

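// Memory layout decoded by the constructor below (a sketch; the restart
// array immediately follows the entries, and the last 4 bytes are the
// packed footer):
//
//   [entry 0][entry 1]...[entry m-1]
//   [restart 0 (4B)]...[restart n-1 (4B)]
//   [optional hash index (kDataBlockBinaryAndHash only)]
//   [footer: packed index_type + num_restarts (4B)]
//
// hence restart_offset_ = size_ - (1 + num_restarts_) * sizeof(uint32_t)
// for binary-search blocks.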
Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
             Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      num_restarts_(0) {
  TEST_SYNC_POINT("Block::Block:0");
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    // Should only decode restart points for uncompressed blocks
    num_restarts_ = NumRestarts();
    switch (IndexType()) {
      case BlockBasedTableOptions::kDataBlockBinarySearch:
        restart_offset_ = static_cast<uint32_t>(size_) -
                          (1 + num_restarts_) * sizeof(uint32_t);
        if (restart_offset_ > size_ - sizeof(uint32_t)) {
          // The size is too small for NumRestarts() and therefore
          // restart_offset_ wrapped around.
          size_ = 0;
        }
        break;
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
        if (size_ < sizeof(uint32_t) /* block footer */ +
                        sizeof(uint16_t) /* NUM_BUCK */) {
          size_ = 0;
          break;
        }

        uint16_t map_offset;
        data_block_hash_index_.Initialize(
            contents.data.data(),
            static_cast<uint16_t>(contents.data.size() -
                                  sizeof(uint32_t)),  // chop off NUM_RESTARTS
            &map_offset);

        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);

        if (restart_offset_ > map_offset) {
          // map_offset is too small for NumRestarts() and
          // therefore restart_offset_ wrapped around.
          size_ = 0;
          break;
        }
        break;
      default:
        size_ = 0;  // Error marker
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}

DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
                                      SequenceNumber global_seqno,
                                      DataBlockIter* iter, Statistics* stats,
                                      bool block_contents_pinned) {
  DataBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new DataBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    ret_iter->Initialize(
        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
        read_amp_bitmap_.get(), block_contents_pinned,
        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return ret_iter;
}

IndexBlockIter* Block::NewIndexIterator(
    const Comparator* raw_ucmp, SequenceNumber global_seqno,
    IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
    bool have_first_key, bool key_includes_seq, bool value_is_full,
    bool block_contents_pinned, BlockPrefixIndex* prefix_index) {
  IndexBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new IndexBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index;
    ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_,
                         global_seqno, prefix_index_ptr, have_first_key,
                         key_includes_seq, value_is_full,
                         block_contents_pinned);
  }

  return ret_iter;
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  if (read_amp_bitmap_) {
    usage += read_amp_bitmap_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace ROCKSDB_NAMESPACE