1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 //
10 // Decodes the blocks generated by block_builder.cc.
11
12 #include "table/block_based/block.h"
13 #include <algorithm>
14 #include <string>
15 #include <unordered_map>
16 #include <vector>
17
18 #include "logging/logging.h"
19 #include "monitoring/perf_context_imp.h"
20 #include "port/port.h"
21 #include "port/stack_trace.h"
22 #include "rocksdb/comparator.h"
23 #include "table/block_based/block_prefix_index.h"
24 #include "table/block_based/data_block_footer.h"
25 #include "table/format.h"
26 #include "util/coding.h"
27
28 namespace ROCKSDB_NAMESPACE {
29
30 // Helper routine: decode the next block entry starting at "p",
31 // storing the number of shared key bytes, non_shared key bytes,
32 // and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
34 //
35 // If any errors are detected, returns nullptr. Otherwise, returns a
36 // pointer to the key delta (just past the three decoded values).
37 struct DecodeEntry {
operator ()ROCKSDB_NAMESPACE::DecodeEntry38 inline const char* operator()(const char* p, const char* limit,
39 uint32_t* shared, uint32_t* non_shared,
40 uint32_t* value_length) {
41 // We need 2 bytes for shared and non_shared size. We also need one more
42 // byte either for value size or the actual value in case of value delta
43 // encoding.
44 assert(limit - p >= 3);
45 *shared = reinterpret_cast<const unsigned char*>(p)[0];
46 *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
47 *value_length = reinterpret_cast<const unsigned char*>(p)[2];
48 if ((*shared | *non_shared | *value_length) < 128) {
49 // Fast path: all three values are encoded in one byte each
50 p += 3;
51 } else {
52 if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
53 if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
54 if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
55 return nullptr;
56 }
57 }
58
59 // Using an assert in place of "return null" since we should not pay the
60 // cost of checking for corruption on every single key decoding
61 assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
62 return p;
63 }
64 };
65
66 // Helper routine: similar to DecodeEntry but does not have assertions.
67 // Instead, returns nullptr so that caller can detect and report failure.
68 struct CheckAndDecodeEntry {
operator ()ROCKSDB_NAMESPACE::CheckAndDecodeEntry69 inline const char* operator()(const char* p, const char* limit,
70 uint32_t* shared, uint32_t* non_shared,
71 uint32_t* value_length) {
72 // We need 2 bytes for shared and non_shared size. We also need one more
73 // byte either for value size or the actual value in case of value delta
74 // encoding.
75 if (limit - p < 3) {
76 return nullptr;
77 }
78 *shared = reinterpret_cast<const unsigned char*>(p)[0];
79 *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
80 *value_length = reinterpret_cast<const unsigned char*>(p)[2];
81 if ((*shared | *non_shared | *value_length) < 128) {
82 // Fast path: all three values are encoded in one byte each
83 p += 3;
84 } else {
85 if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
86 if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
87 if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
88 return nullptr;
89 }
90 }
91
92 if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
93 return nullptr;
94 }
95 return p;
96 }
97 };
98
99 struct DecodeKey {
operator ()ROCKSDB_NAMESPACE::DecodeKey100 inline const char* operator()(const char* p, const char* limit,
101 uint32_t* shared, uint32_t* non_shared) {
102 uint32_t value_length;
103 return DecodeEntry()(p, limit, shared, non_shared, &value_length);
104 }
105 };
106
107 // In format_version 4, which is used by index blocks, the value size is not
108 // encoded before the entry, as the value is known to be the handle with the
109 // known size.
110 struct DecodeKeyV4 {
operator ()ROCKSDB_NAMESPACE::DecodeKeyV4111 inline const char* operator()(const char* p, const char* limit,
112 uint32_t* shared, uint32_t* non_shared) {
113 // We need 2 bytes for shared and non_shared size. We also need one more
114 // byte either for value size or the actual value in case of value delta
115 // encoding.
116 if (limit - p < 3) return nullptr;
117 *shared = reinterpret_cast<const unsigned char*>(p)[0];
118 *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
119 if ((*shared | *non_shared) < 128) {
120 // Fast path: all three values are encoded in one byte each
121 p += 2;
122 } else {
123 if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
124 if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
125 }
126 return p;
127 }
128 };
129
Next()130 void DataBlockIter::Next() {
131 assert(Valid());
132 ParseNextDataKey<DecodeEntry>();
133 }
134
NextOrReport()135 void DataBlockIter::NextOrReport() {
136 assert(Valid());
137 ParseNextDataKey<CheckAndDecodeEntry>();
138 }
139
Next()140 void IndexBlockIter::Next() {
141 assert(Valid());
142 ParseNextIndexKey();
143 }
144
Prev()145 void IndexBlockIter::Prev() {
146 assert(Valid());
147 // Scan backwards to a restart point before current_
148 const uint32_t original = current_;
149 while (GetRestartPoint(restart_index_) >= original) {
150 if (restart_index_ == 0) {
151 // No more entries
152 current_ = restarts_;
153 restart_index_ = num_restarts_;
154 return;
155 }
156 restart_index_--;
157 }
158 SeekToRestartPoint(restart_index_);
159 // Loop until end of current entry hits the start of original entry
160 while (ParseNextIndexKey() && NextEntryOffset() < original) {
161 }
162 }
163
164 // Similar to IndexBlockIter::Prev but also caches the prev entries
// Moves to the previous entry. Unlike IndexBlockIter::Prev, this caches all
// entries parsed while rewinding (in prev_entries_) so that a run of
// consecutive Prev() calls only re-parses each restart interval once.
void DataBlockIter::Prev() {
  assert(Valid());

  // prev_entries_idx_ == -1 means the cache is inactive/cleared.
  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      key_pinned_ = true;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      key_pinned_ = false;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    key_.SetKey(current_key, false /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Cache miss: clear prev entries cache before rebuilding it below.
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries; mark the iterator invalid.
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  // Re-parse forward from the restart point, caching every entry seen so
  // that subsequent Prev() calls can be served from prev_entries_.
  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = key();

    if (key_.IsKeyPinned()) {
      // The key is not delta encoded: cache a pointer into the block.
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded: cache the decoded bytes in the buffer.
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  // Position the cache index on the entry just before `original`.
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}
238
Seek(const Slice & target)239 void DataBlockIter::Seek(const Slice& target) {
240 Slice seek_key = target;
241 PERF_TIMER_GUARD(block_seek_nanos);
242 if (data_ == nullptr) { // Not init yet
243 return;
244 }
245 uint32_t index = 0;
246 bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
247 comparator_);
248
249 if (!ok) {
250 return;
251 }
252 SeekToRestartPoint(index);
253
254 // Linear search (within restart block) for first key >= target
255 while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
256 }
257 }
258
259 // Optimized Seek for point lookup for an internal key `target`
260 // target = "seek_user_key @ type | seqno".
261 //
262 // For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
263 // or kTypeBlobIndex, this function behaves identically as Seek().
264 //
265 // For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
266 // or kTypeBlobIndex:
267 //
268 // If the return value is FALSE, iter location is undefined, and it means:
269 // 1) there is no key in this block falling into the range:
270 // ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
271 // inclusive; AND
272 // 2) the last key of this block has a greater user_key from seek_user_key
273 //
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key_ with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_keys are
//    less than the seek_user_key, or the block ends with a matching
//    user_key but with a smaller [ type | seqno ] (i.e. a larger seqno,
//    or the same seqno but larger type).
// Point-lookup seek via the data block hash index; see the contract in the
// comment block above. Assumes data_block_hash_index_ is set by the caller.
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice target_user_key = ExtractUserKey(target);
  // The hash index map is laid out right after the restart array.
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry =
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back to a full binary-search Seek.
    Seek(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:        [aab@100, ... , app@120]
    // boundary key:   axy@50 (we make minimal assumption about a boundary key)
    // Block N+1:      [axy@10, ... ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or to the end of the block if no one is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  // Limit the linear scan below to this restart interval only.
  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundary.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seek a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reached the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) {
    // the key is not in this block and cannot be at the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases;
  // any other value type falls back to a regular Seek().
  ValueType value_type = ExtractValueType(key_.GetKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    Seek(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}
376
Seek(const Slice & target)377 void IndexBlockIter::Seek(const Slice& target) {
378 TEST_SYNC_POINT("IndexBlockIter::Seek:0");
379 Slice seek_key = target;
380 if (!key_includes_seq_) {
381 seek_key = ExtractUserKey(target);
382 }
383 PERF_TIMER_GUARD(block_seek_nanos);
384 if (data_ == nullptr) { // Not init yet
385 return;
386 }
387 status_ = Status::OK();
388 uint32_t index = 0;
389 bool ok = false;
390 if (prefix_index_) {
391 bool prefix_may_exist = true;
392 ok = PrefixSeek(target, &index, &prefix_may_exist);
393 if (!prefix_may_exist) {
394 // This is to let the caller to distinguish between non-existing prefix,
395 // and when key is larger than the last key, which both set Valid() to
396 // false.
397 current_ = restarts_;
398 status_ = Status::NotFound();
399 }
400 } else if (value_delta_encoded_) {
401 ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
402 comparator_);
403 } else {
404 ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
405 comparator_);
406 }
407
408 if (!ok) {
409 return;
410 }
411 SeekToRestartPoint(index);
412
413 // Linear search (within restart block) for first key >= target
414 while (ParseNextIndexKey() && Compare(key_, seek_key) < 0) {
415 }
416 }
417
SeekForPrev(const Slice & target)418 void DataBlockIter::SeekForPrev(const Slice& target) {
419 PERF_TIMER_GUARD(block_seek_nanos);
420 Slice seek_key = target;
421 if (data_ == nullptr) { // Not init yet
422 return;
423 }
424 uint32_t index = 0;
425 bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
426 comparator_);
427
428 if (!ok) {
429 return;
430 }
431 SeekToRestartPoint(index);
432
433 // Linear search (within restart block) for first key >= seek_key
434 while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
435 }
436 if (!Valid()) {
437 SeekToLast();
438 } else {
439 while (Valid() && Compare(key_, seek_key) > 0) {
440 Prev();
441 }
442 }
443 }
444
SeekToFirst()445 void DataBlockIter::SeekToFirst() {
446 if (data_ == nullptr) { // Not init yet
447 return;
448 }
449 SeekToRestartPoint(0);
450 ParseNextDataKey<DecodeEntry>();
451 }
452
SeekToFirstOrReport()453 void DataBlockIter::SeekToFirstOrReport() {
454 if (data_ == nullptr) { // Not init yet
455 return;
456 }
457 SeekToRestartPoint(0);
458 ParseNextDataKey<CheckAndDecodeEntry>();
459 }
460
SeekToFirst()461 void IndexBlockIter::SeekToFirst() {
462 if (data_ == nullptr) { // Not init yet
463 return;
464 }
465 status_ = Status::OK();
466 SeekToRestartPoint(0);
467 ParseNextIndexKey();
468 }
469
SeekToLast()470 void DataBlockIter::SeekToLast() {
471 if (data_ == nullptr) { // Not init yet
472 return;
473 }
474 SeekToRestartPoint(num_restarts_ - 1);
475 while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
476 // Keep skipping
477 }
478 }
479
SeekToLast()480 void IndexBlockIter::SeekToLast() {
481 if (data_ == nullptr) { // Not init yet
482 return;
483 }
484 status_ = Status::OK();
485 SeekToRestartPoint(num_restarts_ - 1);
486 while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
487 // Keep skipping
488 }
489 }
490
491 template <class TValue>
CorruptionError()492 void BlockIter<TValue>::CorruptionError() {
493 current_ = restarts_;
494 restart_index_ = num_restarts_;
495 status_ = Status::Corruption("bad entry in block");
496 key_.Clear();
497 value_.clear();
498 }
499
// Parses the entry at NextEntryOffset() into key_/value_ and keeps
// restart_index_ in sync. Scanning stops at `limit` if provided, otherwise
// at the start of the restart array. Returns false, invalidating the
// iterator, at end-of-block or (via CorruptionError) on a bad entry.
// DecodeEntryFunc selects assert-based (DecodeEntry) or status-based
// (CheckAndDecodeEntry) corruption handling.
template <typename DecodeEntryFunc>
bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  if (!limit) {
    limit = data_ + restarts_;  // Restarts come right after data
  }

  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  // key_.Size() < shared means the entry claims to share more bytes with the
  // previous key than that key actually has: corruption.
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key dont share any bytes with prev key then we dont need
      // to decode it and can use it's address in the block directly.
      key_.SetKey(Slice(p, non_shared), false /* copy */);
      key_pinned_ = true;
    } else {
      // This key share `shared` bytes with prev key, we need to decode it
      key_.TrimAppend(shared, p, non_shared);
      key_pinned_ = false;
    }

    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
      assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);

      ValueType value_type = ExtractValueType(key_.GetKey());
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeRangeDeletion);

      if (key_pinned_) {
        // TODO(tec): Investigate updating the seqno in the loaded block
        // directly instead of doing a copy and update.

        // We cannot use the key address in the block directly because
        // we have a global_seqno_ that will overwrite the encoded one.
        key_.OwnKey();
        key_pinned_ = false;
      }

      key_.UpdateInternalKey(global_seqno_, value_type);
    }

    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      // Entry starts a restart interval: advance restart_index_ to the
      // interval containing current_.
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}
570
// Parses the index entry at NextEntryOffset() into key_/value_ and keeps
// restart_index_ in sync. Returns false, invalidating the iterator, at the
// end of the block or (via CorruptionError) on a bad entry.
bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    // format_version 4: no value length is stored with the key;
    // DecodeCurrentValue() below parses the handle directly.
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  // key_.Size() < shared means the entry claims to share more bytes with
  // the previous key than that key actually has: corruption.
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key dont share any bytes with prev key then we dont need
    // to decode it and can use it's address in the block directly.
    key_.SetKey(Slice(p, non_shared), false /* copy */);
    key_pinned_ = true;
  } else {
    // This key share `shared` bytes with prev key, we need to decode it
    key_.TrimAppend(shared, p, non_shared);
    key_pinned_ = false;
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    // Entry starts a restart interval: advance restart_index_ to the
    // interval containing current_.
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
    DecodeCurrentValue(shared);
  }
  return true;
}
618
619 // The format:
620 // restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
621 // restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
622 // ...
623 // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
624 // where, k is key, v is value, and its encoding is in parenthesis.
625 // The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
628 // Otherwise the format is delta-size = block handle size - size of last block
629 // handle.
// Decodes the current entry's value (a block handle, and optionally a first
// internal key) into decoded_value_, trimming value_ to the bytes consumed.
// When a global seqno applies, rewrites the first internal key's seqno.
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  // All bytes from the value start up to the restart array are candidates.
  Slice v(value_.data(), data_ + restarts_ - value_.data());
  // Delta encoding is used if `shared` != 0.
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
      &v, have_first_key_,
      (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
  assert(decode_s.ok());
  // DecodeFrom() advanced v; keep only the consumed prefix as value_.
  value_ = Slice(value_.data(), v.data() - value_.data());

  if (global_seqno_state_ != nullptr) {
    // Overwrite sequence number the same way as in DataBlockIter.

    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
                                      /* copy */ true);

    // Files carrying a global seqno store all embedded seqnos as zero.
    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);

    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
    assert(value_type == ValueType::kTypeValue ||
           value_type == ValueType::kTypeMerge ||
           value_type == ValueType::kTypeDeletion ||
           value_type == ValueType::kTypeRangeDeletion);

    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
                                         value_type);
    decoded_value_.first_internal_key = first_internal_key.GetKey();
  }
}
659
660 // Binary search in restart array to find the first restart point that
661 // is either the last restart point with a key less than target,
662 // which means the key of next restart point is larger than target, or
663 // the first restart point with a key = target
664 template <class TValue>
665 template <typename DecodeKeyFunc>
BinarySeek(const Slice & target,uint32_t left,uint32_t right,uint32_t * index,const Comparator * comp)666 bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left,
667 uint32_t right, uint32_t* index,
668 const Comparator* comp) {
669 assert(left <= right);
670
671 while (left < right) {
672 uint32_t mid = (left + right + 1) / 2;
673 uint32_t region_offset = GetRestartPoint(mid);
674 uint32_t shared, non_shared;
675 const char* key_ptr = DecodeKeyFunc()(
676 data_ + region_offset, data_ + restarts_, &shared, &non_shared);
677 if (key_ptr == nullptr || (shared != 0)) {
678 CorruptionError();
679 return false;
680 }
681 Slice mid_key(key_ptr, non_shared);
682 int cmp = comp->Compare(mid_key, target);
683 if (cmp < 0) {
684 // Key at "mid" is smaller than "target". Therefore all
685 // blocks before "mid" are uninteresting.
686 left = mid;
687 } else if (cmp > 0) {
688 // Key at "mid" is >= "target". Therefore all blocks at or
689 // after "mid" are uninteresting.
690 right = mid - 1;
691 } else {
692 left = right = mid;
693 }
694 }
695
696 *index = left;
697 return true;
698 }
699
700 // Compare target key and the block key of the block of `block_index`.
701 // Return -1 if error.
CompareBlockKey(uint32_t block_index,const Slice & target)702 int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
703 uint32_t region_offset = GetRestartPoint(block_index);
704 uint32_t shared, non_shared;
705 const char* key_ptr =
706 value_delta_encoded_
707 ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
708 &non_shared)
709 : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
710 &non_shared);
711 if (key_ptr == nullptr || (shared != 0)) {
712 CorruptionError();
713 return 1; // Return target is smaller
714 }
715 Slice block_key(key_ptr, non_shared);
716 return Compare(block_key, target);
717 }
718
719 // Binary search in block_ids to find the first block
720 // with a key >= target
// Binary search over `block_ids` (the candidate restart points for a prefix)
// for the first block with a key >= target. Returns true with *index set on
// success; returns false with the iterator invalidated when no candidate
// can contain the key, clearing *prefix_may_exist when the prefix provably
// does not exist.
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index,
                                          bool* prefix_may_exist) {
  assert(left <= right);
  assert(index);
  assert(prefix_may_exist);
  *prefix_may_exist = true;
  uint32_t left_bound = left;

  // Standard lower-bound search over the candidate block ids.
  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      // CompareBlockKey() hit corruption.
      return false;
    }
    if (cmp < 0) {
      // Key at "target" is larger than "mid". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "target" is <= "mid". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between block of `left` and `left-1`.
    // we can further distinguish the case of key in the block or key not
    // existing, by comparing the target key and the key of the previous
    // block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      // The previous (non-candidate) block already covers keys > target,
      // so the prefix cannot exist.
      current_ = restarts_;
      *prefix_may_exist = false;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);

    // If the next block key is larger than seek key, it is possible that
    // no key shares the prefix with `target`, or all keys with the same
    // prefix as `target` are smaller than `target`. In the latter case,
    // we are mandated to set the position the same as the total order.
    // In the latter case, either:
    // (1) `target` falls into the range of the next block. In this case,
    //     we can place the iterator to the next block, or
    // (2) `target` is larger than all block keys. In this case we can
    //     keep the iterator invalidate without setting `prefix_may_exist`
    //     to false.
    // We might sometimes end up with setting the total order position
    // while there is no key sharing the prefix as `target`, but it
    // still follows the contract.
    uint32_t right_index = block_ids[right];
    assert(right_index + 1 <= num_restarts_);
    if (right_index + 1 < num_restarts_) {
      if (CompareBlockKey(right_index + 1, target) >= 0) {
        *index = right_index + 1;
        return true;
      } else {
        // We have to set the flag here because we are not positioning
        // the iterator to the total order position.
        *prefix_may_exist = false;
      }
    }

    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}
802
PrefixSeek(const Slice & target,uint32_t * index,bool * prefix_may_exist)803 bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
804 bool* prefix_may_exist) {
805 assert(index);
806 assert(prefix_may_exist);
807 assert(prefix_index_);
808 *prefix_may_exist = true;
809 Slice seek_key = target;
810 if (!key_includes_seq_) {
811 seek_key = ExtractUserKey(target);
812 }
813 uint32_t* block_ids = nullptr;
814 uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
815
816 if (num_blocks == 0) {
817 current_ = restarts_;
818 *prefix_may_exist = false;
819 return false;
820 } else {
821 assert(block_ids);
822 return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
823 prefix_may_exist);
824 }
825 }
826
NumRestarts() const827 uint32_t Block::NumRestarts() const {
828 assert(size_ >= 2 * sizeof(uint32_t));
829 uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
830 uint32_t num_restarts = block_footer;
831 if (size_ > kMaxBlockSizeSupportedByHashIndex) {
832 // In BlockBuilder, we have ensured a block with HashIndex is less than
833 // kMaxBlockSizeSupportedByHashIndex (64KiB).
834 //
835 // Therefore, if we encounter a block with a size > 64KiB, the block
836 // cannot have HashIndex. So the footer will directly interpreted as
837 // num_restarts.
838 //
839 // Such check is for backward compatibility. We can ensure legacy block
840 // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
841 // correctly as no HashIndex even if the MSB of num_restarts is set.
842 return num_restarts;
843 }
844 BlockBasedTableOptions::DataBlockIndexType index_type;
845 UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
846 return num_restarts;
847 }
848
IndexType() const849 BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
850 assert(size_ >= 2 * sizeof(uint32_t));
851 if (size_ > kMaxBlockSizeSupportedByHashIndex) {
852 // The check is for the same reason as that in NumRestarts()
853 return BlockBasedTableOptions::kDataBlockBinarySearch;
854 }
855 uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
856 uint32_t num_restarts = block_footer;
857 BlockBasedTableOptions::DataBlockIndexType index_type;
858 UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
859 return index_type;
860 }
861
// Destructor: member destructors release contents_ and the read-amp bitmap;
// nothing manual is needed here.
Block::~Block() {
  // This sync point can be re-enabled if RocksDB can control the
  // initialization order of any/all static options created by the user.
  // TEST_SYNC_POINT("Block::~Block");
}
867
// Takes ownership of `contents` and decodes the block footer to locate the
// restart array (and hash index, when present). Any inconsistency sets
// size_ = 0 as an error marker.
Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
             size_t read_amp_bytes_per_bit, Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      num_restarts_(0),
      global_seqno_(_global_seqno) {
  TEST_SYNC_POINT("Block::Block:0");
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    // Should only decode restart points for uncompressed blocks
    num_restarts_ = NumRestarts();
    switch (IndexType()) {
      case BlockBasedTableOptions::kDataBlockBinarySearch:
        // Restart array sits at the end, just before the footer word.
        restart_offset_ = static_cast<uint32_t>(size_) -
                          (1 + num_restarts_) * sizeof(uint32_t);
        if (restart_offset_ > size_ - sizeof(uint32_t)) {
          // The size is too small for NumRestarts() and therefore
          // restart_offset_ wrapped around.
          size_ = 0;
        }
        break;
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
        if (size_ < sizeof(uint32_t) /* block footer */ +
                        sizeof(uint16_t) /* NUM_BUCK */) {
          size_ = 0;
          break;
        }

        // NOTE(review): `contents` was moved above, but this still reads
        // `contents.data` — presumably safe because Slice is a non-owning
        // view whose fields survive the move (it aliases contents_.data,
        // per the data_/size_ initializers). Confirm against BlockContents.
        uint16_t map_offset;
        data_block_hash_index_.Initialize(
            contents.data.data(),
            static_cast<uint16_t>(contents.data.size() -
                                  sizeof(uint32_t)), /*chop off
                                                 NUM_RESTARTS*/
            &map_offset);

        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);

        if (restart_offset_ > map_offset) {
          // map_offset is too small for NumRestarts() and
          // therefore restart_offset_ wrapped around.
          size_ = 0;
          break;
        }
        break;
      default:
        size_ = 0;  // Error marker
    }
  }
  // Only track read amplification for valid blocks when requested.
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}
925
NewDataIterator(const Comparator * cmp,const Comparator * ucmp,DataBlockIter * iter,Statistics * stats,bool block_contents_pinned)926 DataBlockIter* Block::NewDataIterator(const Comparator* cmp,
927 const Comparator* ucmp,
928 DataBlockIter* iter, Statistics* stats,
929 bool block_contents_pinned) {
930 DataBlockIter* ret_iter;
931 if (iter != nullptr) {
932 ret_iter = iter;
933 } else {
934 ret_iter = new DataBlockIter;
935 }
936 if (size_ < 2 * sizeof(uint32_t)) {
937 ret_iter->Invalidate(Status::Corruption("bad block contents"));
938 return ret_iter;
939 }
940 if (num_restarts_ == 0) {
941 // Empty block.
942 ret_iter->Invalidate(Status::OK());
943 return ret_iter;
944 } else {
945 ret_iter->Initialize(
946 cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_,
947 read_amp_bitmap_.get(), block_contents_pinned,
948 data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
949 if (read_amp_bitmap_) {
950 if (read_amp_bitmap_->GetStatistics() != stats) {
951 // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
952 read_amp_bitmap_->SetStatistics(stats);
953 }
954 }
955 }
956
957 return ret_iter;
958 }
959
NewIndexIterator(const Comparator * cmp,const Comparator * ucmp,IndexBlockIter * iter,Statistics *,bool total_order_seek,bool have_first_key,bool key_includes_seq,bool value_is_full,bool block_contents_pinned,BlockPrefixIndex * prefix_index)960 IndexBlockIter* Block::NewIndexIterator(
961 const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter,
962 Statistics* /*stats*/, bool total_order_seek, bool have_first_key,
963 bool key_includes_seq, bool value_is_full, bool block_contents_pinned,
964 BlockPrefixIndex* prefix_index) {
965 IndexBlockIter* ret_iter;
966 if (iter != nullptr) {
967 ret_iter = iter;
968 } else {
969 ret_iter = new IndexBlockIter;
970 }
971 if (size_ < 2 * sizeof(uint32_t)) {
972 ret_iter->Invalidate(Status::Corruption("bad block contents"));
973 return ret_iter;
974 }
975 if (num_restarts_ == 0) {
976 // Empty block.
977 ret_iter->Invalidate(Status::OK());
978 return ret_iter;
979 } else {
980 BlockPrefixIndex* prefix_index_ptr =
981 total_order_seek ? nullptr : prefix_index;
982 ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
983 global_seqno_, prefix_index_ptr, have_first_key,
984 key_includes_seq, value_is_full,
985 block_contents_pinned);
986 }
987
988 return ret_iter;
989 }
990
ApproximateMemoryUsage() const991 size_t Block::ApproximateMemoryUsage() const {
992 size_t usage = usable_size();
993 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
994 usage += malloc_usable_size((void*)this);
995 #else
996 usage += sizeof(*this);
997 #endif // ROCKSDB_MALLOC_USABLE_SIZE
998 if (read_amp_bitmap_) {
999 usage += read_amp_bitmap_->ApproximateMemoryUsage();
1000 }
1001 return usage;
1002 }
1003
1004 } // namespace ROCKSDB_NAMESPACE
1005