//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block_based/block.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "logging/logging.h"
#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_footer.h"
#include "table/format.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively.  Will not dereference past "limit".
//
// If any errors are detected, returns nullptr.  Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};
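
// Example (illustrative, not from a real block): with previous key "apple",
// an entry for key "apricot" with value "v1" is encoded as
//   shared = 2 ("ap"), non_shared = 5 ("ricot"), value_length = 2
//   bytes:  02 05 02 'r' 'i' 'c' 'o' 't' 'v' '1'
// All three lengths are < 128, so DecodeEntry takes the one-byte fast path
// and returns a pointer to 'r', the start of the key delta.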

// Helper routine: similar to DecodeEntry but does not have assertions.
// Instead, returns nullptr so that caller can detect and report failure.
struct CheckAndDecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
      return nullptr;
    }
    return p;
  }
};

struct DecodeKey {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    uint32_t value_length;
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
  }
};

// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) return nullptr;
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both values are encoded in one byte each
      p += 2;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    }
    return p;
  }
};
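
// Example (illustrative): with previous key "k01" and current key "k02", a
// format_version 4 index entry stores only
//   shared = 2 ("k0"), non_shared = 1 ("2")
//   bytes:  02 01 '2'
// followed directly by the delta-encoded block handle. No value_length byte
// is stored, which is why the fast path above advances p by 2 rather than 3.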

void DataBlockIter::Next() {
  assert(Valid());
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::NextOrReport() {
  assert(Valid());
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::Next() {
  assert(Valid());
  ParseNextIndexKey();
}

void IndexBlockIter::Prev() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  // Loop until end of current entry hits the start of original entry
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
  }
}
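
// Example (illustrative): with restart points at offsets {0, 40, 85} and
// current_ == 60, Prev() walks restart_index_ back until
// GetRestartPoint(restart_index_) < 60, i.e. to offset 40, and then
// re-parses entries forward from there until the next entry would start at
// offset 60, leaving the iterator on the entry preceding the original one.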

// Similar to IndexBlockIter::Prev but also caches the prev entries
void DataBlockIter::Prev() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      key_pinned_ = true;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      key_pinned_ = false;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    key_.SetKey(current_key, false /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = key();

    if (key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}
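
// Example (illustrative): the first Prev() inside a restart interval fills
// prev_entries_ with every entry from the restart point up to the original
// position. Each following consecutive Prev() then only decrements
// prev_entries_idx_ and restores the cached offset/key/value, so the
// interval is parsed once rather than once per backward step.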

void DataBlockIter::Seek(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                                  comparator_);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);

  // Linear search (within restart block) for first key >= target
  while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
  }
}

// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex, this function behaves identically to Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a greater user_key than seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key_ with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_keys are less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice target_user_key = ExtractUserKey(target);
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry =
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back
    Seek(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:    [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumption about a boundary key)
    // Block N+1:  [axy@10, ...   ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or at the end of the block if none is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundary.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seek a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) {
      // We stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reached the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) {
    // the key is not in this block and cannot be at the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(key_.GetKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    Seek(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}
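
// Example (illustrative): for target user key "foo" at seqno 60 with
// kTypeValue, the hash index maps "foo" to one restart interval. The linear
// scan in SeekForGetImpl stops at the first entry >= target in internal key
// order: either a larger user key (then return false if it differs from
// "foo"), or "foo" with seqno <= 60, which is the version a Get should see.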

void IndexBlockIter::Seek(const Slice& target) {
  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
  Slice seek_key = target;
  if (!key_includes_seq_) {
    seek_key = ExtractUserKey(target);
  }
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  uint32_t index = 0;
  bool ok = false;
  if (prefix_index_) {
    bool prefix_may_exist = true;
    ok = PrefixSeek(target, &index, &prefix_may_exist);
    if (!prefix_may_exist) {
      // This is to let the caller distinguish between a non-existing prefix
      // and a key that is larger than the last key, both of which set
      // Valid() to false.
      current_ = restarts_;
      status_ = Status::NotFound();
    }
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
                                 comparator_);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                               comparator_);
  }

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);

  // Linear search (within restart block) for first key >= target
  while (ParseNextIndexKey() && Compare(key_, seek_key) < 0) {
  }
}

void DataBlockIter::SeekForPrev(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                                  comparator_);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);

  // Linear search (within restart block) for first key >= seek_key
  while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
  }
  if (!Valid()) {
    SeekToLast();
  } else {
    while (Valid() && Compare(key_, seek_key) > 0) {
      Prev();
    }
  }
}
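
// Example (illustrative): for a block containing user keys {a, c, e},
// SeekForPrev("d") first lands on "e" (the first key >= "d"), then steps
// Prev() back to "c". If the target is past the last key, the forward scan
// invalidates the iterator and SeekToLast() positions it on "e".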

void DataBlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::SeekToFirstOrReport() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(0);
  ParseNextIndexKey();
}

void DataBlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void IndexBlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

template <class TValue>
void BlockIter<TValue>::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  key_.Clear();
  value_.clear();
}

template <typename DecodeEntryFunc>
bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  if (!limit) {
    limit = data_ + restarts_;  // Restarts come right after data
  }

  if (p >= limit) {
    // No more entries to return.  Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with the previous key, we don't
      // need to decode it; we can use its address in the block directly.
      key_.SetKey(Slice(p, non_shared), false /* copy */);
      key_pinned_ = true;
    } else {
      // This key shares `shared` bytes with the previous key; we need to
      // decode it.
      key_.TrimAppend(shared, p, non_shared);
      key_pinned_ = false;
    }

    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
      assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);

      ValueType value_type = ExtractValueType(key_.GetKey());
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeRangeDeletion);

      if (key_pinned_) {
        // TODO(tec): Investigate updating the seqno in the loaded block
        // directly instead of doing a copy and update.

        // We cannot use the key address in the block directly because
        // we have a global_seqno_ that will overwrite the encoded one.
        key_.OwnKey();
        key_pinned_ = false;
      }

      key_.UpdateInternalKey(global_seqno_, value_type);
    }

    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}

bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return.  Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key doesn't share any bytes with the previous key, we don't
    // need to decode it; we can use its address in the block directly.
    key_.SetKey(Slice(p, non_shared), false /* copy */);
    key_pinned_ = true;
  } else {
    // This key shares `shared` bytes with the previous key; we need to
    // decode it.
    key_.TrimAppend(shared, p, non_shared);
    key_pinned_ = false;
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
    DecodeCurrentValue(shared);
  }
  return true;
}

// The format:
// restart_point   0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point   1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where, k is key, v is value, and its encoding is in parenthesis.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
// Otherwise the format is delta-size = block handle size - size of last block
// handle.
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  Slice v(value_.data(), data_ + restarts_ - value_.data());
  // Delta encoding is used if `shared` != 0.
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
      &v, have_first_key_,
      (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
  assert(decode_s.ok());
  value_ = Slice(value_.data(), v.data() - value_.data());

  if (global_seqno_state_ != nullptr) {
    // Overwrite sequence number the same way as in DataBlockIter.

    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
                                      /* copy */ true);

    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);

    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
    assert(value_type == ValueType::kTypeValue ||
           value_type == ValueType::kTypeMerge ||
           value_type == ValueType::kTypeDeletion ||
           value_type == ValueType::kTypeRangeDeletion);

    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
                                         value_type);
    decoded_value_.first_internal_key = first_internal_key.GetKey();
  }
}
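
// Example (illustrative): with value delta encoding, only the first handle
// in a restart interval stores (offset, size) in full. Because data blocks
// are laid out back to back in the file, each followed by a fixed-size
// trailer, the offset of a later handle can be reconstructed from the
// previous one, so each later entry stores just the size delta relative to
// the previous handle.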

// Binary search in the restart array to find the last restart point with a
// key less than target (so the key at the next restart point is larger than
// target), or the first restart point with a key equal to target.
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left,
                                   uint32_t right, uint32_t* index,
                                   const Comparator* comp) {
  assert(left <= right);

  while (left < right) {
    uint32_t mid = (left + right + 1) / 2;
    uint32_t region_offset = GetRestartPoint(mid);
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    int cmp = comp->Compare(mid_key, target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      left = right = mid;
    }
  }

  *index = left;
  return true;
}
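
// Example (illustrative): for restart keys {"apple", "banana", "cherry",
// "grape"} and target "coconut":
//   left=0 right=3: mid=2, "cherry" < "coconut" -> left=2
//   left=2 right=3: mid=3, "grape"  > "coconut" -> right=2
// The loop exits with *index == 2, the last restart point whose key is less
// than the target; the caller then scans forward linearly from there.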

// Compare target key and the block key of the block of `block_index`.
// On error, sets the corruption status and returns 1, i.e. treats the
// target as smaller than the block key.
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared;
  const char* key_ptr =
      value_delta_encoded_
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
                          &non_shared)
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
                        &non_shared);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  return Compare(block_key, target);
}

// Binary search in block_ids to find the first block
// with a key >= target
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index,
                                          bool* prefix_may_exist) {
  assert(left <= right);
  assert(index);
  assert(prefix_may_exist);
  *prefix_may_exist = true;
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "mid" is >= "target". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between block of `left` and `left-1`.
    // we can further distinguish the case of key in the block or key not
    // existing, by comparing the target key and the key of the previous
    // block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      *prefix_may_exist = false;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);

    // If the next block key is larger than the seek key, it is possible that
    // no key shares the prefix with `target`, or that all keys with the same
    // prefix as `target` are smaller than `target`. In the latter case, we
    // are mandated to set the position the same as the total order Seek
    // would. Then either:
    // (1) `target` falls into the range of the next block. In this case,
    //     we can place the iterator to the next block, or
    // (2) `target` is larger than all block keys. In this case we can
    //     keep the iterator invalidated without setting `prefix_may_exist`
    //     to false.
    // We might sometimes end up with setting the total order position
    // while there is no key sharing the prefix as `target`, but it
    // still follows the contract.
    uint32_t right_index = block_ids[right];
    assert(right_index + 1 <= num_restarts_);
    if (right_index + 1 < num_restarts_) {
      if (CompareBlockKey(right_index + 1, target) >= 0) {
        *index = right_index + 1;
        return true;
      } else {
        // We have to set the flag here because we are not positioning
        // the iterator to the total order position.
        *prefix_may_exist = false;
      }
    }

    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
                                bool* prefix_may_exist) {
  assert(index);
  assert(prefix_may_exist);
  assert(prefix_index_);
  *prefix_may_exist = true;
  Slice seek_key = target;
  if (!key_includes_seq_) {
    seek_key = ExtractUserKey(target);
  }
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    *prefix_may_exist = false;
    return false;
  } else {
    assert(block_ids);
    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
                                prefix_may_exist);
  }
}

uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // In BlockBuilder, we have ensured a block with HashIndex is less than
    // kMaxBlockSizeSupportedByHashIndex (64KiB).
    //
    // Therefore, if we encounter a block with a size > 64KiB, the block
    // cannot have HashIndex. So the footer will be directly interpreted as
    // num_restarts.
    //
    // This check is for backward compatibility. It ensures that a legacy
    // block with a very large num_restarts, i.e. >= 0x80000000, is
    // interpreted correctly as having no HashIndex even though the MSB of
    // num_restarts is set.
    return num_restarts;
  }
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return num_restarts;
}
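
// Example (illustrative): the index type is packed into the MSB of the
// 32-bit footer, so a footer of 0x00000003 decodes to binary search with 3
// restarts, while 0x80000003 decodes to binary-and-hash with 3 restarts.
// Blocks larger than kMaxBlockSizeSupportedByHashIndex skip the unpacking
// above because a legacy num_restarts may legitimately have its MSB set.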

BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // The check is for the same reason as that in NumRestarts()
    return BlockBasedTableOptions::kDataBlockBinarySearch;
  }
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return index_type;
}

Block::~Block() {
  // This sync point can be re-enabled if RocksDB can control the
  // initialization order of any/all static options created by the user.
  // TEST_SYNC_POINT("Block::~Block");
}

Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
             size_t read_amp_bytes_per_bit, Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      num_restarts_(0),
      global_seqno_(_global_seqno) {
  TEST_SYNC_POINT("Block::Block:0");
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    // Should only decode restart points for uncompressed blocks
    num_restarts_ = NumRestarts();
    switch (IndexType()) {
      case BlockBasedTableOptions::kDataBlockBinarySearch:
        restart_offset_ = static_cast<uint32_t>(size_) -
                          (1 + num_restarts_) * sizeof(uint32_t);
        if (restart_offset_ > size_ - sizeof(uint32_t)) {
          // The size is too small for NumRestarts() and therefore
          // restart_offset_ wrapped around.
          size_ = 0;
        }
        break;
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
        if (size_ < sizeof(uint32_t) /* block footer */ +
                        sizeof(uint16_t) /* NUM_BUCK */) {
          size_ = 0;
          break;
        }

        uint16_t map_offset;
        data_block_hash_index_.Initialize(
            contents_.data.data(),
            static_cast<uint16_t>(contents_.data.size() -
                                  sizeof(uint32_t)), /* chop off NUM_RESTARTS */
            &map_offset);

        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);

        if (restart_offset_ > map_offset) {
          // map_offset is too small for NumRestarts() and
          // therefore restart_offset_ wrapped around.
          size_ = 0;
          break;
        }
        break;
      default:
        size_ = 0;  // Error marker
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}
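
// Example (illustrative): a binary-search block with size_ == 100 bytes and
// num_restarts_ == 4 is laid out as
//   [entries: bytes 0..79][restart array: 4 * uint32][footer: uint32]
// so restart_offset_ = 100 - (1 + 4) * 4 = 80. The wrap-around check in the
// constructor catches blocks too small to hold the advertised restart array.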

DataBlockIter* Block::NewDataIterator(const Comparator* cmp,
                                      const Comparator* ucmp,
                                      DataBlockIter* iter, Statistics* stats,
                                      bool block_contents_pinned) {
  DataBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new DataBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    ret_iter->Initialize(
        cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_,
        read_amp_bitmap_.get(), block_contents_pinned,
        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return ret_iter;
}

IndexBlockIter* Block::NewIndexIterator(
    const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter,
    Statistics* /*stats*/, bool total_order_seek, bool have_first_key,
    bool key_includes_seq, bool value_is_full, bool block_contents_pinned,
    BlockPrefixIndex* prefix_index) {
  IndexBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new IndexBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index;
    ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
                         global_seqno_, prefix_index_ptr, have_first_key,
                         key_includes_seq, value_is_full,
                         block_contents_pinned);
  }

  return ret_iter;
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  if (read_amp_bitmap_) {
    usage += read_amp_bitmap_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace ROCKSDB_NAMESPACE