//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block_based/block.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_footer.h"
#include "table/format.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively.  Will not dereference past "limit".
//
// If any errors are detected, returns nullptr.  Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
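//
// An illustrative sketch (hypothetical keys, assuming the layout written by
// block_builder.cc): an entry for key "apple" that follows key "apply" and
// carries value "v1" could be encoded as
//
//   varint32 shared       = 4    ("appl" is shared with the previous key)
//   varint32 non_shared   = 1    ("e")
//   varint32 value_length = 2    ("v1")
//   char     key_delta[1] = "e"
//   char     value[2]     = "v1"
//
// DecodeEntry()(p, limit, &shared, &non_shared, &value_length) would then
// return a pointer to the key delta "e". Since all three sizes are < 128,
// each varint is a single byte and the fast path below applies.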
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};

// Helper routine: similar to DecodeEntry but does not have assertions.
// Instead, returns nullptr so that caller can detect and report failure.
struct CheckAndDecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
      return nullptr;
    }
    return p;
  }
};

struct DecodeKey {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    uint32_t value_length;
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
  }
};

// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
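//
// An illustrative sketch (hypothetical keys): a format_version 4 index entry
// for key "k2" that follows key "k1" could be encoded as
//
//   varint32 shared       = 1    ("k")
//   varint32 non_shared   = 1    ("2")
//   char     key_delta[1] = "2"
//   ...                         delta-encoded block handle follows; see
//                               DecodeCurrentValue() below
//
// DecodeKeyV4()(p, limit, &shared, &non_shared) would then return a pointer
// to the key delta "2", leaving the value to be decoded separately.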
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) return nullptr;
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both values are encoded in one byte each
      p += 2;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    }
    return p;
  }
};

void DataBlockIter::NextImpl() { ParseNextDataKey<DecodeEntry>(); }

void DataBlockIter::NextOrReportImpl() {
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }

void IndexBlockIter::PrevImpl() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  // Loop until end of current entry hits the start of original entry
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
  }
}

// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
void DataBlockIter::PrevImpl() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    bool raw_key_cached;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      raw_key_cached = false;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      raw_key_cached = true;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
    // not necessity. It is convenient since this class treats keys as pinned
    // when `raw_key_` points to an outside buffer. So we cannot allow
    // `raw_key_` to point into the Prev cache as it is a transient outside
    // buffer (i.e., keys in it are not actually pinned).
    raw_key_.SetKey(current_key, raw_key_cached /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = raw_key_.GetKey();

    if (raw_key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

void DataBlockIter::SeekImpl(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}

// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex, this function behaves identically to Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a greater user_key than seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_keys are less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
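//
// An illustrative walk-through (hypothetical keys): if the block holds
// { bar@90, foo@100, foo@80 }, then
// - SeekForGetImpl(foo@95) positions the iterator at foo@80 (same user_key,
//   seqno no greater than 95) and returns true;
// - SeekForGetImpl(baz@95) stops at a key whose user_key ("foo") differs from
//   "baz", so it returns false: the key is neither in this block nor, since
//   the block already has a larger user_key, in the next one;
// - SeekForGetImpl(zoo@95) runs off the end of the block and returns true,
//   because the result may still exist in the next block.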
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice target_user_key = ExtractUserKey(target);
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry =
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back
    SeekImpl(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:    [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumption about a boundary key)
    // Block N+1:  [axy@10, ...   ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or at the end of the block if no key is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundary.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seek a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) ||
        CompareCurrentKey(target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reached the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    The matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (ucmp().Compare(raw_key_.GetUserKey(), target_user_key) != 0) {
    // the key is not in this block and cannot be at the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    SeekImpl(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}

void IndexBlockIter::SeekImpl(const Slice& target) {
  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  status_ = Status::OK();
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = false;
  if (prefix_index_) {
    bool prefix_may_exist = true;
    ok = PrefixSeek(target, &index, &prefix_may_exist);
    if (!prefix_may_exist) {
      // This is to let the caller distinguish between a non-existing prefix,
      // and the key being larger than the last key, which both set Valid() to
      // false.
      current_ = restarts_;
      status_ = Status::NotFound();
    }
    // restart interval must be one when hash search is enabled so the binary
    // search simply lands at the right place.
    skip_linear_scan = true;
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  }

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}

void DataBlockIter::SeekForPrevImpl(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);

  if (!Valid()) {
    SeekToLastImpl();
  } else {
    while (Valid() && CompareCurrentKey(seek_key) > 0) {
      PrevImpl();
    }
  }
}

void DataBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::SeekToFirstOrReportImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(0);
  ParseNextIndexKey();
}

void DataBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void IndexBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

template <class TValue>
void BlockIter<TValue>::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  raw_key_.Clear();
  value_.clear();
}

template <typename DecodeEntryFunc>
bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  if (!limit) {
    limit = data_ + restarts_;  // Restarts come right after data
  }

  if (p >= limit) {
    // No more entries to return.  Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || raw_key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with prev key then we don't need
      // to decode it and can use its address in the block directly.
      raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
    } else {
      // This key shares `shared` bytes with prev key, we need to decode it
      raw_key_.TrimAppend(shared, p, non_shared);
    }

#ifndef NDEBUG
    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion,
      // kTypeDeletionWithTimestamp, or kTypeRangeDeletion.
      uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
      SequenceNumber seqno;
      ValueType value_type;
      UnPackSequenceAndType(packed, &seqno, &value_type);
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeDeletionWithTimestamp ||
             value_type == ValueType::kTypeRangeDeletion);
      assert(seqno == 0);
    }
#endif  // NDEBUG

    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}

bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return.  Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  if (p == nullptr || raw_key_.Size() < shared) {
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key doesn't share any bytes with prev key then we don't need
    // to decode it and can use its address in the block directly.
    raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
  } else {
    // This key shares `shared` bytes with prev key, we need to decode it
    raw_key_.TrimAppend(shared, p, non_shared);
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
    DecodeCurrentValue(shared);
  }
  return true;
}

// The format:
// restart_point   0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point   1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where k is key, v is value, and their encodings are in parentheses.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
// Otherwise the value is delta-size = block handle size - size of last block
// handle.
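//
// An illustrative example (hypothetical numbers; the exact rules live in
// IndexValue::DecodeFrom()): if a restart entry stores the full handle
// (offset=0, size=4096) and the next block is 4100 bytes, the following entry
// stores only the signed varint delta 4100 - 4096 = +4; the decoder derives
// the new offset from the previous handle (previous offset + previous size +
// block trailer) and then applies the size delta.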
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  Slice v(value_.data(), data_ + restarts_ - value_.data());
  // Delta encoding is used if `shared` != 0.
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
      &v, have_first_key_,
      (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
  assert(decode_s.ok());
  value_ = Slice(value_.data(), v.data() - value_.data());

  if (global_seqno_state_ != nullptr) {
    // Overwrite sequence number the same way as in DataBlockIter.

    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
                                      /* copy */ true);

    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);

    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
    assert(value_type == ValueType::kTypeValue ||
           value_type == ValueType::kTypeMerge ||
           value_type == ValueType::kTypeDeletion ||
           value_type == ValueType::kTypeRangeDeletion);

    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
                                         value_type);
    decoded_value_.first_internal_key = first_internal_key.GetKey();
  }
}

template <class TValue>
void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
                                               uint32_t index,
                                               bool skip_linear_scan) {
  // SeekToRestartPoint() only does the lookup in the restart block. We need
  // to follow it up with NextImpl() to position the iterator at the restart
  // key.
  SeekToRestartPoint(index);
  NextImpl();

  if (!skip_linear_scan) {
    // Linear search (within restart block) for first key >= target
    uint32_t max_offset;
    if (index + 1 < num_restarts_) {
      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
      // the next restart key is strictly greater than `target`, we can
      // terminate upon reaching it without any additional key comparison.
      max_offset = GetRestartPoint(index + 1);
    } else {
      // We are in the last restart interval. The while-loop will terminate by
      // `Valid()` returning false upon advancing past the block's last key.
      max_offset = port::kMaxUint32;
    }
    while (true) {
      NextImpl();
      if (!Valid()) {
        break;
      }
      if (current_ == max_offset) {
        assert(CompareCurrentKey(target) > 0);
        break;
      } else if (CompareCurrentKey(target) >= 0) {
        break;
      }
    }
  }
}

// Binary searches in restart array to find the starting restart point for the
// linear scan, and stores it in `*index`. Assumes restart array does not
// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
// is strictly greater than `target` or does not exist (this can be used to
// elide a comparison when linear scan reaches all the way to the next restart
// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
// `*index`th restart key is the final result so that key does not need to be
// compared again later.
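//
// An illustrative example (hypothetical keys): with restart keys
// { "b", "d", "f" } and target "e", BinarySeek() stores `*index = 1` (restart
// key "d") and leaves `*skip_linear_scan` false; the guarantee that restart
// key 2 ("f") is strictly greater than "e" lets the follow-up linear scan
// stop there without an extra comparison. With target "d" it stores
// `*index = 1` and sets `*skip_linear_scan` to true, since the restart key
// itself is the answer.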
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
                                   bool* skip_linear_scan) {
  if (restarts_ == 0) {
    // SST files dedicated to range tombstones are written with index blocks
    // that have no keys while also having `num_restarts_ == 1`. This would
    // cause a problem for `BinarySeek()` as it'd try to access the first key
    // which does not exist. We identify such blocks by the offset at which
    // their restarts are stored, and return false to prevent any attempted
    // key accesses.
    return false;
  }

  *skip_linear_scan = false;
  // Loop invariants:
  // - Restart key at index `left` is less than or equal to the target key. The
  //   sentinel index `-1` is considered to have a key that is less than all
  //   keys.
  // - Any restart keys after index `right` are strictly greater than the
  //   target key.
  int64_t left = -1, right = num_restarts_ - 1;
  while (left != right) {
    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
    int64_t mid = left + (right - left + 1) / 2;
    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    raw_key_.SetKey(mid_key, false /* copy */);
    int cmp = CompareCurrentKey(target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      *skip_linear_scan = true;
      left = right = mid;
    }
  }

  if (left == -1) {
    // All keys in the block were strictly greater than `target`. So the very
    // first key in the block is the final seek result.
    *skip_linear_scan = true;
    *index = 0;
  } else {
    *index = static_cast<uint32_t>(left);
  }
  return true;
}

// Compare target key and the block key of the block of `block_index`.
// Returns 1 on error, in which case the target is treated as smaller.
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared;
  const char* key_ptr =
      value_delta_encoded_
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
                          &non_shared)
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
                        &non_shared);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  raw_key_.SetKey(block_key, false /* copy */);
  return CompareCurrentKey(target);
}

// Binary search in block_ids to find the first block
// with a key >= target
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index,
                                          bool* prefix_may_exist) {
  assert(left <= right);
  assert(index);
  assert(prefix_may_exist);
  *prefix_may_exist = true;
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "mid" is >= "target". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between block of `left` and `left-1`.
    // we can further distinguish the case of key in the block or key not
    // existing, by comparing the target key and the key of the previous
    // block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      *prefix_may_exist = false;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);

    // If the next block key is larger than seek key, it is possible that
    // no key shares the prefix with `target`, or all keys with the same
    // prefix as `target` are smaller than `target`. In the latter case,
    // we are mandated to set the position the same as a total order seek.
    // Then either:
    // (1) `target` falls into the range of the next block. In this case,
    //     we can place the iterator at the next block, or
    // (2) `target` is larger than all block keys. In this case we can
    //     keep the iterator invalidated without setting `prefix_may_exist`
    //     to false.
    // We might sometimes end up setting the total order position
    // while there is no key sharing the prefix with `target`, but it
    // still follows the contract.
    uint32_t right_index = block_ids[right];
    assert(right_index + 1 <= num_restarts_);
    if (right_index + 1 < num_restarts_) {
      if (CompareBlockKey(right_index + 1, target) >= 0) {
        *index = right_index + 1;
        return true;
      } else {
        // We have to set the flag here because we are not positioning
        // the iterator to the total order position.
        *prefix_may_exist = false;
      }
    }

    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
                                bool* prefix_may_exist) {
  assert(index);
  assert(prefix_may_exist);
  assert(prefix_index_);
  *prefix_may_exist = true;
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    *prefix_may_exist = false;
    return false;
  } else {
    assert(block_ids);
    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
                                prefix_may_exist);
  }
}

uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // In BlockBuilder, we have ensured a block with HashIndex is less than
    // kMaxBlockSizeSupportedByHashIndex (64KiB).
    //
    // Therefore, if we encounter a block with a size > 64KiB, the block
    // cannot have HashIndex. So the footer will be directly interpreted as
    // num_restarts.
    //
    // Such a check is for backward compatibility. It ensures that a legacy
    // block with a very large num_restarts, i.e. >= 0x80000000, can be
    // interpreted correctly as having no HashIndex even if the MSB of
    // num_restarts is set.
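    //
    // Illustrative footer layout (as unpacked by UnPackIndexTypeAndNumRestarts
    // below; assumed from the 0x80000000 threshold above): the MSB of the
    // 32-bit footer flags kDataBlockBinaryAndHash, and the low 31 bits hold
    // num_restarts. For example, on a small block a footer of 0x80000010
    // would mean a hash index with 16 restarts, while 0x00000010 means plain
    // binary search with 16 restarts.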
    return num_restarts;
  }
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return num_restarts;
}

BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // The check is for the same reason as that in NumRestarts()
    return BlockBasedTableOptions::kDataBlockBinarySearch;
  }
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return index_type;
}

Block::~Block() {
  // This sync point can be re-enabled if RocksDB can control the
  // initialization order of any/all static options created by the user.
  // TEST_SYNC_POINT("Block::~Block");
}

Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
             Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      num_restarts_(0) {
  TEST_SYNC_POINT("Block::Block:0");
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    // Should only decode restart points for uncompressed blocks
    num_restarts_ = NumRestarts();
    switch (IndexType()) {
      case BlockBasedTableOptions::kDataBlockBinarySearch:
        restart_offset_ = static_cast<uint32_t>(size_) -
                          (1 + num_restarts_) * sizeof(uint32_t);
        if (restart_offset_ > size_ - sizeof(uint32_t)) {
          // The size is too small for NumRestarts() and therefore
          // restart_offset_ wrapped around.
          size_ = 0;
        }
        break;
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
        if (size_ < sizeof(uint32_t) /* block footer */ +
                        sizeof(uint16_t) /* NUM_BUCK */) {
          size_ = 0;
          break;
        }

        uint16_t map_offset;
        data_block_hash_index_.Initialize(
            contents.data.data(),
            static_cast<uint16_t>(contents.data.size() -
                                  sizeof(uint32_t)), /* chop off NUM_RESTARTS */
            &map_offset);

        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);

        if (restart_offset_ > map_offset) {
          // map_offset is too small for NumRestarts() and
          // therefore restart_offset_ wrapped around.
          size_ = 0;
          break;
        }
        break;
      default:
        size_ = 0;  // Error marker
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}

DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
                                      SequenceNumber global_seqno,
                                      DataBlockIter* iter, Statistics* stats,
                                      bool block_contents_pinned) {
  DataBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new DataBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    ret_iter->Initialize(
        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
        read_amp_bitmap_.get(), block_contents_pinned,
        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return ret_iter;
}

IndexBlockIter* Block::NewIndexIterator(
    const Comparator* raw_ucmp, SequenceNumber global_seqno,
    IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
    bool have_first_key, bool key_includes_seq, bool value_is_full,
    bool block_contents_pinned, BlockPrefixIndex* prefix_index) {
  IndexBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new IndexBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index;
    ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_,
                         global_seqno, prefix_index_ptr, have_first_key,
                         key_includes_seq, value_is_full,
                         block_contents_pinned);
  }

  return ret_iter;
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  if (read_amp_bitmap_) {
    usage += read_amp_bitmap_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace ROCKSDB_NAMESPACE