1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #ifndef ROCKSDB_LITE
11 #include "table/cuckoo/cuckoo_table_reader.h"
12 
13 #include <algorithm>
14 #include <limits>
15 #include <string>
16 #include <utility>
17 #include <vector>
18 #include "memory/arena.h"
19 #include "rocksdb/iterator.h"
20 #include "rocksdb/table.h"
21 #include "table/cuckoo/cuckoo_table_factory.h"
22 #include "table/get_context.h"
23 #include "table/internal_iterator.h"
24 #include "table/meta_blocks.h"
25 #include "util/coding.h"
26 
27 namespace ROCKSDB_NAMESPACE {
28 namespace {
29 const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
30 const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
31 }
32 
33 extern const uint64_t kCuckooTableMagicNumber;
34 
CuckooTableReader(const ImmutableCFOptions & ioptions,std::unique_ptr<RandomAccessFileReader> && file,uint64_t file_size,const Comparator * comparator,uint64_t (* get_slice_hash)(const Slice &,uint32_t,uint64_t))35 CuckooTableReader::CuckooTableReader(
36     const ImmutableCFOptions& ioptions,
37     std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
38     const Comparator* comparator,
39     uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
40     : file_(std::move(file)),
41       is_last_level_(false),
42       identity_as_first_hash_(false),
43       use_module_hash_(false),
44       num_hash_func_(0),
45       unused_key_(""),
46       key_length_(0),
47       user_key_length_(0),
48       value_length_(0),
49       bucket_length_(0),
50       cuckoo_block_size_(0),
51       cuckoo_block_bytes_minus_one_(0),
52       table_size_(0),
53       ucomp_(comparator),
54       get_slice_hash_(get_slice_hash) {
55   if (!ioptions.allow_mmap_reads) {
56     status_ = Status::InvalidArgument("File is not mmaped");
57   }
58   TableProperties* props = nullptr;
59   status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber,
60       ioptions, &props, true /* compression_type_missing */);
61   if (!status_.ok()) {
62     return;
63   }
64   table_props_.reset(props);
65   auto& user_props = props->user_collected_properties;
66   auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
67   if (hash_funs == user_props.end()) {
68     status_ = Status::Corruption("Number of hash functions not found");
69     return;
70   }
71   num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
72   auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
73   if (unused_key == user_props.end()) {
74     status_ = Status::Corruption("Empty bucket value not found");
75     return;
76   }
77   unused_key_ = unused_key->second;
78 
79   key_length_ = static_cast<uint32_t>(props->fixed_key_len);
80   auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
81   if (user_key_len == user_props.end()) {
82     status_ = Status::Corruption("User key length not found");
83     return;
84   }
85   user_key_length_ = *reinterpret_cast<const uint32_t*>(
86       user_key_len->second.data());
87 
88   auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
89   if (value_length == user_props.end()) {
90     status_ = Status::Corruption("Value length not found");
91     return;
92   }
93   value_length_ = *reinterpret_cast<const uint32_t*>(
94       value_length->second.data());
95   bucket_length_ = key_length_ + value_length_;
96 
97   auto hash_table_size = user_props.find(
98       CuckooTablePropertyNames::kHashTableSize);
99   if (hash_table_size == user_props.end()) {
100     status_ = Status::Corruption("Hash table size not found");
101     return;
102   }
103   table_size_ = *reinterpret_cast<const uint64_t*>(
104       hash_table_size->second.data());
105 
106   auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
107   if (is_last_level == user_props.end()) {
108     status_ = Status::Corruption("Is last level not found");
109     return;
110   }
111   is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
112 
113   auto identity_as_first_hash = user_props.find(
114       CuckooTablePropertyNames::kIdentityAsFirstHash);
115   if (identity_as_first_hash == user_props.end()) {
116     status_ = Status::Corruption("identity as first hash not found");
117     return;
118   }
119   identity_as_first_hash_ = *reinterpret_cast<const bool*>(
120       identity_as_first_hash->second.data());
121 
122   auto use_module_hash = user_props.find(
123       CuckooTablePropertyNames::kUseModuleHash);
124   if (use_module_hash == user_props.end()) {
125     status_ = Status::Corruption("hash type is not found");
126     return;
127   }
128   use_module_hash_ = *reinterpret_cast<const bool*>(
129       use_module_hash->second.data());
130   auto cuckoo_block_size = user_props.find(
131       CuckooTablePropertyNames::kCuckooBlockSize);
132   if (cuckoo_block_size == user_props.end()) {
133     status_ = Status::Corruption("Cuckoo block size not found");
134     return;
135   }
136   cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
137       cuckoo_block_size->second.data());
138   cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
139   status_ = file_->Read(0, static_cast<size_t>(file_size), &file_data_, nullptr);
140 }
141 
Get(const ReadOptions &,const Slice & key,GetContext * get_context,const SliceTransform *,bool)142 Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
143                               const Slice& key, GetContext* get_context,
144                               const SliceTransform* /* prefix_extractor */,
145                               bool /*skip_filters*/) {
146   assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
147   Slice user_key = ExtractUserKey(key);
148   for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
149     uint64_t offset = bucket_length_ * CuckooHash(
150         user_key, hash_cnt, use_module_hash_, table_size_,
151         identity_as_first_hash_, get_slice_hash_);
152     const char* bucket = &file_data_.data()[offset];
153     for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
154          ++block_idx, bucket += bucket_length_) {
155       if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
156                         Slice(bucket, user_key.size()))) {
157         return Status::OK();
158       }
159       // Here, we compare only the user key part as we support only one entry
160       // per user key and we don't support snapshot.
161       if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
162         Slice value(bucket + key_length_, value_length_);
163         if (is_last_level_) {
164           // Sequence number is not stored at the last level, so we will use
165           // kMaxSequenceNumber since it is unknown.  This could cause some
166           // transactions to fail to lock a key due to known sequence number.
167           // However, it is expected for anyone to use a CuckooTable in a
168           // TransactionDB.
169           get_context->SaveValue(value, kMaxSequenceNumber);
170         } else {
171           Slice full_key(bucket, key_length_);
172           ParsedInternalKey found_ikey;
173           ParseInternalKey(full_key, &found_ikey);
174           bool dont_care __attribute__((__unused__));
175           get_context->SaveValue(found_ikey, value, &dont_care);
176         }
177         // We don't support merge operations. So, we return here.
178         return Status::OK();
179       }
180     }
181   }
182   return Status::OK();
183 }
184 
Prepare(const Slice & key)185 void CuckooTableReader::Prepare(const Slice& key) {
186   // Prefetch the first Cuckoo Block.
187   Slice user_key = ExtractUserKey(key);
188   uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
189     bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
190                                 identity_as_first_hash_, nullptr);
191   uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
192   for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
193     PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
194   }
195 }
196 
197 class CuckooTableIterator : public InternalIterator {
198  public:
199   explicit CuckooTableIterator(CuckooTableReader* reader);
200   // No copying allowed
201   CuckooTableIterator(const CuckooTableIterator&) = delete;
202   void operator=(const Iterator&) = delete;
~CuckooTableIterator()203   ~CuckooTableIterator() override {}
204   bool Valid() const override;
205   void SeekToFirst() override;
206   void SeekToLast() override;
207   void Seek(const Slice& target) override;
208   void SeekForPrev(const Slice& target) override;
209   void Next() override;
210   void Prev() override;
211   Slice key() const override;
212   Slice value() const override;
status() const213   Status status() const override { return Status::OK(); }
214   void InitIfNeeded();
215 
216  private:
217   struct BucketComparator {
BucketComparatorROCKSDB_NAMESPACE::CuckooTableIterator::BucketComparator218     BucketComparator(const Slice& file_data, const Comparator* ucomp,
219                      uint32_t bucket_len, uint32_t user_key_len,
220                      const Slice& target = Slice())
221       : file_data_(file_data),
222         ucomp_(ucomp),
223         bucket_len_(bucket_len),
224         user_key_len_(user_key_len),
225         target_(target) {}
operator ()ROCKSDB_NAMESPACE::CuckooTableIterator::BucketComparator226     bool operator()(const uint32_t first, const uint32_t second) const {
227       const char* first_bucket =
228         (first == kInvalidIndex) ? target_.data() :
229                                    &file_data_.data()[first * bucket_len_];
230       const char* second_bucket =
231         (second == kInvalidIndex) ? target_.data() :
232                                     &file_data_.data()[second * bucket_len_];
233       return ucomp_->Compare(Slice(first_bucket, user_key_len_),
234                              Slice(second_bucket, user_key_len_)) < 0;
235     }
236    private:
237     const Slice file_data_;
238     const Comparator* ucomp_;
239     const uint32_t bucket_len_;
240     const uint32_t user_key_len_;
241     const Slice target_;
242   };
243 
244   const BucketComparator bucket_comparator_;
245   void PrepareKVAtCurrIdx();
246   CuckooTableReader* reader_;
247   bool initialized_;
248   // Contains a map of keys to bucket_id sorted in key order.
249   std::vector<uint32_t> sorted_bucket_ids_;
250   // We assume that the number of items can be stored in uint32 (4 Billion).
251   uint32_t curr_key_idx_;
252   Slice curr_value_;
253   IterKey curr_key_;
254 };
255 
CuckooTableIterator(CuckooTableReader * reader)256 CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
257   : bucket_comparator_(reader->file_data_, reader->ucomp_,
258                        reader->bucket_length_, reader->user_key_length_),
259     reader_(reader),
260     initialized_(false),
261     curr_key_idx_(kInvalidIndex) {
262   sorted_bucket_ids_.clear();
263   curr_value_.clear();
264   curr_key_.Clear();
265 }
266 
InitIfNeeded()267 void CuckooTableIterator::InitIfNeeded() {
268   if (initialized_) {
269     return;
270   }
271   sorted_bucket_ids_.reserve(static_cast<size_t>(reader_->GetTableProperties()->num_entries));
272   uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
273   assert(num_buckets < kInvalidIndex);
274   const char* bucket = reader_->file_data_.data();
275   for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
276     if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
277       sorted_bucket_ids_.push_back(bucket_id);
278     }
279     bucket += reader_->bucket_length_;
280   }
281   assert(sorted_bucket_ids_.size() ==
282       reader_->GetTableProperties()->num_entries);
283   std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
284             bucket_comparator_);
285   curr_key_idx_ = kInvalidIndex;
286   initialized_ = true;
287 }
288 
SeekToFirst()289 void CuckooTableIterator::SeekToFirst() {
290   InitIfNeeded();
291   curr_key_idx_ = 0;
292   PrepareKVAtCurrIdx();
293 }
294 
SeekToLast()295 void CuckooTableIterator::SeekToLast() {
296   InitIfNeeded();
297   curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
298   PrepareKVAtCurrIdx();
299 }
300 
Seek(const Slice & target)301 void CuckooTableIterator::Seek(const Slice& target) {
302   InitIfNeeded();
303   const BucketComparator seek_comparator(
304       reader_->file_data_, reader_->ucomp_,
305       reader_->bucket_length_, reader_->user_key_length_,
306       ExtractUserKey(target));
307   auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(),
308       sorted_bucket_ids_.end(),
309       kInvalidIndex,
310       seek_comparator);
311   curr_key_idx_ =
312       static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
313   PrepareKVAtCurrIdx();
314 }
315 
SeekForPrev(const Slice &)316 void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
317   // Not supported
318   assert(false);
319 }
320 
Valid() const321 bool CuckooTableIterator::Valid() const {
322   return curr_key_idx_ < sorted_bucket_ids_.size();
323 }
324 
PrepareKVAtCurrIdx()325 void CuckooTableIterator::PrepareKVAtCurrIdx() {
326   if (!Valid()) {
327     curr_value_.clear();
328     curr_key_.Clear();
329     return;
330   }
331   uint32_t id = sorted_bucket_ids_[curr_key_idx_];
332   const char* offset = reader_->file_data_.data() +
333                        id * reader_->bucket_length_;
334   if (reader_->is_last_level_) {
335     // Always return internal key.
336     curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_),
337                              0, kTypeValue);
338   } else {
339     curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
340   }
341   curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
342 }
343 
Next()344 void CuckooTableIterator::Next() {
345   if (!Valid()) {
346     curr_value_.clear();
347     curr_key_.Clear();
348     return;
349   }
350   ++curr_key_idx_;
351   PrepareKVAtCurrIdx();
352 }
353 
Prev()354 void CuckooTableIterator::Prev() {
355   if (curr_key_idx_ == 0) {
356     curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
357   }
358   if (!Valid()) {
359     curr_value_.clear();
360     curr_key_.Clear();
361     return;
362   }
363   --curr_key_idx_;
364   PrepareKVAtCurrIdx();
365 }
366 
key() const367 Slice CuckooTableIterator::key() const {
368   assert(Valid());
369   return curr_key_.GetInternalKey();
370 }
371 
value() const372 Slice CuckooTableIterator::value() const {
373   assert(Valid());
374   return curr_value_;
375 }
376 
NewIterator(const ReadOptions &,const SliceTransform *,Arena * arena,bool,TableReaderCaller,size_t)377 InternalIterator* CuckooTableReader::NewIterator(
378     const ReadOptions& /*read_options*/,
379     const SliceTransform* /* prefix_extractor */, Arena* arena,
380     bool /*skip_filters*/, TableReaderCaller /*caller*/,
381     size_t /*compaction_readahead_size*/) {
382   if (!status().ok()) {
383     return NewErrorInternalIterator<Slice>(
384         Status::Corruption("CuckooTableReader status is not okay."), arena);
385   }
386   CuckooTableIterator* iter;
387   if (arena == nullptr) {
388     iter = new CuckooTableIterator(this);
389   } else {
390     auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
391     iter = new (iter_mem) CuckooTableIterator(this);
392   }
393   return iter;
394 }
395 
ApproximateMemoryUsage() const396 size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
397 
398 }  // namespace ROCKSDB_NAMESPACE
399 #endif
400