//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//

#ifndef ROCKSDB_LITE
#include "memtable/hash_linklist_rep.h"

#include <algorithm>
#include <atomic>
#include "db/memtable.h"
#include "memory/arena.h"
#include "memtable/skiplist.h"
#include "monitoring/histogram.h"
#include "port/port.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "util/hash.h"

namespace ROCKSDB_NAMESPACE {
namespace {

typedef const char* Key;
typedef SkipList<Key, const MemTableRep::KeyComparator&> MemtableSkipList;
typedef std::atomic<void*> Pointer;

// A data structure used as the header of a link list of a hash bucket.
struct BucketHeader {
  Pointer next;
  std::atomic<uint32_t> num_entries;

  explicit BucketHeader(void* n, uint32_t count)
      : next(n), num_entries(count) {}

  bool IsSkipListBucket() {
    return next.load(std::memory_order_relaxed) == this;
  }

  uint32_t GetNumEntries() const {
    return num_entries.load(std::memory_order_relaxed);
  }

  // REQUIRES: called from single-threaded Insert()
  void IncNumEntries() {
    // Only one thread can write at a time, so there is no need for an atomic
    // read-modify-write; a relaxed load and store suffice.
    num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed);
  }
};

// A data structure used as the header of a skip list of a hash bucket.
struct SkipListBucketHeader {
  BucketHeader Counting_header;
  MemtableSkipList skip_list;

  explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp,
                                Allocator* allocator, uint32_t count)
      : Counting_header(this,  // Pointing to itself to indicate header type.
                        count),
        skip_list(cmp, allocator) {}
};

struct Node {
  // Accessors/mutators for links.  Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next() {
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return next_.load(std::memory_order_acquire);
  }
  void SetNext(Node* x) {
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_.store(x, std::memory_order_release);
  }
  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next() {
    return next_.load(std::memory_order_relaxed);
  }

  void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); }

  // A trivial constructor is needed for the placement new in Allocate(),
  // which is fine.
  Node() {}

 private:
  std::atomic<Node*> next_;

  // Prohibit copying: the variable-length `key` array below makes copies
  // meaningless.
  Node(const Node&) = delete;
  Node& operator=(const Node&) = delete;

 public:
  char key[1];
};

// Memory structure of the mem table:
// It is a hash table. Each bucket points to one entry, a linked list, or a
// skip list. To track the total number of records in a bucket, and so decide
// whether it should switch to a skip list, a header is added just to record
// the number of entries in the bucket.
//
//
//          +-----> NULL    Case 1. Empty bucket
//          |
//          |
//          | +---> +-------+
//          | |     | Next  +--> NULL
//          | |     +-------+
//  +-----+ | |     |       |  Case 2. One entry in bucket.
//  |     +-+ |     | Data  |          The next pointer points to
//  +-----+   |     |       |          NULL. In all other cases the
//  |     |   |     |       |          next pointer is not NULL.
//  +-----+   |     +-------+
//  |     +---+
//  +-----+     +-> +-------+  +> +-------+  +-> +-------+
//  |     |     |   | Next  +--+  | Next  +--+   | Next  +-->NULL
//  +-----+     |   +-------+     +-------+      +-------+
//  |     +-----+   | Count |     |       |      |       |
//  +-----+         +-------+     | Data  |      | Data  |
//  |     |                       |       |      |       |
//  +-----+          Case 3.      |       |      |       |
//  |     |          A header     +-------+      +-------+
//  +-----+          points to
//  |     |          a linked list. Count indicates the total
//  +-----+          number of rows in this bucket.
//  |     |
//  +-----+    +-> +-------+ <--+
//  |     |    |   | Next  +----+
//  +-----+    |   +-------+   Case 4. A header points to a skip
//  |     +----+   | Count |           list, and the next pointer points to
//  +-----+        +-------+           itself to distinguish case 3 from 4.
//  |     |        |       |           Count is still kept to indicate the
//  +-----+        | Skip +-->         total number of entries in the bucket,
//  |     |        | List  |   Data    for debugging purposes.
//  |     |        |      +-->
//  +-----+        |       |
//  |     |        +-------+
//  +-----+
//
// There is no data race when changing cases because:
// (1) When changing from case 2 to case 3, we create a new bucket header, put
//     the single node there first without changing the original node, and do
//     a release store when changing the bucket pointer. A reader who sees a
//     stale value of the bucket pointer reads the original node, while a
//     reader who sees the new value reads a fully initialized header, thanks
//     to the release store.
// (2) When changing from case 3 to case 4, a new header is created with the
//     skip list pointing to the data, before doing a release store to change
//     the bucket pointer. The old header and nodes are never changed, so any
//     reader that sees any of those existing pointers is guaranteed to be
//     able to iterate to the end of the linked list.
// (3) The header's next pointer in case 3 might change, but it is never equal
//     to the header itself, so no matter whether a reader sees a stale or a
//     newer value, it can correctly distinguish case 3 from case 4.
//
// The reason we have case 2 is that we want the format to be efficient when
// bucket utilization is relatively low. If we used case 3 for single-entry
// buckets, we would waste 12 bytes for every entry, which can be a
// significant loss of memory utilization.
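//
// As an illustrative sketch only (it mirrors the checks performed by
// GetSkipListBucketHeader() and GetLinkListFirstNode() below, and is not a
// separate API), a reader distinguishes the four cases like this:
//
//   void* p = bucket.load(std::memory_order_acquire);
//   if (p == nullptr) {
//     // Case 1: empty bucket.
//   } else if (static_cast<Pointer*>(p)->load(std::memory_order_relaxed) ==
//              nullptr) {
//     // Case 2: p is a single Node whose next pointer is NULL.
//   } else if (reinterpret_cast<BucketHeader*>(p)->IsSkipListBucket()) {
//     // Case 4: p is the Counting_header of a SkipListBucketHeader.
//   } else {
//     // Case 3: p is a BucketHeader heading a sorted linked list of Nodes.
//   }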
class HashLinkListRep : public MemTableRep {
 public:
  HashLinkListRep(const MemTableRep::KeyComparator& compare,
                  Allocator* allocator, const SliceTransform* transform,
                  size_t bucket_size, uint32_t threshold_use_skiplist,
                  size_t huge_page_tlb_size, Logger* logger,
                  int bucket_entries_logging_threshold,
                  bool if_log_bucket_dist_when_flash);

  KeyHandle Allocate(const size_t len, char** buf) override;

  void Insert(KeyHandle handle) override;

  bool Contains(const char* key) const override;

  size_t ApproximateMemoryUsage() override;

  void Get(const LookupKey& k, void* callback_args,
           bool (*callback_func)(void* arg, const char* entry)) override;

  ~HashLinkListRep() override;

  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;

  MemTableRep::Iterator* GetDynamicPrefixIterator(
      Arena* arena = nullptr) override;

 private:
  friend class DynamicIterator;

  size_t bucket_size_;

  // Maps slices (which are transformed user keys) to buckets of keys sharing
  // the same transform.
  Pointer* buckets_;

  const uint32_t threshold_use_skiplist_;

  // The user-supplied transform whose domain is the user keys.
  const SliceTransform* transform_;

  const MemTableRep::KeyComparator& compare_;

  Logger* logger_;
  int bucket_entries_logging_threshold_;
  bool if_log_bucket_dist_when_flash_;

  bool LinkListContains(Node* head, const Slice& key) const;

  SkipListBucketHeader* GetSkipListBucketHeader(
      Pointer* first_next_pointer) const;

  Node* GetLinkListFirstNode(Pointer* first_next_pointer) const;

  Slice GetPrefix(const Slice& internal_key) const {
    return transform_->Transform(ExtractUserKey(internal_key));
  }

  size_t GetHash(const Slice& slice) const {
    return fastrange64(GetSliceNPHash64(slice), bucket_size_);
  }

  Pointer* GetBucket(size_t i) const {
    return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire));
  }

  Pointer* GetBucket(const Slice& slice) const {
    return GetBucket(GetHash(slice));
  }

  bool Equal(const Slice& a, const Key& b) const {
    return (compare_(b, a) == 0);
  }

  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) < 0);
  }

  bool KeyIsAfterNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) < 0);
  }

  bool KeyIsAfterOrAtNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) <= 0);
  }

  bool KeyIsAfterOrAtNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) <= 0);
  }

  Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;
  Node* FindLessOrEqualInBucket(Node* head, const Slice& key) const;

  class FullListIterator : public MemTableRep::Iterator {
   public:
    explicit FullListIterator(MemtableSkipList* list, Allocator* allocator)
        : iter_(list), full_list_(list), allocator_(allocator) {}

    ~FullListIterator() override {}

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return iter_.Valid(); }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return iter_.key();
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      iter_.Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      assert(Valid());
      iter_.Prev();
    }

    // Advance to the first entry with a key >= target
    void Seek(const Slice& internal_key, const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);
      iter_.Seek(encoded_key);
    }

    // Retreat to the last entry with a key <= target
    void SeekForPrev(const Slice& internal_key,
                     const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);
      iter_.SeekForPrev(encoded_key);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override { iter_.SeekToFirst(); }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override { iter_.SeekToLast(); }

   private:
    MemtableSkipList::Iterator iter_;
    // To destruct with the iterator.
    std::unique_ptr<MemtableSkipList> full_list_;
    std::unique_ptr<Allocator> allocator_;
    std::string tmp_;  // For passing to EncodeKey
  };

  class LinkListIterator : public MemTableRep::Iterator {
   public:
    explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep,
                              Node* head)
        : hash_link_list_rep_(hash_link_list_rep),
          head_(head),
          node_(nullptr) {}

    ~LinkListIterator() override {}

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return node_ != nullptr; }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return node_->key;
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      node_ = node_->Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state.
      Reset(nullptr);
    }

    // Advance to the first entry with a key >= target
    void Seek(const Slice& internal_key,
              const char* /*memtable_key*/) override {
      node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_,
                                                              internal_key);
    }

    // Retreat to the last entry with a key <= target
    void SeekForPrev(const Slice& /*internal_key*/,
                     const char* /*memtable_key*/) override {
      // Since we do not support Prev(),
      // we do not support SeekForPrev() either.
      Reset(nullptr);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state.
      Reset(nullptr);
    }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state.
      Reset(nullptr);
    }

   protected:
    void Reset(Node* head) {
      head_ = head;
      node_ = nullptr;
    }

   private:
    friend class HashLinkListRep;
    const HashLinkListRep* const hash_link_list_rep_;
    Node* head_;
    Node* node_;

    virtual void SeekToHead() {
      node_ = head_;
    }
  };

  class DynamicIterator : public HashLinkListRep::LinkListIterator {
   public:
    explicit DynamicIterator(HashLinkListRep& memtable_rep)
        : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr),
          memtable_rep_(memtable_rep) {}

    // Advance to the first entry with a key >= target
    void Seek(const Slice& k, const char* memtable_key) override {
      auto transformed = memtable_rep_.GetPrefix(k);
      auto* bucket = memtable_rep_.GetBucket(transformed);

      SkipListBucketHeader* skip_list_header =
          memtable_rep_.GetSkipListBucketHeader(bucket);
      if (skip_list_header != nullptr) {
        // The bucket is organized as a skip list
        if (!skip_list_iter_) {
          skip_list_iter_.reset(
              new MemtableSkipList::Iterator(&skip_list_header->skip_list));
        } else {
          skip_list_iter_->SetList(&skip_list_header->skip_list);
        }
        if (memtable_key != nullptr) {
          skip_list_iter_->Seek(memtable_key);
        } else {
          IterKey encoded_key;
          encoded_key.EncodeLengthPrefixedKey(k);
          skip_list_iter_->Seek(encoded_key.GetUserKey().data());
        }
      } else {
        // The bucket is organized as a linked list
        skip_list_iter_.reset();
        Reset(memtable_rep_.GetLinkListFirstNode(bucket));
        HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
      }
    }

    bool Valid() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->Valid();
      }
      return HashLinkListRep::LinkListIterator::Valid();
    }

    const char* key() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->key();
      }
      return HashLinkListRep::LinkListIterator::key();
    }

    void Next() override {
      if (skip_list_iter_) {
        skip_list_iter_->Next();
      } else {
        HashLinkListRep::LinkListIterator::Next();
      }
    }

   private:
    // the underlying memtable
    const HashLinkListRep& memtable_rep_;
    std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_;
  };

  class EmptyIterator : public MemTableRep::Iterator {
    // This is used when there wasn't a bucket. It is cheaper than
    // instantiating an empty bucket over which to iterate.
   public:
    EmptyIterator() {}
    bool Valid() const override { return false; }
    const char* key() const override {
      assert(false);
      return nullptr;
    }
    void Next() override {}
    void Prev() override {}
    void Seek(const Slice& /*user_key*/,
              const char* /*memtable_key*/) override {}
    void SeekForPrev(const Slice& /*user_key*/,
                     const char* /*memtable_key*/) override {}
    void SeekToFirst() override {}
    void SeekToLast() override {}
  };
};

HashLinkListRep::HashLinkListRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, size_t bucket_size,
    uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger,
    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash)
    : MemTableRep(allocator),
      bucket_size_(bucket_size),
      // A skip-list threshold of less than 3 doesn't make sense, so we
      // force a minimum of 3 to simplify the implementation.
      threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)),
      transform_(transform),
      compare_(compare),
      logger_(logger),
      bucket_entries_logging_threshold_(bucket_entries_logging_threshold),
      if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {
  char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size,
                                          huge_page_tlb_size, logger);

  buckets_ = new (mem) Pointer[bucket_size];

  for (size_t i = 0; i < bucket_size_; ++i) {
    buckets_[i].store(nullptr, std::memory_order_relaxed);
  }
}

HashLinkListRep::~HashLinkListRep() {
}

KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
  char* mem = allocator_->AllocateAligned(sizeof(Node) + len);
  Node* x = new (mem) Node();
  *buf = x->key;
  return static_cast<void*>(x);
}

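// Illustrative caller contract (a sketch of what the memtable layer does; the
// exact call site is outside this file): the caller obtains a buffer via
// Allocate(), encodes the length-prefixed internal key (and value) into it,
// and only then hands the KeyHandle to Insert():
//
//   char* buf = nullptr;
//   KeyHandle h = rep->Allocate(encoded_len, &buf);
//   memcpy(buf, encoded_entry, encoded_len);  // length-prefixed encoding
//   rep->Insert(h);
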
SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
    Pointer* first_next_pointer) const {
  if (first_next_pointer == nullptr) {
    return nullptr;
  }
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Single entry bucket
    return nullptr;
  }
  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  if (header->IsSkipListBucket()) {
    assert(header->GetNumEntries() > threshold_use_skiplist_);
    auto* skip_list_bucket_header =
        reinterpret_cast<SkipListBucketHeader*>(header);
    assert(skip_list_bucket_header->Counting_header.next.load(
               std::memory_order_relaxed) == header);
    return skip_list_bucket_header;
  }
  assert(header->GetNumEntries() <= threshold_use_skiplist_);
  return nullptr;
}

Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const {
  if (first_next_pointer == nullptr) {
    return nullptr;
  }
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Single entry bucket
    return reinterpret_cast<Node*>(first_next_pointer);
  }
  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  if (!header->IsSkipListBucket()) {
    assert(header->GetNumEntries() <= threshold_use_skiplist_);
    return reinterpret_cast<Node*>(
        header->next.load(std::memory_order_acquire));
  }
  assert(header->GetNumEntries() > threshold_use_skiplist_);
  return nullptr;
}

void HashLinkListRep::Insert(KeyHandle handle) {
  Node* x = static_cast<Node*>(handle);
  assert(!Contains(x->key));
  Slice internal_key = GetLengthPrefixedSlice(x->key);
  auto transformed = GetPrefix(internal_key);
  auto& bucket = buckets_[GetHash(transformed)];
  Pointer* first_next_pointer =
      static_cast<Pointer*>(bucket.load(std::memory_order_relaxed));

  if (first_next_pointer == nullptr) {
    // Case 1. empty bucket
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" via the bucket pointer.
    x->NoBarrier_SetNext(nullptr);
    bucket.store(x, std::memory_order_release);
    return;
  }

  BucketHeader* header = nullptr;
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Case 2. only one entry in the bucket
    // Need to convert to a counting bucket, turning it into case 3.
    Node* first = reinterpret_cast<Node*>(first_next_pointer);
    // Need to add a bucket header.
    // We have to convert it to a bucket with a header before inserting the
    // new node. Otherwise, we might need to change the next pointer of
    // first. In that case, a reader might see that the next pointer is NULL
    // and wrongly think the node is a bucket header.
    auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader));
    header = new (mem) BucketHeader(first, 1);
    bucket.store(header, std::memory_order_release);
  } else {
    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
    if (header->IsSkipListBucket()) {
      // Case 4. Bucket is already a skip list
      assert(header->GetNumEntries() > threshold_use_skiplist_);
      auto* skip_list_bucket_header =
          reinterpret_cast<SkipListBucketHeader*>(header);
      // Only one thread can execute Insert() at a time, so there is no need
      // for an atomic increment.
      skip_list_bucket_header->Counting_header.IncNumEntries();
      skip_list_bucket_header->skip_list.Insert(x->key);
      return;
    }
  }

  if (bucket_entries_logging_threshold_ > 0 &&
      header->GetNumEntries() ==
          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
    Info(logger_,
         "HashLinkedList bucket %" ROCKSDB_PRIszt
         " has more than %d "
         "entries. Key to insert: %s",
         GetHash(transformed), header->GetNumEntries(),
         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
  }

  if (header->GetNumEntries() == threshold_use_skiplist_) {
    // Case 3. The number of entries reaches the threshold, so the bucket
    // needs to be converted to a skip list.
    LinkListIterator bucket_iter(
        this, reinterpret_cast<Node*>(
                  first_next_pointer->load(std::memory_order_relaxed)));
    auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader));
    SkipListBucketHeader* new_skip_list_header = new (mem)
        SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1);
    auto& skip_list = new_skip_list_header->skip_list;

    // Add all current entries to the skip list
    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
      skip_list.Insert(bucket_iter.key());
    }

    // Insert the new entry
    skip_list.Insert(x->key);
    // Set the bucket
    bucket.store(new_skip_list_header, std::memory_order_release);
  } else {
    // Case 5. Insert into the sorted linked list without changing the
    // header.
    Node* first =
        reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed));
    assert(first != nullptr);
    // Advance the counter, since the bucket remains a linked list. (When the
    // bucket is converted to a skip list instead, the old header's count is
    // left unchanged, so a stored linked-list count never exceeds
    // threshold_use_skiplist_ and readers never cast to the wrong format.)
    header->IncNumEntries();

    Node* cur = first;
    Node* prev = nullptr;
    while (true) {
      if (cur == nullptr) {
        break;
      }
      Node* next = cur->Next();
      // Make sure the list is sorted. If cur is the first node or next is
      // nullptr, this is trivially satisfied.
      assert((cur == first) || (next == nullptr) ||
             KeyIsAfterNode(next->key, cur));
      if (KeyIsAfterNode(internal_key, cur)) {
        // Keep searching in this list
        prev = cur;
        cur = next;
      } else {
        break;
      }
    }

    // Our data structure does not allow duplicate insertion
    assert(cur == nullptr || !Equal(x->key, cur->key));

    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x", either in prev->SetNext() or in the
    // release store to header->next below.
    x->NoBarrier_SetNext(cur);

    if (prev) {
      prev->SetNext(x);
    } else {
      header->next.store(static_cast<void*>(x), std::memory_order_release);
    }
  }
}

bool HashLinkListRep::Contains(const char* key) const {
  Slice internal_key = GetLengthPrefixedSlice(key);

  auto transformed = GetPrefix(internal_key);
  auto bucket = GetBucket(transformed);
  if (bucket == nullptr) {
    return false;
  }

  SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
  if (skip_list_header != nullptr) {
    return skip_list_header->skip_list.Contains(key);
  } else {
    return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
  }
}

size_t HashLinkListRep::ApproximateMemoryUsage() {
  // Memory is always allocated from the allocator.
  return 0;
}

void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
                          bool (*callback_func)(void* arg, const char* entry)) {
  auto transformed = transform_->Transform(k.user_key());
  auto bucket = GetBucket(transformed);

  auto* skip_list_header = GetSkipListBucketHeader(bucket);
  if (skip_list_header != nullptr) {
    // Is a skip list
    MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
    for (iter.Seek(k.memtable_key().data());
         iter.Valid() && callback_func(callback_args, iter.key());
         iter.Next()) {
    }
  } else {
    auto* link_list_head = GetLinkListFirstNode(bucket);
    if (link_list_head != nullptr) {
      LinkListIterator iter(this, link_list_head);
      for (iter.Seek(k.internal_key(), nullptr);
           iter.Valid() && callback_func(callback_args, iter.key());
           iter.Next()) {
      }
    }
  }
}

MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
  // allocate a new arena of similar size to the one currently in use
  Arena* new_arena = new Arena(allocator_->BlockSize());
  auto list = new MemtableSkipList(compare_, new_arena);
  HistogramImpl keys_per_bucket_hist;

  for (size_t i = 0; i < bucket_size_; ++i) {
    int count = 0;
    auto* bucket = GetBucket(i);
    if (bucket != nullptr) {
      auto* skip_list_header = GetSkipListBucketHeader(bucket);
      if (skip_list_header != nullptr) {
        // Is a skip list
        MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
        for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
          list->Insert(itr.key());
          count++;
        }
      } else {
        auto* link_list_head = GetLinkListFirstNode(bucket);
        if (link_list_head != nullptr) {
          LinkListIterator itr(this, link_list_head);
          for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
            list->Insert(itr.key());
            count++;
          }
        }
      }
    }
    if (if_log_bucket_dist_when_flash_) {
      keys_per_bucket_hist.Add(count);
    }
  }
  if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) {
    Info(logger_, "hashLinkedList Entry distribution among buckets: %s",
         keys_per_bucket_hist.ToString().c_str());
  }

  if (alloc_arena == nullptr) {
    return new FullListIterator(list, new_arena);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator));
    return new (mem) FullListIterator(list, new_arena);
  }
}

MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator(
    Arena* alloc_arena) {
  if (alloc_arena == nullptr) {
    return new DynamicIterator(*this);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator));
    return new (mem) DynamicIterator(*this);
  }
}

bool HashLinkListRep::LinkListContains(Node* head,
                                       const Slice& user_key) const {
  Node* x = FindGreaterOrEqualInBucket(head, user_key);
  return (x != nullptr && Equal(user_key, x->key));
}

Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
                                                  const Slice& key) const {
  Node* x = head;
  while (true) {
    if (x == nullptr) {
      return x;
    }
    Node* next = x->Next();
    // Make sure the list is sorted. If x is the head or next is nullptr,
    // this is trivially satisfied.
    assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x));
    if (KeyIsAfterNode(key, x)) {
      // Keep searching in this list
      x = next;
    } else {
      break;
    }
  }
  return x;
}

}  // anonymous namespace

MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, Logger* logger) {
  return new HashLinkListRep(compare, allocator, transform, bucket_count_,
                             threshold_use_skiplist_, huge_page_tlb_size_,
                             logger, bucket_entries_logging_threshold_,
                             if_log_bucket_dist_when_flash_);
}

MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count, size_t huge_page_tlb_size,
    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash,
    uint32_t threshold_use_skiplist) {
  return new HashLinkListRepFactory(
      bucket_count, threshold_use_skiplist, huge_page_tlb_size,
      bucket_entries_logging_threshold, if_log_bucket_dist_when_flash);
}
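
// Usage sketch (illustrative; not part of this file). A hash-linklist
// memtable requires a prefix extractor, and this rep does not support
// concurrent memtable writes, so that option must stay off. The bucket
// count and prefix length below are arbitrary example values:
//
//   rocksdb::Options options;
//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
//   options.memtable_factory.reset(
//       rocksdb::NewHashLinkListRepFactory(50000 /* bucket_count */));
//   options.allow_concurrent_memtable_write = false;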

}  // namespace ROCKSDB_NAMESPACE
#endif  // ROCKSDB_LITE