1 //===- ConcurrentHashtable.h ------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_ADT_CONCURRENTHASHTABLE_H
10 #define LLVM_ADT_CONCURRENTHASHTABLE_H
11 
12 #include "llvm/ADT/DenseMap.h"
13 #include "llvm/ADT/Hashing.h"
14 #include "llvm/ADT/PointerIntPair.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/Support/Allocator.h"
18 #include "llvm/Support/Debug.h"
19 #include "llvm/Support/Parallel.h"
20 #include "llvm/Support/WithColor.h"
21 #include "llvm/Support/xxhash.h"
22 #include <atomic>
23 #include <cstddef>
24 #include <iomanip>
25 #include <mutex>
26 #include <sstream>
27 #include <type_traits>
28 
29 namespace llvm {
30 
31 /// ConcurrentHashTable - is a resizeable concurrent hashtable.
32 /// The number of resizings limited up to x2^31. This hashtable is
33 /// useful to have efficient access to aggregate data(like strings,
34 /// type descriptors...) and to keep only single copy of such
35 /// an aggregate. The hashtable allows only concurrent insertions:
36 ///
37 /// KeyDataTy* = insert ( const KeyTy& );
38 ///
39 /// Data structure:
40 ///
41 /// Inserted value KeyTy is mapped to 64-bit hash value ->
42 ///
43 ///          [------- 64-bit Hash value --------]
44 ///          [  StartEntryIndex ][ Bucket Index ]
45 ///                    |                |
46 ///              points to the     points to
47 ///              first probe       the bucket.
48 ///              position inside
49 ///              bucket entries
50 ///
51 /// After initialization, all buckets have an initial size. During insertions,
52 /// buckets might be extended to contain more entries. Each bucket can be
53 /// independently resized and rehashed(no need to lock the whole table).
54 /// Different buckets may have different sizes. If the single bucket is full
55 /// then the bucket is resized.
56 ///
57 /// BucketsArray keeps all buckets. Each bucket keeps an array of Entries
58 /// (pointers to KeyDataTy) and another array of entries hashes:
59 ///
60 /// BucketsArray[BucketIdx].Hashes[EntryIdx]:
61 /// BucketsArray[BucketIdx].Entries[EntryIdx]:
62 ///
63 /// [Bucket 0].Hashes -> [uint32_t][uint32_t]
64 /// [Bucket 0].Entries -> [KeyDataTy*][KeyDataTy*]
65 ///
66 /// [Bucket 1].Hashes -> [uint32_t][uint32_t][uint32_t][uint32_t]
67 /// [Bucket 1].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*][KeyDataTy*]
68 ///                      .........................
69 /// [Bucket N].Hashes -> [uint32_t][uint32_t][uint32_t]
70 /// [Bucket N].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*]
71 ///
72 /// ConcurrentHashTableByPtr uses an external thread-safe allocator to allocate
73 /// KeyDataTy items.
74 
75 template <typename KeyTy, typename KeyDataTy, typename AllocatorTy>
76 class ConcurrentHashTableInfoByPtr {
77 public:
78   /// \returns Hash value for the specified \p Key.
79   static inline uint64_t getHashValue(const KeyTy &Key) {
80     return xxh3_64bits(Key);
81   }
82 
83   /// \returns true if both \p LHS and \p RHS are equal.
84   static inline bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
85     return LHS == RHS;
86   }
87 
88   /// \returns key for the specified \p KeyData.
89   static inline const KeyTy &getKey(const KeyDataTy &KeyData) {
90     return KeyData.getKey();
91   }
92 
93   /// \returns newly created object of KeyDataTy type.
94   static inline KeyDataTy *create(const KeyTy &Key, AllocatorTy &Allocator) {
95     return KeyDataTy::create(Key, Allocator);
96   }
97 };
98 
99 template <typename KeyTy, typename KeyDataTy, typename AllocatorTy,
100           typename Info =
101               ConcurrentHashTableInfoByPtr<KeyTy, KeyDataTy, AllocatorTy>>
102 class ConcurrentHashTableByPtr {
103 public:
104   ConcurrentHashTableByPtr(
105       AllocatorTy &Allocator, uint64_t EstimatedSize = 100000,
106       size_t ThreadsNum = parallel::strategy.compute_thread_count(),
107       size_t InitialNumberOfBuckets = 128)
108       : MultiThreadAllocator(Allocator) {
109     assert((ThreadsNum > 0) && "ThreadsNum must be greater than 0");
110     assert((InitialNumberOfBuckets > 0) &&
111            "InitialNumberOfBuckets must be greater than 0");
112 
113     // Calculate number of buckets.
114     uint64_t EstimatedNumberOfBuckets = ThreadsNum;
115     if (ThreadsNum > 1) {
116       EstimatedNumberOfBuckets *= InitialNumberOfBuckets;
117       EstimatedNumberOfBuckets *= std::max(
118           1,
119           countr_zero(PowerOf2Ceil(EstimatedSize / InitialNumberOfBuckets)) >>
120               2);
121     }
122     EstimatedNumberOfBuckets = PowerOf2Ceil(EstimatedNumberOfBuckets);
123     NumberOfBuckets =
124         std::min(EstimatedNumberOfBuckets, (uint64_t)(1Ull << 31));
125 
126     // Allocate buckets.
127     BucketsArray = std::make_unique<Bucket[]>(NumberOfBuckets);
128 
129     InitialBucketSize = EstimatedSize / NumberOfBuckets;
130     InitialBucketSize = std::max((uint32_t)1, InitialBucketSize);
131     InitialBucketSize = PowerOf2Ceil(InitialBucketSize);
132 
133     // Initialize each bucket.
134     for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) {
135       HashesPtr Hashes = new ExtHashBitsTy[InitialBucketSize];
136       memset(Hashes, 0, sizeof(ExtHashBitsTy) * InitialBucketSize);
137 
138       DataPtr Entries = new EntryDataTy[InitialBucketSize];
139       memset(Entries, 0, sizeof(EntryDataTy) * InitialBucketSize);
140 
141       BucketsArray[Idx].Size = InitialBucketSize;
142       BucketsArray[Idx].Hashes = Hashes;
143       BucketsArray[Idx].Entries = Entries;
144     }
145 
146     // Calculate masks.
147     HashMask = NumberOfBuckets - 1;
148 
149     size_t LeadingZerosNumber = countl_zero(HashMask);
150     HashBitsNum = 64 - LeadingZerosNumber;
151 
152     // We keep only high 32-bits of hash value. So bucket size cannot
153     // exceed 2^31. Bucket size is always power of two.
154     MaxBucketSize = 1Ull << (std::min((size_t)31, LeadingZerosNumber));
155 
156     // Calculate mask for extended hash bits.
157     ExtHashMask = (NumberOfBuckets * MaxBucketSize) - 1;
158   }
159 
160   virtual ~ConcurrentHashTableByPtr() {
161     // Deallocate buckets.
162     for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) {
163       delete[] BucketsArray[Idx].Hashes;
164       delete[] BucketsArray[Idx].Entries;
165     }
166   }
167 
168   /// Insert new value \p NewValue or return already existing entry.
169   ///
170   /// \returns entry and "true" if an entry is just inserted or
171   /// "false" if an entry already exists.
172   std::pair<KeyDataTy *, bool> insert(const KeyTy &NewValue) {
173     // Calculate bucket index.
174     uint64_t Hash = Info::getHashValue(NewValue);
175     Bucket &CurBucket = BucketsArray[getBucketIdx(Hash)];
176     uint32_t ExtHashBits = getExtHashBits(Hash);
177 
178 #if LLVM_ENABLE_THREADS
179     // Lock bucket.
180     CurBucket.Guard.lock();
181 #endif
182 
183     HashesPtr BucketHashes = CurBucket.Hashes;
184     DataPtr BucketEntries = CurBucket.Entries;
185     uint32_t CurEntryIdx = getStartIdx(ExtHashBits, CurBucket.Size);
186 
187     while (true) {
188       uint32_t CurEntryHashBits = BucketHashes[CurEntryIdx];
189 
190       if (CurEntryHashBits == 0 && BucketEntries[CurEntryIdx] == nullptr) {
191         // Found empty slot. Insert data.
192         KeyDataTy *NewData = Info::create(NewValue, MultiThreadAllocator);
193         BucketEntries[CurEntryIdx] = NewData;
194         BucketHashes[CurEntryIdx] = ExtHashBits;
195 
196         CurBucket.NumberOfEntries++;
197         RehashBucket(CurBucket);
198 
199 #if LLVM_ENABLE_THREADS
200         CurBucket.Guard.unlock();
201 #endif
202 
203         return {NewData, true};
204       }
205 
206       if (CurEntryHashBits == ExtHashBits) {
207         // Hash matched. Check value for equality.
208         KeyDataTy *EntryData = BucketEntries[CurEntryIdx];
209         if (Info::isEqual(Info::getKey(*EntryData), NewValue)) {
210           // Already existed entry matched with inserted data is found.
211 #if LLVM_ENABLE_THREADS
212           CurBucket.Guard.unlock();
213 #endif
214 
215           return {EntryData, false};
216         }
217       }
218 
219       CurEntryIdx++;
220       CurEntryIdx &= (CurBucket.Size - 1);
221     }
222 
223     llvm_unreachable("Insertion error.");
224     return {};
225   }
226 
227   /// Print information about current state of hash table structures.
228   void printStatistic(raw_ostream &OS) {
229     OS << "\n--- HashTable statistic:\n";
230     OS << "\nNumber of buckets = " << NumberOfBuckets;
231     OS << "\nInitial bucket size = " << InitialBucketSize;
232 
233     uint64_t NumberOfNonEmptyBuckets = 0;
234     uint64_t NumberOfEntriesPlusEmpty = 0;
235     uint64_t OverallNumberOfEntries = 0;
236     uint64_t OverallSize = sizeof(*this) + NumberOfBuckets * sizeof(Bucket);
237 
238     DenseMap<uint32_t, uint32_t> BucketSizesMap;
239 
240     // For each bucket...
241     for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) {
242       Bucket &CurBucket = BucketsArray[Idx];
243 
244       BucketSizesMap[CurBucket.Size]++;
245 
246       if (CurBucket.NumberOfEntries != 0)
247         NumberOfNonEmptyBuckets++;
248       NumberOfEntriesPlusEmpty += CurBucket.Size;
249       OverallNumberOfEntries += CurBucket.NumberOfEntries;
250       OverallSize +=
251           (sizeof(ExtHashBitsTy) + sizeof(EntryDataTy)) * CurBucket.Size;
252     }
253 
254     OS << "\nOverall number of entries = " << OverallNumberOfEntries;
255     OS << "\nOverall number of non empty buckets = " << NumberOfNonEmptyBuckets;
256     for (auto &BucketSize : BucketSizesMap)
257       OS << "\n Number of buckets with size " << BucketSize.first << ": "
258          << BucketSize.second;
259 
260     std::stringstream stream;
261     stream << std::fixed << std::setprecision(2)
262            << ((float)OverallNumberOfEntries / (float)NumberOfEntriesPlusEmpty);
263     std::string str = stream.str();
264 
265     OS << "\nLoad factor = " << str;
266     OS << "\nOverall allocated size = " << OverallSize;
267   }
268 
269 protected:
270   using ExtHashBitsTy = uint32_t;
271   using EntryDataTy = KeyDataTy *;
272 
273   using HashesPtr = ExtHashBitsTy *;
274   using DataPtr = EntryDataTy *;
275 
276   // Bucket structure. Keeps bucket data.
277   struct Bucket {
278     Bucket() = default;
279 
280     // Size of bucket.
281     uint32_t Size = 0;
282 
283     // Number of non-null entries.
284     uint32_t NumberOfEntries = 0;
285 
286     // Hashes for [Size] entries.
287     HashesPtr Hashes = nullptr;
288 
289     // [Size] entries.
290     DataPtr Entries = nullptr;
291 
292 #if LLVM_ENABLE_THREADS
293     // Mutex for this bucket.
294     std::mutex Guard;
295 #endif
296   };
297 
298   // Reallocate and rehash bucket if this is full enough.
299   void RehashBucket(Bucket &CurBucket) {
300     assert((CurBucket.Size > 0) && "Uninitialised bucket");
301     if (CurBucket.NumberOfEntries < CurBucket.Size * 0.9)
302       return;
303 
304     if (CurBucket.Size >= MaxBucketSize)
305       report_fatal_error("ConcurrentHashTable is full");
306 
307     uint32_t NewBucketSize = CurBucket.Size << 1;
308     assert((NewBucketSize <= MaxBucketSize) && "New bucket size is too big");
309     assert((CurBucket.Size < NewBucketSize) &&
310            "New bucket size less than size of current bucket");
311 
312     // Store old entries & hashes arrays.
313     HashesPtr SrcHashes = CurBucket.Hashes;
314     DataPtr SrcEntries = CurBucket.Entries;
315 
316     // Allocate new entries&hashes arrays.
317     HashesPtr DestHashes = new ExtHashBitsTy[NewBucketSize];
318     memset(DestHashes, 0, sizeof(ExtHashBitsTy) * NewBucketSize);
319 
320     DataPtr DestEntries = new EntryDataTy[NewBucketSize];
321     memset(DestEntries, 0, sizeof(EntryDataTy) * NewBucketSize);
322 
323     // For each entry in source arrays...
324     for (uint32_t CurSrcEntryIdx = 0; CurSrcEntryIdx < CurBucket.Size;
325          CurSrcEntryIdx++) {
326       uint32_t CurSrcEntryHashBits = SrcHashes[CurSrcEntryIdx];
327 
328       // Check for null entry.
329       if (CurSrcEntryHashBits == 0 && SrcEntries[CurSrcEntryIdx] == nullptr)
330         continue;
331 
332       uint32_t StartDestIdx = getStartIdx(CurSrcEntryHashBits, NewBucketSize);
333 
334       // Insert non-null entry into the new arrays.
335       while (true) {
336         uint32_t CurDestEntryHashBits = DestHashes[StartDestIdx];
337 
338         if (CurDestEntryHashBits == 0 && DestEntries[StartDestIdx] == nullptr) {
339           // Found empty slot. Insert data.
340           DestHashes[StartDestIdx] = CurSrcEntryHashBits;
341           DestEntries[StartDestIdx] = SrcEntries[CurSrcEntryIdx];
342           break;
343         }
344 
345         StartDestIdx++;
346         StartDestIdx = StartDestIdx & (NewBucketSize - 1);
347       }
348     }
349 
350     // Update bucket fields.
351     CurBucket.Hashes = DestHashes;
352     CurBucket.Entries = DestEntries;
353     CurBucket.Size = NewBucketSize;
354 
355     // Delete old bucket entries.
356     if (SrcHashes != nullptr)
357       delete[] SrcHashes;
358     if (SrcEntries != nullptr)
359       delete[] SrcEntries;
360   }
361 
362   uint32_t getBucketIdx(hash_code Hash) { return Hash & HashMask; }
363 
364   uint32_t getExtHashBits(uint64_t Hash) {
365     return (Hash & ExtHashMask) >> HashBitsNum;
366   }
367 
368   uint32_t getStartIdx(uint32_t ExtHashBits, uint32_t BucketSize) {
369     assert((BucketSize > 0) && "Empty bucket");
370 
371     return ExtHashBits & (BucketSize - 1);
372   }
373 
374   // Number of bits in hash mask.
375   uint64_t HashBitsNum = 0;
376 
377   // Hash mask.
378   uint64_t HashMask = 0;
379 
380   // Hash mask for the extended hash bits.
381   uint64_t ExtHashMask = 0;
382 
383   // The maximal bucket size.
384   uint32_t MaxBucketSize = 0;
385 
386   // Initial size of bucket.
387   uint32_t InitialBucketSize = 0;
388 
389   // The number of buckets.
390   uint32_t NumberOfBuckets = 0;
391 
392   // Array of buckets.
393   std::unique_ptr<Bucket[]> BucketsArray;
394 
395   // Used for allocating KeyDataTy values.
396   AllocatorTy &MultiThreadAllocator;
397 };
398 
399 } // end namespace llvm
400 
401 #endif // LLVM_ADT_CONCURRENTHASHTABLE_H
402