1 //===- ConcurrentHashtable.h ------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_ADT_CONCURRENTHASHTABLE_H
10 #define LLVM_ADT_CONCURRENTHASHTABLE_H
11 
12 #include "llvm/ADT/DenseMap.h"
13 #include "llvm/ADT/Hashing.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/SmallVector.h"
16 #include "llvm/Support/Allocator.h"
17 #include "llvm/Support/Debug.h"
18 #include "llvm/Support/Parallel.h"
19 #include "llvm/Support/WithColor.h"
20 #include "llvm/Support/xxhash.h"
21 #include <atomic>
22 #include <cstddef>
23 #include <iomanip>
24 #include <mutex>
25 #include <sstream>
26 #include <type_traits>
27 
28 namespace llvm {
29 
30 /// ConcurrentHashTable - is a resizeable concurrent hashtable.
31 /// The number of resizings limited up to x2^31. This hashtable is
32 /// useful to have efficient access to aggregate data(like strings,
33 /// type descriptors...) and to keep only single copy of such
34 /// an aggregate. The hashtable allows only concurrent insertions:
35 ///
36 /// KeyDataTy* = insert ( const KeyTy& );
37 ///
38 /// Data structure:
39 ///
40 /// Inserted value KeyTy is mapped to 64-bit hash value ->
41 ///
42 ///          [------- 64-bit Hash value --------]
43 ///          [  StartEntryIndex ][ Bucket Index ]
44 ///                    |                |
45 ///              points to the     points to
46 ///              first probe       the bucket.
47 ///              position inside
48 ///              bucket entries
49 ///
50 /// After initialization, all buckets have an initial size. During insertions,
51 /// buckets might be extended to contain more entries. Each bucket can be
52 /// independently resized and rehashed(no need to lock the whole table).
53 /// Different buckets may have different sizes. If the single bucket is full
54 /// then the bucket is resized.
55 ///
56 /// BucketsArray keeps all buckets. Each bucket keeps an array of Entries
57 /// (pointers to KeyDataTy) and another array of entries hashes:
58 ///
59 /// BucketsArray[BucketIdx].Hashes[EntryIdx]:
60 /// BucketsArray[BucketIdx].Entries[EntryIdx]:
61 ///
62 /// [Bucket 0].Hashes -> [uint32_t][uint32_t]
63 /// [Bucket 0].Entries -> [KeyDataTy*][KeyDataTy*]
64 ///
65 /// [Bucket 1].Hashes -> [uint32_t][uint32_t][uint32_t][uint32_t]
66 /// [Bucket 1].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*][KeyDataTy*]
67 ///                      .........................
68 /// [Bucket N].Hashes -> [uint32_t][uint32_t][uint32_t]
69 /// [Bucket N].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*]
70 ///
71 /// ConcurrentHashTableByPtr uses an external thread-safe allocator to allocate
72 /// KeyDataTy items.
73 
74 template <typename KeyTy, typename KeyDataTy, typename AllocatorTy>
75 class ConcurrentHashTableInfoByPtr {
76 public:
77   /// \returns Hash value for the specified \p Key.
getHashValue(const KeyTy & Key)78   static inline uint64_t getHashValue(const KeyTy &Key) {
79     return xxh3_64bits(Key);
80   }
81 
82   /// \returns true if both \p LHS and \p RHS are equal.
isEqual(const KeyTy & LHS,const KeyTy & RHS)83   static inline bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
84     return LHS == RHS;
85   }
86 
87   /// \returns key for the specified \p KeyData.
getKey(const KeyDataTy & KeyData)88   static inline const KeyTy &getKey(const KeyDataTy &KeyData) {
89     return KeyData.getKey();
90   }
91 
92   /// \returns newly created object of KeyDataTy type.
create(const KeyTy & Key,AllocatorTy & Allocator)93   static inline KeyDataTy *create(const KeyTy &Key, AllocatorTy &Allocator) {
94     return KeyDataTy::create(Key, Allocator);
95   }
96 };
97 
98 template <typename KeyTy, typename KeyDataTy, typename AllocatorTy,
99           typename Info =
100               ConcurrentHashTableInfoByPtr<KeyTy, KeyDataTy, AllocatorTy>>
101 class ConcurrentHashTableByPtr {
102 public:
103   ConcurrentHashTableByPtr(
104       AllocatorTy &Allocator, uint64_t EstimatedSize = 100000,
105       size_t ThreadsNum = parallel::strategy.compute_thread_count(),
106       size_t InitialNumberOfBuckets = 128)
MultiThreadAllocator(Allocator)107       : MultiThreadAllocator(Allocator) {
108     assert((ThreadsNum > 0) && "ThreadsNum must be greater than 0");
109     assert((InitialNumberOfBuckets > 0) &&
110            "InitialNumberOfBuckets must be greater than 0");
111 
112     // Calculate number of buckets.
113     uint64_t EstimatedNumberOfBuckets = ThreadsNum;
114     if (ThreadsNum > 1) {
115       EstimatedNumberOfBuckets *= InitialNumberOfBuckets;
116       EstimatedNumberOfBuckets *= std::max(
117           1,
118           countr_zero(PowerOf2Ceil(EstimatedSize / InitialNumberOfBuckets)) >>
119               2);
120     }
121     EstimatedNumberOfBuckets = PowerOf2Ceil(EstimatedNumberOfBuckets);
122     NumberOfBuckets =
123         std::min(EstimatedNumberOfBuckets, (uint64_t)(1Ull << 31));
124 
125     // Allocate buckets.
126     BucketsArray = std::make_unique<Bucket[]>(NumberOfBuckets);
127 
128     InitialBucketSize = EstimatedSize / NumberOfBuckets;
129     InitialBucketSize = std::max((uint32_t)1, InitialBucketSize);
130     InitialBucketSize = PowerOf2Ceil(InitialBucketSize);
131 
132     // Initialize each bucket.
133     for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) {
134       HashesPtr Hashes = new ExtHashBitsTy[InitialBucketSize];
135       memset(Hashes, 0, sizeof(ExtHashBitsTy) * InitialBucketSize);
136 
137       DataPtr Entries = new EntryDataTy[InitialBucketSize];
138       memset(Entries, 0, sizeof(EntryDataTy) * InitialBucketSize);
139 
140       BucketsArray[Idx].Size = InitialBucketSize;
141       BucketsArray[Idx].Hashes = Hashes;
142       BucketsArray[Idx].Entries = Entries;
143     }
144 
145     // Calculate masks.
146     HashMask = NumberOfBuckets - 1;
147 
148     size_t LeadingZerosNumber = countl_zero(HashMask);
149     HashBitsNum = 64 - LeadingZerosNumber;
150 
151     // We keep only high 32-bits of hash value. So bucket size cannot
152     // exceed 2^31. Bucket size is always power of two.
153     MaxBucketSize = 1Ull << (std::min((size_t)31, LeadingZerosNumber));
154 
155     // Calculate mask for extended hash bits.
156     ExtHashMask = (uint64_t)NumberOfBuckets * MaxBucketSize - 1;
157   }
158 
~ConcurrentHashTableByPtr()159   virtual ~ConcurrentHashTableByPtr() {
160     // Deallocate buckets.
161     for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) {
162       delete[] BucketsArray[Idx].Hashes;
163       delete[] BucketsArray[Idx].Entries;
164     }
165   }
166 
167   /// Insert new value \p NewValue or return already existing entry.
168   ///
169   /// \returns entry and "true" if an entry is just inserted or
170   /// "false" if an entry already exists.
insert(const KeyTy & NewValue)171   std::pair<KeyDataTy *, bool> insert(const KeyTy &NewValue) {
172     // Calculate bucket index.
173     uint64_t Hash = Info::getHashValue(NewValue);
174     Bucket &CurBucket = BucketsArray[getBucketIdx(Hash)];
175     uint32_t ExtHashBits = getExtHashBits(Hash);
176 
177 #if LLVM_ENABLE_THREADS
178     // Lock bucket.
179     CurBucket.Guard.lock();
180 #endif
181 
182     HashesPtr BucketHashes = CurBucket.Hashes;
183     DataPtr BucketEntries = CurBucket.Entries;
184     uint32_t CurEntryIdx = getStartIdx(ExtHashBits, CurBucket.Size);
185 
186     while (true) {
187       uint32_t CurEntryHashBits = BucketHashes[CurEntryIdx];
188 
189       if (CurEntryHashBits == 0 && BucketEntries[CurEntryIdx] == nullptr) {
190         // Found empty slot. Insert data.
191         KeyDataTy *NewData = Info::create(NewValue, MultiThreadAllocator);
192         BucketEntries[CurEntryIdx] = NewData;
193         BucketHashes[CurEntryIdx] = ExtHashBits;
194 
195         CurBucket.NumberOfEntries++;
196         RehashBucket(CurBucket);
197 
198 #if LLVM_ENABLE_THREADS
199         CurBucket.Guard.unlock();
200 #endif
201 
202         return {NewData, true};
203       }
204 
205       if (CurEntryHashBits == ExtHashBits) {
206         // Hash matched. Check value for equality.
207         KeyDataTy *EntryData = BucketEntries[CurEntryIdx];
208         if (Info::isEqual(Info::getKey(*EntryData), NewValue)) {
209           // Already existed entry matched with inserted data is found.
210 #if LLVM_ENABLE_THREADS
211           CurBucket.Guard.unlock();
212 #endif
213 
214           return {EntryData, false};
215         }
216       }
217 
218       CurEntryIdx++;
219       CurEntryIdx &= (CurBucket.Size - 1);
220     }
221 
222     llvm_unreachable("Insertion error.");
223     return {};
224   }
225 
226   /// Print information about current state of hash table structures.
printStatistic(raw_ostream & OS)227   void printStatistic(raw_ostream &OS) {
228     OS << "\n--- HashTable statistic:\n";
229     OS << "\nNumber of buckets = " << NumberOfBuckets;
230     OS << "\nInitial bucket size = " << InitialBucketSize;
231 
232     uint64_t NumberOfNonEmptyBuckets = 0;
233     uint64_t NumberOfEntriesPlusEmpty = 0;
234     uint64_t OverallNumberOfEntries = 0;
235     uint64_t OverallSize = sizeof(*this) + NumberOfBuckets * sizeof(Bucket);
236 
237     DenseMap<uint32_t, uint32_t> BucketSizesMap;
238 
239     // For each bucket...
240     for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) {
241       Bucket &CurBucket = BucketsArray[Idx];
242 
243       BucketSizesMap[CurBucket.Size]++;
244 
245       if (CurBucket.NumberOfEntries != 0)
246         NumberOfNonEmptyBuckets++;
247       NumberOfEntriesPlusEmpty += CurBucket.Size;
248       OverallNumberOfEntries += CurBucket.NumberOfEntries;
249       OverallSize +=
250           (sizeof(ExtHashBitsTy) + sizeof(EntryDataTy)) * CurBucket.Size;
251     }
252 
253     OS << "\nOverall number of entries = " << OverallNumberOfEntries;
254     OS << "\nOverall number of non empty buckets = " << NumberOfNonEmptyBuckets;
255     for (auto &BucketSize : BucketSizesMap)
256       OS << "\n Number of buckets with size " << BucketSize.first << ": "
257          << BucketSize.second;
258 
259     std::stringstream stream;
260     stream << std::fixed << std::setprecision(2)
261            << ((float)OverallNumberOfEntries / (float)NumberOfEntriesPlusEmpty);
262     std::string str = stream.str();
263 
264     OS << "\nLoad factor = " << str;
265     OS << "\nOverall allocated size = " << OverallSize;
266   }
267 
268 protected:
269   using ExtHashBitsTy = uint32_t;
270   using EntryDataTy = KeyDataTy *;
271 
272   using HashesPtr = ExtHashBitsTy *;
273   using DataPtr = EntryDataTy *;
274 
275   // Bucket structure. Keeps bucket data.
276   struct Bucket {
277     Bucket() = default;
278 
279     // Size of bucket.
280     uint32_t Size = 0;
281 
282     // Number of non-null entries.
283     uint32_t NumberOfEntries = 0;
284 
285     // Hashes for [Size] entries.
286     HashesPtr Hashes = nullptr;
287 
288     // [Size] entries.
289     DataPtr Entries = nullptr;
290 
291 #if LLVM_ENABLE_THREADS
292     // Mutex for this bucket.
293     std::mutex Guard;
294 #endif
295   };
296 
297   // Reallocate and rehash bucket if this is full enough.
RehashBucket(Bucket & CurBucket)298   void RehashBucket(Bucket &CurBucket) {
299     assert((CurBucket.Size > 0) && "Uninitialised bucket");
300     if (CurBucket.NumberOfEntries < CurBucket.Size * 0.9)
301       return;
302 
303     if (CurBucket.Size >= MaxBucketSize)
304       report_fatal_error("ConcurrentHashTable is full");
305 
306     uint32_t NewBucketSize = CurBucket.Size << 1;
307     assert((NewBucketSize <= MaxBucketSize) && "New bucket size is too big");
308     assert((CurBucket.Size < NewBucketSize) &&
309            "New bucket size less than size of current bucket");
310 
311     // Store old entries & hashes arrays.
312     HashesPtr SrcHashes = CurBucket.Hashes;
313     DataPtr SrcEntries = CurBucket.Entries;
314 
315     // Allocate new entries&hashes arrays.
316     HashesPtr DestHashes = new ExtHashBitsTy[NewBucketSize];
317     memset(DestHashes, 0, sizeof(ExtHashBitsTy) * NewBucketSize);
318 
319     DataPtr DestEntries = new EntryDataTy[NewBucketSize];
320     memset(DestEntries, 0, sizeof(EntryDataTy) * NewBucketSize);
321 
322     // For each entry in source arrays...
323     for (uint32_t CurSrcEntryIdx = 0; CurSrcEntryIdx < CurBucket.Size;
324          CurSrcEntryIdx++) {
325       uint32_t CurSrcEntryHashBits = SrcHashes[CurSrcEntryIdx];
326 
327       // Check for null entry.
328       if (CurSrcEntryHashBits == 0 && SrcEntries[CurSrcEntryIdx] == nullptr)
329         continue;
330 
331       uint32_t StartDestIdx = getStartIdx(CurSrcEntryHashBits, NewBucketSize);
332 
333       // Insert non-null entry into the new arrays.
334       while (true) {
335         uint32_t CurDestEntryHashBits = DestHashes[StartDestIdx];
336 
337         if (CurDestEntryHashBits == 0 && DestEntries[StartDestIdx] == nullptr) {
338           // Found empty slot. Insert data.
339           DestHashes[StartDestIdx] = CurSrcEntryHashBits;
340           DestEntries[StartDestIdx] = SrcEntries[CurSrcEntryIdx];
341           break;
342         }
343 
344         StartDestIdx++;
345         StartDestIdx = StartDestIdx & (NewBucketSize - 1);
346       }
347     }
348 
349     // Update bucket fields.
350     CurBucket.Hashes = DestHashes;
351     CurBucket.Entries = DestEntries;
352     CurBucket.Size = NewBucketSize;
353 
354     // Delete old bucket entries.
355     if (SrcHashes != nullptr)
356       delete[] SrcHashes;
357     if (SrcEntries != nullptr)
358       delete[] SrcEntries;
359   }
360 
getBucketIdx(hash_code Hash)361   uint32_t getBucketIdx(hash_code Hash) { return Hash & HashMask; }
362 
getExtHashBits(uint64_t Hash)363   uint32_t getExtHashBits(uint64_t Hash) {
364     return (Hash & ExtHashMask) >> HashBitsNum;
365   }
366 
getStartIdx(uint32_t ExtHashBits,uint32_t BucketSize)367   uint32_t getStartIdx(uint32_t ExtHashBits, uint32_t BucketSize) {
368     assert((BucketSize > 0) && "Empty bucket");
369 
370     return ExtHashBits & (BucketSize - 1);
371   }
372 
373   // Number of bits in hash mask.
374   uint64_t HashBitsNum = 0;
375 
376   // Hash mask.
377   uint64_t HashMask = 0;
378 
379   // Hash mask for the extended hash bits.
380   uint64_t ExtHashMask = 0;
381 
382   // The maximal bucket size.
383   uint32_t MaxBucketSize = 0;
384 
385   // Initial size of bucket.
386   uint32_t InitialBucketSize = 0;
387 
388   // The number of buckets.
389   uint32_t NumberOfBuckets = 0;
390 
391   // Array of buckets.
392   std::unique_ptr<Bucket[]> BucketsArray;
393 
394   // Used for allocating KeyDataTy values.
395   AllocatorTy &MultiThreadAllocator;
396 };
397 
398 } // end namespace llvm
399 
400 #endif // LLVM_ADT_CONCURRENTHASHTABLE_H
401