1 //===- ConcurrentHashtable.h ------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_ADT_CONCURRENTHASHTABLE_H 10 #define LLVM_ADT_CONCURRENTHASHTABLE_H 11 12 #include "llvm/ADT/DenseMap.h" 13 #include "llvm/ADT/Hashing.h" 14 #include "llvm/ADT/PointerIntPair.h" 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/SmallVector.h" 17 #include "llvm/Support/Allocator.h" 18 #include "llvm/Support/Debug.h" 19 #include "llvm/Support/Parallel.h" 20 #include "llvm/Support/WithColor.h" 21 #include "llvm/Support/xxhash.h" 22 #include <atomic> 23 #include <cstddef> 24 #include <iomanip> 25 #include <mutex> 26 #include <sstream> 27 #include <type_traits> 28 29 namespace llvm { 30 31 /// ConcurrentHashTable - is a resizeable concurrent hashtable. 32 /// The number of resizings limited up to x2^31. This hashtable is 33 /// useful to have efficient access to aggregate data(like strings, 34 /// type descriptors...) and to keep only single copy of such 35 /// an aggregate. The hashtable allows only concurrent insertions: 36 /// 37 /// KeyDataTy* = insert ( const KeyTy& ); 38 /// 39 /// Data structure: 40 /// 41 /// Inserted value KeyTy is mapped to 64-bit hash value -> 42 /// 43 /// [------- 64-bit Hash value --------] 44 /// [ StartEntryIndex ][ Bucket Index ] 45 /// | | 46 /// points to the points to 47 /// first probe the bucket. 48 /// position inside 49 /// bucket entries 50 /// 51 /// After initialization, all buckets have an initial size. During insertions, 52 /// buckets might be extended to contain more entries. Each bucket can be 53 /// independently resized and rehashed(no need to lock the whole table). 54 /// Different buckets may have different sizes. If the single bucket is full 55 /// then the bucket is resized. 56 /// 57 /// BucketsArray keeps all buckets. Each bucket keeps an array of Entries 58 /// (pointers to KeyDataTy) and another array of entries hashes: 59 /// 60 /// BucketsArray[BucketIdx].Hashes[EntryIdx]: 61 /// BucketsArray[BucketIdx].Entries[EntryIdx]: 62 /// 63 /// [Bucket 0].Hashes -> [uint32_t][uint32_t] 64 /// [Bucket 0].Entries -> [KeyDataTy*][KeyDataTy*] 65 /// 66 /// [Bucket 1].Hashes -> [uint32_t][uint32_t][uint32_t][uint32_t] 67 /// [Bucket 1].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*][KeyDataTy*] 68 /// ......................... 69 /// [Bucket N].Hashes -> [uint32_t][uint32_t][uint32_t] 70 /// [Bucket N].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*] 71 /// 72 /// ConcurrentHashTableByPtr uses an external thread-safe allocator to allocate 73 /// KeyDataTy items. 74 75 template <typename KeyTy, typename KeyDataTy, typename AllocatorTy> 76 class ConcurrentHashTableInfoByPtr { 77 public: 78 /// \returns Hash value for the specified \p Key. 79 static inline uint64_t getHashValue(const KeyTy &Key) { 80 return xxh3_64bits(Key); 81 } 82 83 /// \returns true if both \p LHS and \p RHS are equal. 84 static inline bool isEqual(const KeyTy &LHS, const KeyTy &RHS) { 85 return LHS == RHS; 86 } 87 88 /// \returns key for the specified \p KeyData. 89 static inline const KeyTy &getKey(const KeyDataTy &KeyData) { 90 return KeyData.getKey(); 91 } 92 93 /// \returns newly created object of KeyDataTy type. 94 static inline KeyDataTy *create(const KeyTy &Key, AllocatorTy &Allocator) { 95 return KeyDataTy::create(Key, Allocator); 96 } 97 }; 98 99 template <typename KeyTy, typename KeyDataTy, typename AllocatorTy, 100 typename Info = 101 ConcurrentHashTableInfoByPtr<KeyTy, KeyDataTy, AllocatorTy>> 102 class ConcurrentHashTableByPtr { 103 public: 104 ConcurrentHashTableByPtr( 105 AllocatorTy &Allocator, uint64_t EstimatedSize = 100000, 106 size_t ThreadsNum = parallel::strategy.compute_thread_count(), 107 size_t InitialNumberOfBuckets = 128) 108 : MultiThreadAllocator(Allocator) { 109 assert((ThreadsNum > 0) && "ThreadsNum must be greater than 0"); 110 assert((InitialNumberOfBuckets > 0) && 111 "InitialNumberOfBuckets must be greater than 0"); 112 113 // Calculate number of buckets. 114 uint64_t EstimatedNumberOfBuckets = ThreadsNum; 115 if (ThreadsNum > 1) { 116 EstimatedNumberOfBuckets *= InitialNumberOfBuckets; 117 EstimatedNumberOfBuckets *= std::max( 118 1, 119 countr_zero(PowerOf2Ceil(EstimatedSize / InitialNumberOfBuckets)) >> 120 2); 121 } 122 EstimatedNumberOfBuckets = PowerOf2Ceil(EstimatedNumberOfBuckets); 123 NumberOfBuckets = 124 std::min(EstimatedNumberOfBuckets, (uint64_t)(1Ull << 31)); 125 126 // Allocate buckets. 127 BucketsArray = std::make_unique<Bucket[]>(NumberOfBuckets); 128 129 InitialBucketSize = EstimatedSize / NumberOfBuckets; 130 InitialBucketSize = std::max((uint32_t)1, InitialBucketSize); 131 InitialBucketSize = PowerOf2Ceil(InitialBucketSize); 132 133 // Initialize each bucket. 134 for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) { 135 HashesPtr Hashes = new ExtHashBitsTy[InitialBucketSize]; 136 memset(Hashes, 0, sizeof(ExtHashBitsTy) * InitialBucketSize); 137 138 DataPtr Entries = new EntryDataTy[InitialBucketSize]; 139 memset(Entries, 0, sizeof(EntryDataTy) * InitialBucketSize); 140 141 BucketsArray[Idx].Size = InitialBucketSize; 142 BucketsArray[Idx].Hashes = Hashes; 143 BucketsArray[Idx].Entries = Entries; 144 } 145 146 // Calculate masks. 147 HashMask = NumberOfBuckets - 1; 148 149 size_t LeadingZerosNumber = countl_zero(HashMask); 150 HashBitsNum = 64 - LeadingZerosNumber; 151 152 // We keep only high 32-bits of hash value. So bucket size cannot 153 // exceed 2^31. Bucket size is always power of two. 154 MaxBucketSize = 1Ull << (std::min((size_t)31, LeadingZerosNumber)); 155 156 // Calculate mask for extended hash bits. 157 ExtHashMask = (NumberOfBuckets * MaxBucketSize) - 1; 158 } 159 160 virtual ~ConcurrentHashTableByPtr() { 161 // Deallocate buckets. 162 for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) { 163 delete[] BucketsArray[Idx].Hashes; 164 delete[] BucketsArray[Idx].Entries; 165 } 166 } 167 168 /// Insert new value \p NewValue or return already existing entry. 169 /// 170 /// \returns entry and "true" if an entry is just inserted or 171 /// "false" if an entry already exists. 172 std::pair<KeyDataTy *, bool> insert(const KeyTy &NewValue) { 173 // Calculate bucket index. 174 uint64_t Hash = Info::getHashValue(NewValue); 175 Bucket &CurBucket = BucketsArray[getBucketIdx(Hash)]; 176 uint32_t ExtHashBits = getExtHashBits(Hash); 177 178 #if LLVM_ENABLE_THREADS 179 // Lock bucket. 180 CurBucket.Guard.lock(); 181 #endif 182 183 HashesPtr BucketHashes = CurBucket.Hashes; 184 DataPtr BucketEntries = CurBucket.Entries; 185 uint32_t CurEntryIdx = getStartIdx(ExtHashBits, CurBucket.Size); 186 187 while (true) { 188 uint32_t CurEntryHashBits = BucketHashes[CurEntryIdx]; 189 190 if (CurEntryHashBits == 0 && BucketEntries[CurEntryIdx] == nullptr) { 191 // Found empty slot. Insert data. 192 KeyDataTy *NewData = Info::create(NewValue, MultiThreadAllocator); 193 BucketEntries[CurEntryIdx] = NewData; 194 BucketHashes[CurEntryIdx] = ExtHashBits; 195 196 CurBucket.NumberOfEntries++; 197 RehashBucket(CurBucket); 198 199 #if LLVM_ENABLE_THREADS 200 CurBucket.Guard.unlock(); 201 #endif 202 203 return {NewData, true}; 204 } 205 206 if (CurEntryHashBits == ExtHashBits) { 207 // Hash matched. Check value for equality. 208 KeyDataTy *EntryData = BucketEntries[CurEntryIdx]; 209 if (Info::isEqual(Info::getKey(*EntryData), NewValue)) { 210 // Already existed entry matched with inserted data is found. 211 #if LLVM_ENABLE_THREADS 212 CurBucket.Guard.unlock(); 213 #endif 214 215 return {EntryData, false}; 216 } 217 } 218 219 CurEntryIdx++; 220 CurEntryIdx &= (CurBucket.Size - 1); 221 } 222 223 llvm_unreachable("Insertion error."); 224 return {}; 225 } 226 227 /// Print information about current state of hash table structures. 228 void printStatistic(raw_ostream &OS) { 229 OS << "\n--- HashTable statistic:\n"; 230 OS << "\nNumber of buckets = " << NumberOfBuckets; 231 OS << "\nInitial bucket size = " << InitialBucketSize; 232 233 uint64_t NumberOfNonEmptyBuckets = 0; 234 uint64_t NumberOfEntriesPlusEmpty = 0; 235 uint64_t OverallNumberOfEntries = 0; 236 uint64_t OverallSize = sizeof(*this) + NumberOfBuckets * sizeof(Bucket); 237 238 DenseMap<uint32_t, uint32_t> BucketSizesMap; 239 240 // For each bucket... 241 for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) { 242 Bucket &CurBucket = BucketsArray[Idx]; 243 244 BucketSizesMap[CurBucket.Size]++; 245 246 if (CurBucket.NumberOfEntries != 0) 247 NumberOfNonEmptyBuckets++; 248 NumberOfEntriesPlusEmpty += CurBucket.Size; 249 OverallNumberOfEntries += CurBucket.NumberOfEntries; 250 OverallSize += 251 (sizeof(ExtHashBitsTy) + sizeof(EntryDataTy)) * CurBucket.Size; 252 } 253 254 OS << "\nOverall number of entries = " << OverallNumberOfEntries; 255 OS << "\nOverall number of non empty buckets = " << NumberOfNonEmptyBuckets; 256 for (auto &BucketSize : BucketSizesMap) 257 OS << "\n Number of buckets with size " << BucketSize.first << ": " 258 << BucketSize.second; 259 260 std::stringstream stream; 261 stream << std::fixed << std::setprecision(2) 262 << ((float)OverallNumberOfEntries / (float)NumberOfEntriesPlusEmpty); 263 std::string str = stream.str(); 264 265 OS << "\nLoad factor = " << str; 266 OS << "\nOverall allocated size = " << OverallSize; 267 } 268 269 protected: 270 using ExtHashBitsTy = uint32_t; 271 using EntryDataTy = KeyDataTy *; 272 273 using HashesPtr = ExtHashBitsTy *; 274 using DataPtr = EntryDataTy *; 275 276 // Bucket structure. Keeps bucket data. 277 struct Bucket { 278 Bucket() = default; 279 280 // Size of bucket. 281 uint32_t Size = 0; 282 283 // Number of non-null entries. 284 uint32_t NumberOfEntries = 0; 285 286 // Hashes for [Size] entries. 287 HashesPtr Hashes = nullptr; 288 289 // [Size] entries. 290 DataPtr Entries = nullptr; 291 292 #if LLVM_ENABLE_THREADS 293 // Mutex for this bucket. 294 std::mutex Guard; 295 #endif 296 }; 297 298 // Reallocate and rehash bucket if this is full enough. 299 void RehashBucket(Bucket &CurBucket) { 300 assert((CurBucket.Size > 0) && "Uninitialised bucket"); 301 if (CurBucket.NumberOfEntries < CurBucket.Size * 0.9) 302 return; 303 304 if (CurBucket.Size >= MaxBucketSize) 305 report_fatal_error("ConcurrentHashTable is full"); 306 307 uint32_t NewBucketSize = CurBucket.Size << 1; 308 assert((NewBucketSize <= MaxBucketSize) && "New bucket size is too big"); 309 assert((CurBucket.Size < NewBucketSize) && 310 "New bucket size less than size of current bucket"); 311 312 // Store old entries & hashes arrays. 313 HashesPtr SrcHashes = CurBucket.Hashes; 314 DataPtr SrcEntries = CurBucket.Entries; 315 316 // Allocate new entries&hashes arrays. 317 HashesPtr DestHashes = new ExtHashBitsTy[NewBucketSize]; 318 memset(DestHashes, 0, sizeof(ExtHashBitsTy) * NewBucketSize); 319 320 DataPtr DestEntries = new EntryDataTy[NewBucketSize]; 321 memset(DestEntries, 0, sizeof(EntryDataTy) * NewBucketSize); 322 323 // For each entry in source arrays... 324 for (uint32_t CurSrcEntryIdx = 0; CurSrcEntryIdx < CurBucket.Size; 325 CurSrcEntryIdx++) { 326 uint32_t CurSrcEntryHashBits = SrcHashes[CurSrcEntryIdx]; 327 328 // Check for null entry. 329 if (CurSrcEntryHashBits == 0 && SrcEntries[CurSrcEntryIdx] == nullptr) 330 continue; 331 332 uint32_t StartDestIdx = getStartIdx(CurSrcEntryHashBits, NewBucketSize); 333 334 // Insert non-null entry into the new arrays. 335 while (true) { 336 uint32_t CurDestEntryHashBits = DestHashes[StartDestIdx]; 337 338 if (CurDestEntryHashBits == 0 && DestEntries[StartDestIdx] == nullptr) { 339 // Found empty slot. Insert data. 340 DestHashes[StartDestIdx] = CurSrcEntryHashBits; 341 DestEntries[StartDestIdx] = SrcEntries[CurSrcEntryIdx]; 342 break; 343 } 344 345 StartDestIdx++; 346 StartDestIdx = StartDestIdx & (NewBucketSize - 1); 347 } 348 } 349 350 // Update bucket fields. 351 CurBucket.Hashes = DestHashes; 352 CurBucket.Entries = DestEntries; 353 CurBucket.Size = NewBucketSize; 354 355 // Delete old bucket entries. 356 if (SrcHashes != nullptr) 357 delete[] SrcHashes; 358 if (SrcEntries != nullptr) 359 delete[] SrcEntries; 360 } 361 362 uint32_t getBucketIdx(hash_code Hash) { return Hash & HashMask; } 363 364 uint32_t getExtHashBits(uint64_t Hash) { 365 return (Hash & ExtHashMask) >> HashBitsNum; 366 } 367 368 uint32_t getStartIdx(uint32_t ExtHashBits, uint32_t BucketSize) { 369 assert((BucketSize > 0) && "Empty bucket"); 370 371 return ExtHashBits & (BucketSize - 1); 372 } 373 374 // Number of bits in hash mask. 375 uint64_t HashBitsNum = 0; 376 377 // Hash mask. 378 uint64_t HashMask = 0; 379 380 // Hash mask for the extended hash bits. 381 uint64_t ExtHashMask = 0; 382 383 // The maximal bucket size. 384 uint32_t MaxBucketSize = 0; 385 386 // Initial size of bucket. 387 uint32_t InitialBucketSize = 0; 388 389 // The number of buckets. 390 uint32_t NumberOfBuckets = 0; 391 392 // Array of buckets. 393 std::unique_ptr<Bucket[]> BucketsArray; 394 395 // Used for allocating KeyDataTy values. 396 AllocatorTy &MultiThreadAllocator; 397 }; 398 399 } // end namespace llvm 400 401 #endif // LLVM_ADT_CONCURRENTHASHTABLE_H 402