1 //===- ConcurrentHashtable.h ------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_ADT_CONCURRENTHASHTABLE_H 10 #define LLVM_ADT_CONCURRENTHASHTABLE_H 11 12 #include "llvm/ADT/DenseMap.h" 13 #include "llvm/ADT/Hashing.h" 14 #include "llvm/ADT/STLExtras.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include "llvm/Support/Allocator.h" 17 #include "llvm/Support/Debug.h" 18 #include "llvm/Support/Parallel.h" 19 #include "llvm/Support/WithColor.h" 20 #include "llvm/Support/xxhash.h" 21 #include <atomic> 22 #include <cstddef> 23 #include <iomanip> 24 #include <mutex> 25 #include <sstream> 26 #include <type_traits> 27 28 namespace llvm { 29 30 /// ConcurrentHashTable - is a resizeable concurrent hashtable. 31 /// The number of resizings limited up to x2^31. This hashtable is 32 /// useful to have efficient access to aggregate data(like strings, 33 /// type descriptors...) and to keep only single copy of such 34 /// an aggregate. The hashtable allows only concurrent insertions: 35 /// 36 /// KeyDataTy* = insert ( const KeyTy& ); 37 /// 38 /// Data structure: 39 /// 40 /// Inserted value KeyTy is mapped to 64-bit hash value -> 41 /// 42 /// [------- 64-bit Hash value --------] 43 /// [ StartEntryIndex ][ Bucket Index ] 44 /// | | 45 /// points to the points to 46 /// first probe the bucket. 47 /// position inside 48 /// bucket entries 49 /// 50 /// After initialization, all buckets have an initial size. During insertions, 51 /// buckets might be extended to contain more entries. Each bucket can be 52 /// independently resized and rehashed(no need to lock the whole table). 53 /// Different buckets may have different sizes. If the single bucket is full 54 /// then the bucket is resized. 55 /// 56 /// BucketsArray keeps all buckets. Each bucket keeps an array of Entries 57 /// (pointers to KeyDataTy) and another array of entries hashes: 58 /// 59 /// BucketsArray[BucketIdx].Hashes[EntryIdx]: 60 /// BucketsArray[BucketIdx].Entries[EntryIdx]: 61 /// 62 /// [Bucket 0].Hashes -> [uint32_t][uint32_t] 63 /// [Bucket 0].Entries -> [KeyDataTy*][KeyDataTy*] 64 /// 65 /// [Bucket 1].Hashes -> [uint32_t][uint32_t][uint32_t][uint32_t] 66 /// [Bucket 1].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*][KeyDataTy*] 67 /// ......................... 68 /// [Bucket N].Hashes -> [uint32_t][uint32_t][uint32_t] 69 /// [Bucket N].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*] 70 /// 71 /// ConcurrentHashTableByPtr uses an external thread-safe allocator to allocate 72 /// KeyDataTy items. 73 74 template <typename KeyTy, typename KeyDataTy, typename AllocatorTy> 75 class ConcurrentHashTableInfoByPtr { 76 public: 77 /// \returns Hash value for the specified \p Key. getHashValue(const KeyTy & Key)78 static inline uint64_t getHashValue(const KeyTy &Key) { 79 return xxh3_64bits(Key); 80 } 81 82 /// \returns true if both \p LHS and \p RHS are equal. isEqual(const KeyTy & LHS,const KeyTy & RHS)83 static inline bool isEqual(const KeyTy &LHS, const KeyTy &RHS) { 84 return LHS == RHS; 85 } 86 87 /// \returns key for the specified \p KeyData. getKey(const KeyDataTy & KeyData)88 static inline const KeyTy &getKey(const KeyDataTy &KeyData) { 89 return KeyData.getKey(); 90 } 91 92 /// \returns newly created object of KeyDataTy type. create(const KeyTy & Key,AllocatorTy & Allocator)93 static inline KeyDataTy *create(const KeyTy &Key, AllocatorTy &Allocator) { 94 return KeyDataTy::create(Key, Allocator); 95 } 96 }; 97 98 template <typename KeyTy, typename KeyDataTy, typename AllocatorTy, 99 typename Info = 100 ConcurrentHashTableInfoByPtr<KeyTy, KeyDataTy, AllocatorTy>> 101 class ConcurrentHashTableByPtr { 102 public: 103 ConcurrentHashTableByPtr( 104 AllocatorTy &Allocator, uint64_t EstimatedSize = 100000, 105 size_t ThreadsNum = parallel::strategy.compute_thread_count(), 106 size_t InitialNumberOfBuckets = 128) MultiThreadAllocator(Allocator)107 : MultiThreadAllocator(Allocator) { 108 assert((ThreadsNum > 0) && "ThreadsNum must be greater than 0"); 109 assert((InitialNumberOfBuckets > 0) && 110 "InitialNumberOfBuckets must be greater than 0"); 111 112 // Calculate number of buckets. 113 uint64_t EstimatedNumberOfBuckets = ThreadsNum; 114 if (ThreadsNum > 1) { 115 EstimatedNumberOfBuckets *= InitialNumberOfBuckets; 116 EstimatedNumberOfBuckets *= std::max( 117 1, 118 countr_zero(PowerOf2Ceil(EstimatedSize / InitialNumberOfBuckets)) >> 119 2); 120 } 121 EstimatedNumberOfBuckets = PowerOf2Ceil(EstimatedNumberOfBuckets); 122 NumberOfBuckets = 123 std::min(EstimatedNumberOfBuckets, (uint64_t)(1Ull << 31)); 124 125 // Allocate buckets. 126 BucketsArray = std::make_unique<Bucket[]>(NumberOfBuckets); 127 128 InitialBucketSize = EstimatedSize / NumberOfBuckets; 129 InitialBucketSize = std::max((uint32_t)1, InitialBucketSize); 130 InitialBucketSize = PowerOf2Ceil(InitialBucketSize); 131 132 // Initialize each bucket. 133 for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) { 134 HashesPtr Hashes = new ExtHashBitsTy[InitialBucketSize]; 135 memset(Hashes, 0, sizeof(ExtHashBitsTy) * InitialBucketSize); 136 137 DataPtr Entries = new EntryDataTy[InitialBucketSize]; 138 memset(Entries, 0, sizeof(EntryDataTy) * InitialBucketSize); 139 140 BucketsArray[Idx].Size = InitialBucketSize; 141 BucketsArray[Idx].Hashes = Hashes; 142 BucketsArray[Idx].Entries = Entries; 143 } 144 145 // Calculate masks. 146 HashMask = NumberOfBuckets - 1; 147 148 size_t LeadingZerosNumber = countl_zero(HashMask); 149 HashBitsNum = 64 - LeadingZerosNumber; 150 151 // We keep only high 32-bits of hash value. So bucket size cannot 152 // exceed 2^31. Bucket size is always power of two. 153 MaxBucketSize = 1Ull << (std::min((size_t)31, LeadingZerosNumber)); 154 155 // Calculate mask for extended hash bits. 156 ExtHashMask = (uint64_t)NumberOfBuckets * MaxBucketSize - 1; 157 } 158 ~ConcurrentHashTableByPtr()159 virtual ~ConcurrentHashTableByPtr() { 160 // Deallocate buckets. 161 for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) { 162 delete[] BucketsArray[Idx].Hashes; 163 delete[] BucketsArray[Idx].Entries; 164 } 165 } 166 167 /// Insert new value \p NewValue or return already existing entry. 168 /// 169 /// \returns entry and "true" if an entry is just inserted or 170 /// "false" if an entry already exists. insert(const KeyTy & NewValue)171 std::pair<KeyDataTy *, bool> insert(const KeyTy &NewValue) { 172 // Calculate bucket index. 173 uint64_t Hash = Info::getHashValue(NewValue); 174 Bucket &CurBucket = BucketsArray[getBucketIdx(Hash)]; 175 uint32_t ExtHashBits = getExtHashBits(Hash); 176 177 #if LLVM_ENABLE_THREADS 178 // Lock bucket. 179 CurBucket.Guard.lock(); 180 #endif 181 182 HashesPtr BucketHashes = CurBucket.Hashes; 183 DataPtr BucketEntries = CurBucket.Entries; 184 uint32_t CurEntryIdx = getStartIdx(ExtHashBits, CurBucket.Size); 185 186 while (true) { 187 uint32_t CurEntryHashBits = BucketHashes[CurEntryIdx]; 188 189 if (CurEntryHashBits == 0 && BucketEntries[CurEntryIdx] == nullptr) { 190 // Found empty slot. Insert data. 191 KeyDataTy *NewData = Info::create(NewValue, MultiThreadAllocator); 192 BucketEntries[CurEntryIdx] = NewData; 193 BucketHashes[CurEntryIdx] = ExtHashBits; 194 195 CurBucket.NumberOfEntries++; 196 RehashBucket(CurBucket); 197 198 #if LLVM_ENABLE_THREADS 199 CurBucket.Guard.unlock(); 200 #endif 201 202 return {NewData, true}; 203 } 204 205 if (CurEntryHashBits == ExtHashBits) { 206 // Hash matched. Check value for equality. 207 KeyDataTy *EntryData = BucketEntries[CurEntryIdx]; 208 if (Info::isEqual(Info::getKey(*EntryData), NewValue)) { 209 // Already existed entry matched with inserted data is found. 210 #if LLVM_ENABLE_THREADS 211 CurBucket.Guard.unlock(); 212 #endif 213 214 return {EntryData, false}; 215 } 216 } 217 218 CurEntryIdx++; 219 CurEntryIdx &= (CurBucket.Size - 1); 220 } 221 222 llvm_unreachable("Insertion error."); 223 return {}; 224 } 225 226 /// Print information about current state of hash table structures. printStatistic(raw_ostream & OS)227 void printStatistic(raw_ostream &OS) { 228 OS << "\n--- HashTable statistic:\n"; 229 OS << "\nNumber of buckets = " << NumberOfBuckets; 230 OS << "\nInitial bucket size = " << InitialBucketSize; 231 232 uint64_t NumberOfNonEmptyBuckets = 0; 233 uint64_t NumberOfEntriesPlusEmpty = 0; 234 uint64_t OverallNumberOfEntries = 0; 235 uint64_t OverallSize = sizeof(*this) + NumberOfBuckets * sizeof(Bucket); 236 237 DenseMap<uint32_t, uint32_t> BucketSizesMap; 238 239 // For each bucket... 240 for (uint32_t Idx = 0; Idx < NumberOfBuckets; Idx++) { 241 Bucket &CurBucket = BucketsArray[Idx]; 242 243 BucketSizesMap[CurBucket.Size]++; 244 245 if (CurBucket.NumberOfEntries != 0) 246 NumberOfNonEmptyBuckets++; 247 NumberOfEntriesPlusEmpty += CurBucket.Size; 248 OverallNumberOfEntries += CurBucket.NumberOfEntries; 249 OverallSize += 250 (sizeof(ExtHashBitsTy) + sizeof(EntryDataTy)) * CurBucket.Size; 251 } 252 253 OS << "\nOverall number of entries = " << OverallNumberOfEntries; 254 OS << "\nOverall number of non empty buckets = " << NumberOfNonEmptyBuckets; 255 for (auto &BucketSize : BucketSizesMap) 256 OS << "\n Number of buckets with size " << BucketSize.first << ": " 257 << BucketSize.second; 258 259 std::stringstream stream; 260 stream << std::fixed << std::setprecision(2) 261 << ((float)OverallNumberOfEntries / (float)NumberOfEntriesPlusEmpty); 262 std::string str = stream.str(); 263 264 OS << "\nLoad factor = " << str; 265 OS << "\nOverall allocated size = " << OverallSize; 266 } 267 268 protected: 269 using ExtHashBitsTy = uint32_t; 270 using EntryDataTy = KeyDataTy *; 271 272 using HashesPtr = ExtHashBitsTy *; 273 using DataPtr = EntryDataTy *; 274 275 // Bucket structure. Keeps bucket data. 276 struct Bucket { 277 Bucket() = default; 278 279 // Size of bucket. 280 uint32_t Size = 0; 281 282 // Number of non-null entries. 283 uint32_t NumberOfEntries = 0; 284 285 // Hashes for [Size] entries. 286 HashesPtr Hashes = nullptr; 287 288 // [Size] entries. 289 DataPtr Entries = nullptr; 290 291 #if LLVM_ENABLE_THREADS 292 // Mutex for this bucket. 293 std::mutex Guard; 294 #endif 295 }; 296 297 // Reallocate and rehash bucket if this is full enough. RehashBucket(Bucket & CurBucket)298 void RehashBucket(Bucket &CurBucket) { 299 assert((CurBucket.Size > 0) && "Uninitialised bucket"); 300 if (CurBucket.NumberOfEntries < CurBucket.Size * 0.9) 301 return; 302 303 if (CurBucket.Size >= MaxBucketSize) 304 report_fatal_error("ConcurrentHashTable is full"); 305 306 uint32_t NewBucketSize = CurBucket.Size << 1; 307 assert((NewBucketSize <= MaxBucketSize) && "New bucket size is too big"); 308 assert((CurBucket.Size < NewBucketSize) && 309 "New bucket size less than size of current bucket"); 310 311 // Store old entries & hashes arrays. 312 HashesPtr SrcHashes = CurBucket.Hashes; 313 DataPtr SrcEntries = CurBucket.Entries; 314 315 // Allocate new entries&hashes arrays. 316 HashesPtr DestHashes = new ExtHashBitsTy[NewBucketSize]; 317 memset(DestHashes, 0, sizeof(ExtHashBitsTy) * NewBucketSize); 318 319 DataPtr DestEntries = new EntryDataTy[NewBucketSize]; 320 memset(DestEntries, 0, sizeof(EntryDataTy) * NewBucketSize); 321 322 // For each entry in source arrays... 323 for (uint32_t CurSrcEntryIdx = 0; CurSrcEntryIdx < CurBucket.Size; 324 CurSrcEntryIdx++) { 325 uint32_t CurSrcEntryHashBits = SrcHashes[CurSrcEntryIdx]; 326 327 // Check for null entry. 328 if (CurSrcEntryHashBits == 0 && SrcEntries[CurSrcEntryIdx] == nullptr) 329 continue; 330 331 uint32_t StartDestIdx = getStartIdx(CurSrcEntryHashBits, NewBucketSize); 332 333 // Insert non-null entry into the new arrays. 334 while (true) { 335 uint32_t CurDestEntryHashBits = DestHashes[StartDestIdx]; 336 337 if (CurDestEntryHashBits == 0 && DestEntries[StartDestIdx] == nullptr) { 338 // Found empty slot. Insert data. 339 DestHashes[StartDestIdx] = CurSrcEntryHashBits; 340 DestEntries[StartDestIdx] = SrcEntries[CurSrcEntryIdx]; 341 break; 342 } 343 344 StartDestIdx++; 345 StartDestIdx = StartDestIdx & (NewBucketSize - 1); 346 } 347 } 348 349 // Update bucket fields. 350 CurBucket.Hashes = DestHashes; 351 CurBucket.Entries = DestEntries; 352 CurBucket.Size = NewBucketSize; 353 354 // Delete old bucket entries. 355 if (SrcHashes != nullptr) 356 delete[] SrcHashes; 357 if (SrcEntries != nullptr) 358 delete[] SrcEntries; 359 } 360 getBucketIdx(hash_code Hash)361 uint32_t getBucketIdx(hash_code Hash) { return Hash & HashMask; } 362 getExtHashBits(uint64_t Hash)363 uint32_t getExtHashBits(uint64_t Hash) { 364 return (Hash & ExtHashMask) >> HashBitsNum; 365 } 366 getStartIdx(uint32_t ExtHashBits,uint32_t BucketSize)367 uint32_t getStartIdx(uint32_t ExtHashBits, uint32_t BucketSize) { 368 assert((BucketSize > 0) && "Empty bucket"); 369 370 return ExtHashBits & (BucketSize - 1); 371 } 372 373 // Number of bits in hash mask. 374 uint64_t HashBitsNum = 0; 375 376 // Hash mask. 377 uint64_t HashMask = 0; 378 379 // Hash mask for the extended hash bits. 380 uint64_t ExtHashMask = 0; 381 382 // The maximal bucket size. 383 uint32_t MaxBucketSize = 0; 384 385 // Initial size of bucket. 386 uint32_t InitialBucketSize = 0; 387 388 // The number of buckets. 389 uint32_t NumberOfBuckets = 0; 390 391 // Array of buckets. 392 std::unique_ptr<Bucket[]> BucketsArray; 393 394 // Used for allocating KeyDataTy values. 395 AllocatorTy &MultiThreadAllocator; 396 }; 397 398 } // end namespace llvm 399 400 #endif // LLVM_ADT_CONCURRENTHASHTABLE_H 401