1 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// Defines facilities for reading and writing on-disk hash tables. 11 /// 12 //===----------------------------------------------------------------------===// 13 #ifndef LLVM_SUPPORT_ONDISKHASHTABLE_H 14 #define LLVM_SUPPORT_ONDISKHASHTABLE_H 15 16 #include "llvm/Support/Alignment.h" 17 #include "llvm/Support/Allocator.h" 18 #include "llvm/Support/DataTypes.h" 19 #include "llvm/Support/EndianStream.h" 20 #include "llvm/Support/MathExtras.h" 21 #include "llvm/Support/raw_ostream.h" 22 #include <cassert> 23 #include <cstdlib> 24 25 namespace llvm { 26 27 /// Generates an on disk hash table. 28 /// 29 /// This needs an \c Info that handles storing values into the hash table's 30 /// payload and computes the hash for a given key. This should provide the 31 /// following interface: 32 /// 33 /// \code 34 /// class ExampleInfo { 35 /// public: 36 /// typedef ExampleKey key_type; // Must be copy constructible 37 /// typedef ExampleKey &key_type_ref; 38 /// typedef ExampleData data_type; // Must be copy constructible 39 /// typedef ExampleData &data_type_ref; 40 /// typedef uint32_t hash_value_type; // The type the hash function returns. 41 /// typedef uint32_t offset_type; // The type for offsets into the table. 42 /// 43 /// /// Calculate the hash for Key 44 /// static hash_value_type ComputeHash(key_type_ref Key); 45 /// /// Return the lengths, in bytes, of the given Key/Data pair. 46 /// static std::pair<offset_type, offset_type> 47 /// EmitKeyDataLength(raw_ostream &Out, key_type_ref Key, data_type_ref Data); 48 /// /// Write Key to Out. KeyLen is the length from EmitKeyDataLength. 49 /// static void EmitKey(raw_ostream &Out, key_type_ref Key, 50 /// offset_type KeyLen); 51 /// /// Write Data to Out. DataLen is the length from EmitKeyDataLength. 52 /// static void EmitData(raw_ostream &Out, key_type_ref Key, 53 /// data_type_ref Data, offset_type DataLen); 54 /// /// Determine if two keys are equal. Optional, only needed by contains. 55 /// static bool EqualKey(key_type_ref Key1, key_type_ref Key2); 56 /// }; 57 /// \endcode 58 template <typename Info> class OnDiskChainedHashTableGenerator { 59 /// A single item in the hash table. 60 class Item { 61 public: 62 typename Info::key_type Key; 63 typename Info::data_type Data; 64 Item *Next; 65 const typename Info::hash_value_type Hash; 66 67 Item(typename Info::key_type_ref Key, typename Info::data_type_ref Data, 68 Info &InfoObj) 69 : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {} 70 }; 71 72 typedef typename Info::offset_type offset_type; 73 offset_type NumBuckets; 74 offset_type NumEntries; 75 llvm::SpecificBumpPtrAllocator<Item> BA; 76 77 /// A linked list of values in a particular hash bucket. 78 struct Bucket { 79 offset_type Off; 80 unsigned Length; 81 Item *Head; 82 }; 83 84 Bucket *Buckets; 85 86 private: 87 /// Insert an item into the appropriate hash bucket. 88 void insert(Bucket *Buckets, size_t Size, Item *E) { 89 Bucket &B = Buckets[E->Hash & (Size - 1)]; 90 E->Next = B.Head; 91 ++B.Length; 92 B.Head = E; 93 } 94 95 /// Resize the hash table, moving the old entries into the new buckets. 96 void resize(size_t NewSize) { 97 Bucket *NewBuckets = static_cast<Bucket *>( 98 safe_calloc(NewSize, sizeof(Bucket))); 99 // Populate NewBuckets with the old entries. 100 for (size_t I = 0; I < NumBuckets; ++I) 101 for (Item *E = Buckets[I].Head; E;) { 102 Item *N = E->Next; 103 E->Next = nullptr; 104 insert(NewBuckets, NewSize, E); 105 E = N; 106 } 107 108 free(Buckets); 109 NumBuckets = NewSize; 110 Buckets = NewBuckets; 111 } 112 113 public: 114 /// Insert an entry into the table. 115 void insert(typename Info::key_type_ref Key, 116 typename Info::data_type_ref Data) { 117 Info InfoObj; 118 insert(Key, Data, InfoObj); 119 } 120 121 /// Insert an entry into the table. 122 /// 123 /// Uses the provided Info instead of a stack allocated one. 124 void insert(typename Info::key_type_ref Key, 125 typename Info::data_type_ref Data, Info &InfoObj) { 126 ++NumEntries; 127 if (4 * NumEntries >= 3 * NumBuckets) 128 resize(NumBuckets * 2); 129 insert(Buckets, NumBuckets, new (BA.Allocate()) Item(Key, Data, InfoObj)); 130 } 131 132 /// Determine whether an entry has been inserted. 133 bool contains(typename Info::key_type_ref Key, Info &InfoObj) { 134 unsigned Hash = InfoObj.ComputeHash(Key); 135 for (Item *I = Buckets[Hash & (NumBuckets - 1)].Head; I; I = I->Next) 136 if (I->Hash == Hash && InfoObj.EqualKey(I->Key, Key)) 137 return true; 138 return false; 139 } 140 141 /// Emit the table to Out, which must not be at offset 0. 142 offset_type Emit(raw_ostream &Out) { 143 Info InfoObj; 144 return Emit(Out, InfoObj); 145 } 146 147 /// Emit the table to Out, which must not be at offset 0. 148 /// 149 /// Uses the provided Info instead of a stack allocated one. 150 offset_type Emit(raw_ostream &Out, Info &InfoObj) { 151 using namespace llvm::support; 152 endian::Writer LE(Out, little); 153 154 // Now we're done adding entries, resize the bucket list if it's 155 // significantly too large. (This only happens if the number of 156 // entries is small and we're within our initial allocation of 157 // 64 buckets.) We aim for an occupancy ratio in [3/8, 3/4). 158 // 159 // As a special case, if there are two or fewer entries, just 160 // form a single bucket. A linear scan is fine in that case, and 161 // this is very common in C++ class lookup tables. This also 162 // guarantees we produce at least one bucket for an empty table. 163 // 164 // FIXME: Try computing a perfect hash function at this point. 165 unsigned TargetNumBuckets = 166 NumEntries <= 2 ? 1 : llvm::bit_ceil(NumEntries * 4 / 3 + 1); 167 if (TargetNumBuckets != NumBuckets) 168 resize(TargetNumBuckets); 169 170 // Emit the payload of the table. 171 for (offset_type I = 0; I < NumBuckets; ++I) { 172 Bucket &B = Buckets[I]; 173 if (!B.Head) 174 continue; 175 176 // Store the offset for the data of this bucket. 177 B.Off = Out.tell(); 178 assert(B.Off && "Cannot write a bucket at offset 0. Please add padding."); 179 180 // Write out the number of items in the bucket. 181 LE.write<uint16_t>(B.Length); 182 assert(B.Length != 0 && "Bucket has a head but zero length?"); 183 184 // Write out the entries in the bucket. 185 for (Item *I = B.Head; I; I = I->Next) { 186 LE.write<typename Info::hash_value_type>(I->Hash); 187 const std::pair<offset_type, offset_type> &Len = 188 InfoObj.EmitKeyDataLength(Out, I->Key, I->Data); 189 #ifdef NDEBUG 190 InfoObj.EmitKey(Out, I->Key, Len.first); 191 InfoObj.EmitData(Out, I->Key, I->Data, Len.second); 192 #else 193 // In asserts mode, check that the users length matches the data they 194 // wrote. 195 uint64_t KeyStart = Out.tell(); 196 InfoObj.EmitKey(Out, I->Key, Len.first); 197 uint64_t DataStart = Out.tell(); 198 InfoObj.EmitData(Out, I->Key, I->Data, Len.second); 199 uint64_t End = Out.tell(); 200 assert(offset_type(DataStart - KeyStart) == Len.first && 201 "key length does not match bytes written"); 202 assert(offset_type(End - DataStart) == Len.second && 203 "data length does not match bytes written"); 204 #endif 205 } 206 } 207 208 // Pad with zeros so that we can start the hashtable at an aligned address. 209 offset_type TableOff = Out.tell(); 210 uint64_t N = offsetToAlignment(TableOff, Align(alignof(offset_type))); 211 TableOff += N; 212 while (N--) 213 LE.write<uint8_t>(0); 214 215 // Emit the hashtable itself. 216 LE.write<offset_type>(NumBuckets); 217 LE.write<offset_type>(NumEntries); 218 for (offset_type I = 0; I < NumBuckets; ++I) 219 LE.write<offset_type>(Buckets[I].Off); 220 221 return TableOff; 222 } 223 224 OnDiskChainedHashTableGenerator() { 225 NumEntries = 0; 226 NumBuckets = 64; 227 // Note that we do not need to run the constructors of the individual 228 // Bucket objects since 'calloc' returns bytes that are all 0. 229 Buckets = static_cast<Bucket *>(safe_calloc(NumBuckets, sizeof(Bucket))); 230 } 231 232 ~OnDiskChainedHashTableGenerator() { std::free(Buckets); } 233 }; 234 235 /// Provides lookup on an on disk hash table. 236 /// 237 /// This needs an \c Info that handles reading values from the hash table's 238 /// payload and computes the hash for a given key. This should provide the 239 /// following interface: 240 /// 241 /// \code 242 /// class ExampleLookupInfo { 243 /// public: 244 /// typedef ExampleData data_type; 245 /// typedef ExampleInternalKey internal_key_type; // The stored key type. 246 /// typedef ExampleKey external_key_type; // The type to pass to find(). 247 /// typedef uint32_t hash_value_type; // The type the hash function returns. 248 /// typedef uint32_t offset_type; // The type for offsets into the table. 249 /// 250 /// /// Compare two keys for equality. 251 /// static bool EqualKey(internal_key_type &Key1, internal_key_type &Key2); 252 /// /// Calculate the hash for the given key. 253 /// static hash_value_type ComputeHash(internal_key_type &IKey); 254 /// /// Translate from the semantic type of a key in the hash table to the 255 /// /// type that is actually stored and used for hashing and comparisons. 256 /// /// The internal and external types are often the same, in which case this 257 /// /// can simply return the passed in value. 258 /// static const internal_key_type &GetInternalKey(external_key_type &EKey); 259 /// /// Read the key and data length from Buffer, leaving it pointing at the 260 /// /// following byte. 261 /// static std::pair<offset_type, offset_type> 262 /// ReadKeyDataLength(const unsigned char *&Buffer); 263 /// /// Read the key from Buffer, given the KeyLen as reported from 264 /// /// ReadKeyDataLength. 265 /// const internal_key_type &ReadKey(const unsigned char *Buffer, 266 /// offset_type KeyLen); 267 /// /// Read the data for Key from Buffer, given the DataLen as reported from 268 /// /// ReadKeyDataLength. 269 /// data_type ReadData(StringRef Key, const unsigned char *Buffer, 270 /// offset_type DataLen); 271 /// }; 272 /// \endcode 273 template <typename Info> class OnDiskChainedHashTable { 274 const typename Info::offset_type NumBuckets; 275 const typename Info::offset_type NumEntries; 276 const unsigned char *const Buckets; 277 const unsigned char *const Base; 278 Info InfoObj; 279 280 public: 281 typedef Info InfoType; 282 typedef typename Info::internal_key_type internal_key_type; 283 typedef typename Info::external_key_type external_key_type; 284 typedef typename Info::data_type data_type; 285 typedef typename Info::hash_value_type hash_value_type; 286 typedef typename Info::offset_type offset_type; 287 288 OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, 289 const unsigned char *Buckets, 290 const unsigned char *Base, 291 const Info &InfoObj = Info()) 292 : NumBuckets(NumBuckets), NumEntries(NumEntries), Buckets(Buckets), 293 Base(Base), InfoObj(InfoObj) { 294 assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 && 295 "'buckets' must have a 4-byte alignment"); 296 } 297 298 /// Read the number of buckets and the number of entries from a hash table 299 /// produced by OnDiskHashTableGenerator::Emit, and advance the Buckets 300 /// pointer past them. 301 static std::pair<offset_type, offset_type> 302 readNumBucketsAndEntries(const unsigned char *&Buckets) { 303 assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 && 304 "buckets should be 4-byte aligned."); 305 using namespace llvm::support; 306 offset_type NumBuckets = 307 endian::readNext<offset_type, little, aligned>(Buckets); 308 offset_type NumEntries = 309 endian::readNext<offset_type, little, aligned>(Buckets); 310 return std::make_pair(NumBuckets, NumEntries); 311 } 312 313 offset_type getNumBuckets() const { return NumBuckets; } 314 offset_type getNumEntries() const { return NumEntries; } 315 const unsigned char *getBase() const { return Base; } 316 const unsigned char *getBuckets() const { return Buckets; } 317 318 bool isEmpty() const { return NumEntries == 0; } 319 320 class iterator { 321 internal_key_type Key; 322 const unsigned char *const Data; 323 const offset_type Len; 324 Info *InfoObj; 325 326 public: 327 iterator() : Key(), Data(nullptr), Len(0), InfoObj(nullptr) {} 328 iterator(const internal_key_type K, const unsigned char *D, offset_type L, 329 Info *InfoObj) 330 : Key(K), Data(D), Len(L), InfoObj(InfoObj) {} 331 332 data_type operator*() const { return InfoObj->ReadData(Key, Data, Len); } 333 334 const unsigned char *getDataPtr() const { return Data; } 335 offset_type getDataLen() const { return Len; } 336 337 bool operator==(const iterator &X) const { return X.Data == Data; } 338 bool operator!=(const iterator &X) const { return X.Data != Data; } 339 }; 340 341 /// Look up the stored data for a particular key. 342 iterator find(const external_key_type &EKey, Info *InfoPtr = nullptr) { 343 const internal_key_type &IKey = InfoObj.GetInternalKey(EKey); 344 hash_value_type KeyHash = InfoObj.ComputeHash(IKey); 345 return find_hashed(IKey, KeyHash, InfoPtr); 346 } 347 348 /// Look up the stored data for a particular key with a known hash. 349 iterator find_hashed(const internal_key_type &IKey, hash_value_type KeyHash, 350 Info *InfoPtr = nullptr) { 351 using namespace llvm::support; 352 353 if (!InfoPtr) 354 InfoPtr = &InfoObj; 355 356 // Each bucket is just an offset into the hash table file. 357 offset_type Idx = KeyHash & (NumBuckets - 1); 358 const unsigned char *Bucket = Buckets + sizeof(offset_type) * Idx; 359 360 offset_type Offset = endian::readNext<offset_type, little, aligned>(Bucket); 361 if (Offset == 0) 362 return iterator(); // Empty bucket. 363 const unsigned char *Items = Base + Offset; 364 365 // 'Items' starts with a 16-bit unsigned integer representing the 366 // number of items in this bucket. 367 unsigned Len = endian::readNext<uint16_t, little, unaligned>(Items); 368 369 for (unsigned i = 0; i < Len; ++i) { 370 // Read the hash. 371 hash_value_type ItemHash = 372 endian::readNext<hash_value_type, little, unaligned>(Items); 373 374 // Determine the length of the key and the data. 375 const std::pair<offset_type, offset_type> &L = 376 Info::ReadKeyDataLength(Items); 377 offset_type ItemLen = L.first + L.second; 378 379 // Compare the hashes. If they are not the same, skip the entry entirely. 380 if (ItemHash != KeyHash) { 381 Items += ItemLen; 382 continue; 383 } 384 385 // Read the key. 386 const internal_key_type &X = 387 InfoPtr->ReadKey((const unsigned char *const)Items, L.first); 388 389 // If the key doesn't match just skip reading the value. 390 if (!InfoPtr->EqualKey(X, IKey)) { 391 Items += ItemLen; 392 continue; 393 } 394 395 // The key matches! 396 return iterator(X, Items + L.first, L.second, InfoPtr); 397 } 398 399 return iterator(); 400 } 401 402 iterator end() const { return iterator(); } 403 404 Info &getInfoObj() { return InfoObj; } 405 406 /// Create the hash table. 407 /// 408 /// \param Buckets is the beginning of the hash table itself, which follows 409 /// the payload of entire structure. This is the value returned by 410 /// OnDiskHashTableGenerator::Emit. 411 /// 412 /// \param Base is the point from which all offsets into the structure are 413 /// based. This is offset 0 in the stream that was used when Emitting the 414 /// table. 415 static OnDiskChainedHashTable *Create(const unsigned char *Buckets, 416 const unsigned char *const Base, 417 const Info &InfoObj = Info()) { 418 assert(Buckets > Base); 419 auto NumBucketsAndEntries = readNumBucketsAndEntries(Buckets); 420 return new OnDiskChainedHashTable<Info>(NumBucketsAndEntries.first, 421 NumBucketsAndEntries.second, 422 Buckets, Base, InfoObj); 423 } 424 }; 425 426 /// Provides lookup and iteration over an on disk hash table. 427 /// 428 /// \copydetails llvm::OnDiskChainedHashTable 429 template <typename Info> 430 class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { 431 const unsigned char *Payload; 432 433 public: 434 typedef OnDiskChainedHashTable<Info> base_type; 435 typedef typename base_type::internal_key_type internal_key_type; 436 typedef typename base_type::external_key_type external_key_type; 437 typedef typename base_type::data_type data_type; 438 typedef typename base_type::hash_value_type hash_value_type; 439 typedef typename base_type::offset_type offset_type; 440 441 private: 442 /// Iterates over all of the keys in the table. 443 class iterator_base { 444 const unsigned char *Ptr; 445 offset_type NumItemsInBucketLeft; 446 offset_type NumEntriesLeft; 447 448 public: 449 typedef external_key_type value_type; 450 451 iterator_base(const unsigned char *const Ptr, offset_type NumEntries) 452 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries) {} 453 iterator_base() 454 : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0) {} 455 456 friend bool operator==(const iterator_base &X, const iterator_base &Y) { 457 return X.NumEntriesLeft == Y.NumEntriesLeft; 458 } 459 friend bool operator!=(const iterator_base &X, const iterator_base &Y) { 460 return X.NumEntriesLeft != Y.NumEntriesLeft; 461 } 462 463 /// Move to the next item. 464 void advance() { 465 using namespace llvm::support; 466 if (!NumItemsInBucketLeft) { 467 // 'Items' starts with a 16-bit unsigned integer representing the 468 // number of items in this bucket. 469 NumItemsInBucketLeft = 470 endian::readNext<uint16_t, little, unaligned>(Ptr); 471 } 472 Ptr += sizeof(hash_value_type); // Skip the hash. 473 // Determine the length of the key and the data. 474 const std::pair<offset_type, offset_type> &L = 475 Info::ReadKeyDataLength(Ptr); 476 Ptr += L.first + L.second; 477 assert(NumItemsInBucketLeft); 478 --NumItemsInBucketLeft; 479 assert(NumEntriesLeft); 480 --NumEntriesLeft; 481 } 482 483 /// Get the start of the item as written by the trait (after the hash and 484 /// immediately before the key and value length). 485 const unsigned char *getItem() const { 486 return Ptr + (NumItemsInBucketLeft ? 0 : 2) + sizeof(hash_value_type); 487 } 488 }; 489 490 public: 491 OnDiskIterableChainedHashTable(offset_type NumBuckets, offset_type NumEntries, 492 const unsigned char *Buckets, 493 const unsigned char *Payload, 494 const unsigned char *Base, 495 const Info &InfoObj = Info()) 496 : base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj), 497 Payload(Payload) {} 498 499 /// Iterates over all of the keys in the table. 500 class key_iterator : public iterator_base { 501 Info *InfoObj; 502 503 public: 504 typedef external_key_type value_type; 505 506 key_iterator(const unsigned char *const Ptr, offset_type NumEntries, 507 Info *InfoObj) 508 : iterator_base(Ptr, NumEntries), InfoObj(InfoObj) {} 509 key_iterator() : iterator_base(), InfoObj() {} 510 511 key_iterator &operator++() { 512 this->advance(); 513 return *this; 514 } 515 key_iterator operator++(int) { // Postincrement 516 key_iterator tmp = *this; 517 ++*this; 518 return tmp; 519 } 520 521 internal_key_type getInternalKey() const { 522 auto *LocalPtr = this->getItem(); 523 524 // Determine the length of the key and the data. 525 auto L = Info::ReadKeyDataLength(LocalPtr); 526 527 // Read the key. 528 return InfoObj->ReadKey(LocalPtr, L.first); 529 } 530 531 value_type operator*() const { 532 return InfoObj->GetExternalKey(getInternalKey()); 533 } 534 }; 535 536 key_iterator key_begin() { 537 return key_iterator(Payload, this->getNumEntries(), &this->getInfoObj()); 538 } 539 key_iterator key_end() { return key_iterator(); } 540 541 iterator_range<key_iterator> keys() { 542 return make_range(key_begin(), key_end()); 543 } 544 545 /// Iterates over all the entries in the table, returning the data. 546 class data_iterator : public iterator_base { 547 Info *InfoObj; 548 549 public: 550 typedef data_type value_type; 551 552 data_iterator(const unsigned char *const Ptr, offset_type NumEntries, 553 Info *InfoObj) 554 : iterator_base(Ptr, NumEntries), InfoObj(InfoObj) {} 555 data_iterator() : iterator_base(), InfoObj() {} 556 557 data_iterator &operator++() { // Preincrement 558 this->advance(); 559 return *this; 560 } 561 data_iterator operator++(int) { // Postincrement 562 data_iterator tmp = *this; 563 ++*this; 564 return tmp; 565 } 566 567 value_type operator*() const { 568 auto *LocalPtr = this->getItem(); 569 570 // Determine the length of the key and the data. 571 auto L = Info::ReadKeyDataLength(LocalPtr); 572 573 // Read the key. 574 const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first); 575 return InfoObj->ReadData(Key, LocalPtr + L.first, L.second); 576 } 577 }; 578 579 data_iterator data_begin() { 580 return data_iterator(Payload, this->getNumEntries(), &this->getInfoObj()); 581 } 582 data_iterator data_end() { return data_iterator(); } 583 584 iterator_range<data_iterator> data() { 585 return make_range(data_begin(), data_end()); 586 } 587 588 /// Create the hash table. 589 /// 590 /// \param Buckets is the beginning of the hash table itself, which follows 591 /// the payload of entire structure. This is the value returned by 592 /// OnDiskHashTableGenerator::Emit. 593 /// 594 /// \param Payload is the beginning of the data contained in the table. This 595 /// is Base plus any padding or header data that was stored, ie, the offset 596 /// that the stream was at when calling Emit. 597 /// 598 /// \param Base is the point from which all offsets into the structure are 599 /// based. This is offset 0 in the stream that was used when Emitting the 600 /// table. 601 static OnDiskIterableChainedHashTable * 602 Create(const unsigned char *Buckets, const unsigned char *const Payload, 603 const unsigned char *const Base, const Info &InfoObj = Info()) { 604 assert(Buckets > Base); 605 auto NumBucketsAndEntries = 606 OnDiskIterableChainedHashTable<Info>::readNumBucketsAndEntries(Buckets); 607 return new OnDiskIterableChainedHashTable<Info>( 608 NumBucketsAndEntries.first, NumBucketsAndEntries.second, 609 Buckets, Payload, Base, InfoObj); 610 } 611 }; 612 613 } // end namespace llvm 614 615 #endif 616