1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The data structures defined in this file are based on the reference
10 // implementation which is available at
11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16 #include "llvm/DebugInfo/CodeView/RecordName.h"
17 #include "llvm/DebugInfo/CodeView/RecordSerialization.h"
18 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
21 #include "llvm/DebugInfo/MSF/MSFCommon.h"
22 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24 #include "llvm/DebugInfo/PDB/Native/Hash.h"
25 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
26 #include "llvm/Support/BinaryItemStream.h"
27 #include "llvm/Support/BinaryStreamWriter.h"
28 #include "llvm/Support/Parallel.h"
29 #include "llvm/Support/xxhash.h"
30 #include <algorithm>
31 #include <vector>
32 
33 using namespace llvm;
34 using namespace llvm::msf;
35 using namespace llvm::pdb;
36 using namespace llvm::codeview;
37 
38 // Helper class for building the public and global PDB hash table buckets.
39 struct llvm::pdb::GSIHashStreamBuilder {
40   // Sum of the size of all public or global records.
41   uint32_t RecordByteSize = 0;
42 
43   std::vector<PSHashRecord> HashRecords;
44 
45   // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
46   // reference implementation builds a hash table with IPHR_HASH buckets in it.
47   // The last bucket is used to link together free hash table cells in a linked
48   // list, but it is always empty in the compressed, on-disk format. However,
49   // the bitmap must have a bit for it.
50   std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
51 
52   std::vector<support::ulittle32_t> HashBuckets;
53 
54   uint32_t calculateSerializedLength() const;
55   Error commit(BinaryStreamWriter &Writer);
56 
57   void finalizePublicBuckets();
58   void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
59 
60   // Assign public and global symbol records into hash table buckets.
61   // Modifies the list of records to store the bucket index, but does not
62   // change the order.
63   void finalizeBuckets(uint32_t RecordZeroOffset,
64                        MutableArrayRef<BulkPublic> Globals);
65 };
66 
67 // DenseMapInfo implementation for deduplicating symbol records.
68 struct llvm::pdb::SymbolDenseMapInfo {
69   static inline CVSymbol getEmptyKey() {
70     static CVSymbol Empty;
71     return Empty;
72   }
73   static inline CVSymbol getTombstoneKey() {
74     static CVSymbol Tombstone(
75         DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
76     return Tombstone;
77   }
78   static unsigned getHashValue(const CVSymbol &Val) {
79     return xxHash64(Val.RecordData);
80   }
81   static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
82     return LHS.RecordData == RHS.RecordData;
83   }
84 };
85 
86 namespace {
87 LLVM_PACKED_START
88 struct PublicSym32Layout {
89   RecordPrefix Prefix;
90   PublicSym32Header Pub;
91   // char Name[];
92 };
93 LLVM_PACKED_END
94 } // namespace
95 
96 // Calculate how much memory this public needs when serialized.
97 static uint32_t sizeOfPublic(const BulkPublic &Pub) {
98   uint32_t NameLen = Pub.NameLen;
99   NameLen = std::min(NameLen,
100                      uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
101   return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
102 }
103 
104 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
105   // Assume the caller has allocated sizeOfPublic bytes.
106   uint32_t NameLen = std::min(
107       Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
108   size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
109   assert(Size == sizeOfPublic(Pub));
110   auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
111   FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
112   FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
113   FixedMem->Pub.Flags = Pub.Flags;
114   FixedMem->Pub.Offset = Pub.Offset;
115   FixedMem->Pub.Segment = Pub.Segment;
116   char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
117   memcpy(NameMem, Pub.Name, NameLen);
118   // Zero the null terminator and remaining bytes.
119   memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
120   return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
121 }
122 
123 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
124   uint32_t Size = sizeof(GSIHashHeader);
125   Size += HashRecords.size() * sizeof(PSHashRecord);
126   Size += HashBitmap.size() * sizeof(uint32_t);
127   Size += HashBuckets.size() * sizeof(uint32_t);
128   return Size;
129 }
130 
131 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
132   GSIHashHeader Header;
133   Header.VerSignature = GSIHashHeader::HdrSignature;
134   Header.VerHdr = GSIHashHeader::HdrVersion;
135   Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
136   Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
137 
138   if (auto EC = Writer.writeObject(Header))
139     return EC;
140 
141   if (auto EC = Writer.writeArray(makeArrayRef(HashRecords)))
142     return EC;
143   if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap)))
144     return EC;
145   if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets)))
146     return EC;
147   return Error::success();
148 }
149 
150 static bool isAsciiString(StringRef S) {
151   return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
152 }
153 
154 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
155 static int gsiRecordCmp(StringRef S1, StringRef S2) {
156   size_t LS = S1.size();
157   size_t RS = S2.size();
158   // Shorter strings always compare less than longer strings.
159   if (LS != RS)
160     return (LS > RS) - (LS < RS);
161 
162   // If either string contains non ascii characters, memcmp them.
163   if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
164     return memcmp(S1.data(), S2.data(), LS);
165 
166   // Both strings are ascii, perform a case-insensitive comparison.
167   return S1.compare_insensitive(S2.data());
168 }
169 
170 void GSIStreamBuilder::finalizePublicBuckets() {
171   PSH->finalizeBuckets(0, Publics);
172 }
173 
174 void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
175   // Build up a list of globals to be bucketed. Use the BulkPublic data
176   // structure for this purpose, even though these are global records, not
177   // public records. Most of the same fields are required:
178   // - Name
179   // - NameLen
180   // - SymOffset
181   // - BucketIdx
182   // The dead fields are Offset, Segment, and Flags.
183   std::vector<BulkPublic> Records;
184   Records.resize(Globals.size());
185   uint32_t SymOffset = RecordZeroOffset;
186   for (size_t I = 0, E = Globals.size(); I < E; ++I) {
187     StringRef Name = getSymbolName(Globals[I]);
188     Records[I].Name = Name.data();
189     Records[I].NameLen = Name.size();
190     Records[I].SymOffset = SymOffset;
191     SymOffset += Globals[I].length();
192   }
193 
194   GSH->finalizeBuckets(RecordZeroOffset, Records);
195 }
196 
197 void GSIHashStreamBuilder::finalizeBuckets(
198     uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
199   // Hash every name in parallel.
200   parallelFor(0, Records.size(), [&](size_t I) {
201     Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH);
202   });
203 
204   // Count up the size of each bucket. Then, use an exclusive prefix sum to
205   // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
206   // we can't use it yet.
207   uint32_t BucketStarts[IPHR_HASH] = {0};
208   for (const BulkPublic &P : Records)
209     ++BucketStarts[P.BucketIdx];
210   uint32_t Sum = 0;
211   for (uint32_t &B : BucketStarts) {
212     uint32_t Size = B;
213     B = Sum;
214     Sum += Size;
215   }
216 
217   // Place globals into the hash table in bucket order. When placing a global,
218   // update the bucket start. Every hash table slot should be filled. Always use
219   // a refcount of one for now.
220   HashRecords.resize(Records.size());
221   uint32_t BucketCursors[IPHR_HASH];
222   memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors));
223   for (int I = 0, E = Records.size(); I < E; ++I) {
224     uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
225     HashRecords[HashIdx].Off = I;
226     HashRecords[HashIdx].CRef = 1;
227   }
228 
229   // Within the buckets, sort each bucket by memcmp of the symbol's name.  It's
230   // important that we use the same sorting algorithm as is used by the
231   // reference implementation to ensure that the search for a record within a
232   // bucket can properly early-out when it detects the record won't be found.
233   // The algorithm used here corresponds to the function
234   // caseInsensitiveComparePchPchCchCch in the reference implementation.
235   parallelFor(0, IPHR_HASH, [&](size_t I) {
236     auto B = HashRecords.begin() + BucketStarts[I];
237     auto E = HashRecords.begin() + BucketCursors[I];
238     if (B == E)
239       return;
240     auto BucketCmp = [Records](const PSHashRecord &LHash,
241                                const PSHashRecord &RHash) {
242       const BulkPublic &L = Records[uint32_t(LHash.Off)];
243       const BulkPublic &R = Records[uint32_t(RHash.Off)];
244       assert(L.BucketIdx == R.BucketIdx);
245       int Cmp = gsiRecordCmp(L.getName(), R.getName());
246       if (Cmp != 0)
247         return Cmp < 0;
248       // This comparison is necessary to make the sorting stable in the presence
249       // of two static globals with the same name. The easiest way to observe
250       // this is with S_LDATA32 records.
251       return L.SymOffset < R.SymOffset;
252     };
253     llvm::sort(B, E, BucketCmp);
254 
255     // After we are done sorting, replace the global indices with the stream
256     // offsets of each global. Add one when writing symbol offsets to disk.
257     // See GSI1::fixSymRecs.
258     for (PSHashRecord &HRec : make_range(B, E))
259       HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
260   });
261 
262   // For each non-empty bucket, push the bucket start offset into HashBuckets
263   // and set a bit in the hash bitmap.
264   for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
265     uint32_t Word = 0;
266     for (uint32_t J = 0; J < 32; ++J) {
267       // Skip empty buckets.
268       uint32_t BucketIdx = I * 32 + J;
269       if (BucketIdx >= IPHR_HASH ||
270           BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
271         continue;
272       Word |= (1U << J);
273 
274       // Calculate what the offset of the first hash record in the chain would
275       // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
276       // each record would be 12 bytes. See HROffsetCalc in gsi.h.
277       const int SizeOfHROffsetCalc = 12;
278       ulittle32_t ChainStartOff =
279           ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
280       HashBuckets.push_back(ChainStartOff);
281     }
282     HashBitmap[I] = Word;
283   }
284 }
285 
286 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
287     : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
288       GSH(std::make_unique<GSIHashStreamBuilder>()) {}
289 
290 GSIStreamBuilder::~GSIStreamBuilder() = default;
291 
292 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
293   uint32_t Size = 0;
294   Size += sizeof(PublicsStreamHeader);
295   Size += PSH->calculateSerializedLength();
296   Size += Publics.size() * sizeof(uint32_t); // AddrMap
297   // FIXME: Add thunk map and section offsets for incremental linking.
298 
299   return Size;
300 }
301 
302 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
303   return GSH->calculateSerializedLength();
304 }
305 
306 Error GSIStreamBuilder::finalizeMsfLayout() {
307   // First we write public symbol records, then we write global symbol records.
308   finalizePublicBuckets();
309   finalizeGlobalBuckets(PSH->RecordByteSize);
310 
311   Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
312   if (!Idx)
313     return Idx.takeError();
314   GlobalsStreamIndex = *Idx;
315 
316   Idx = Msf.addStream(calculatePublicsHashStreamSize());
317   if (!Idx)
318     return Idx.takeError();
319   PublicsStreamIndex = *Idx;
320 
321   uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
322 
323   Idx = Msf.addStream(RecordBytes);
324   if (!Idx)
325     return Idx.takeError();
326   RecordStreamIndex = *Idx;
327   return Error::success();
328 }
329 
330 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
331   assert(Publics.empty() && PSH->RecordByteSize == 0 &&
332          "publics can only be added once");
333   Publics = std::move(PublicsIn);
334 
335   // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
336   parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
337     return L.getName() < R.getName();
338   });
339 
340   // Assign offsets and calculate the length of the public symbol records.
341   uint32_t SymOffset = 0;
342   for (BulkPublic &Pub : Publics) {
343     Pub.SymOffset = SymOffset;
344     SymOffset += sizeOfPublic(Pub);
345   }
346 
347   // Remember the length of the public stream records.
348   PSH->RecordByteSize = SymOffset;
349 }
350 
351 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
352   serializeAndAddGlobal(Sym);
353 }
354 
355 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
356   serializeAndAddGlobal(Sym);
357 }
358 
359 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
360   serializeAndAddGlobal(Sym);
361 }
362 
363 template <typename T>
364 void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
365   T Copy(Symbol);
366   addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
367                                                    CodeViewContainer::Pdb));
368 }
369 
370 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
371   // Ignore duplicate typedefs and constants.
372   if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
373     auto Iter = GlobalsSeen.insert(Symbol);
374     if (!Iter.second)
375       return;
376   }
377   GSH->RecordByteSize += Symbol.length();
378   Globals.push_back(Symbol);
379 }
380 
381 // Serialize each public and write it.
382 static Error writePublics(BinaryStreamWriter &Writer,
383                           ArrayRef<BulkPublic> Publics) {
384   std::vector<uint8_t> Storage;
385   for (const BulkPublic &Pub : Publics) {
386     Storage.resize(sizeOfPublic(Pub));
387     serializePublic(Storage.data(), Pub);
388     if (Error E = Writer.writeBytes(Storage))
389       return E;
390   }
391   return Error::success();
392 }
393 
394 static Error writeRecords(BinaryStreamWriter &Writer,
395                           ArrayRef<CVSymbol> Records) {
396   BinaryItemStream<CVSymbol> ItemStream(support::endianness::little);
397   ItemStream.setItems(Records);
398   BinaryStreamRef RecordsRef(ItemStream);
399   return Writer.writeStreamRef(RecordsRef);
400 }
401 
402 Error GSIStreamBuilder::commitSymbolRecordStream(
403     WritableBinaryStreamRef Stream) {
404   BinaryStreamWriter Writer(Stream);
405 
406   // Write public symbol records first, followed by global symbol records.  This
407   // must match the order that we assume in finalizeMsfLayout when computing
408   // PSHZero and GSHZero.
409   if (auto EC = writePublics(Writer, Publics))
410     return EC;
411   if (auto EC = writeRecords(Writer, Globals))
412     return EC;
413 
414   return Error::success();
415 }
416 
417 static std::vector<support::ulittle32_t>
418 computeAddrMap(ArrayRef<BulkPublic> Publics) {
419   // Build a parallel vector of indices into the Publics vector, and sort it by
420   // address.
421   std::vector<ulittle32_t> PubAddrMap;
422   PubAddrMap.reserve(Publics.size());
423   for (int I = 0, E = Publics.size(); I < E; ++I)
424     PubAddrMap.push_back(ulittle32_t(I));
425 
426   auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
427     const BulkPublic &L = Publics[LIdx];
428     const BulkPublic &R = Publics[RIdx];
429     if (L.Segment != R.Segment)
430       return L.Segment < R.Segment;
431     if (L.Offset != R.Offset)
432       return L.Offset < R.Offset;
433     // parallelSort is unstable, so we have to do name comparison to ensure
434     // that two names for the same location come out in a deterministic order.
435     return L.getName() < R.getName();
436   };
437   parallelSort(PubAddrMap, AddrCmp);
438 
439   // Rewrite the public symbol indices into symbol offsets.
440   for (ulittle32_t &Entry : PubAddrMap)
441     Entry = Publics[Entry].SymOffset;
442   return PubAddrMap;
443 }
444 
445 Error GSIStreamBuilder::commitPublicsHashStream(
446     WritableBinaryStreamRef Stream) {
447   BinaryStreamWriter Writer(Stream);
448   PublicsStreamHeader Header;
449 
450   // FIXME: Fill these in. They are for incremental linking.
451   Header.SymHash = PSH->calculateSerializedLength();
452   Header.AddrMap = Publics.size() * 4;
453   Header.NumThunks = 0;
454   Header.SizeOfThunk = 0;
455   Header.ISectThunkTable = 0;
456   memset(Header.Padding, 0, sizeof(Header.Padding));
457   Header.OffThunkTable = 0;
458   Header.NumSections = 0;
459   if (auto EC = Writer.writeObject(Header))
460     return EC;
461 
462   if (auto EC = PSH->commit(Writer))
463     return EC;
464 
465   std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
466   assert(PubAddrMap.size() == Publics.size());
467   if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap)))
468     return EC;
469 
470   return Error::success();
471 }
472 
473 Error GSIStreamBuilder::commitGlobalsHashStream(
474     WritableBinaryStreamRef Stream) {
475   BinaryStreamWriter Writer(Stream);
476   return GSH->commit(Writer);
477 }
478 
479 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
480                                WritableBinaryStreamRef Buffer) {
481   auto GS = WritableMappedBlockStream::createIndexedStream(
482       Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
483   auto PS = WritableMappedBlockStream::createIndexedStream(
484       Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
485   auto PRS = WritableMappedBlockStream::createIndexedStream(
486       Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
487 
488   if (auto EC = commitSymbolRecordStream(*PRS))
489     return EC;
490   if (auto EC = commitGlobalsHashStream(*GS))
491     return EC;
492   if (auto EC = commitPublicsHashStream(*PS))
493     return EC;
494   return Error::success();
495 }
496