1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
10 
11 #include "llvm/ADT/DenseSet.h"
12 #include "llvm/DebugInfo/CodeView/RecordName.h"
13 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
14 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
15 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
16 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
17 #include "llvm/DebugInfo/MSF/MSFCommon.h"
18 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
19 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
20 #include "llvm/DebugInfo/PDB/Native/Hash.h"
21 #include "llvm/Support/BinaryItemStream.h"
22 #include "llvm/Support/BinaryStreamWriter.h"
23 #include "llvm/Support/xxhash.h"
24 #include <algorithm>
25 #include <vector>
26 
27 using namespace llvm;
28 using namespace llvm::msf;
29 using namespace llvm::pdb;
30 using namespace llvm::codeview;
31 
32 struct llvm::pdb::GSIHashStreamBuilder {
33   struct SymbolDenseMapInfo {
34     static inline CVSymbol getEmptyKey() {
35       static CVSymbol Empty;
36       return Empty;
37     }
38     static inline CVSymbol getTombstoneKey() {
39       static CVSymbol Tombstone(
40           DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
41       return Tombstone;
42     }
43     static unsigned getHashValue(const CVSymbol &Val) {
44       return xxHash64(Val.RecordData);
45     }
46     static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
47       return LHS.RecordData == RHS.RecordData;
48     }
49   };
50 
51   std::vector<CVSymbol> Records;
52   uint32_t StreamIndex;
53   llvm::DenseSet<CVSymbol, SymbolDenseMapInfo> SymbolHashes;
54   std::vector<PSHashRecord> HashRecords;
55   std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
56   std::vector<support::ulittle32_t> HashBuckets;
57 
58   uint32_t calculateSerializedLength() const;
59   uint32_t calculateRecordByteSize() const;
60   Error commit(BinaryStreamWriter &Writer);
61   void finalizeBuckets(uint32_t RecordZeroOffset);
62 
63   template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) {
64     T Copy(Symbol);
65     addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
66                                                CodeViewContainer::Pdb));
67   }
68   void addSymbol(const CVSymbol &Symbol) {
69     if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
70       auto Iter = SymbolHashes.insert(Symbol);
71       if (!Iter.second)
72         return;
73     }
74 
75     Records.push_back(Symbol);
76   }
77 };
78 
79 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
80   uint32_t Size = sizeof(GSIHashHeader);
81   Size += HashRecords.size() * sizeof(PSHashRecord);
82   Size += HashBitmap.size() * sizeof(uint32_t);
83   Size += HashBuckets.size() * sizeof(uint32_t);
84   return Size;
85 }
86 
87 uint32_t GSIHashStreamBuilder::calculateRecordByteSize() const {
88   uint32_t Size = 0;
89   for (const auto &Sym : Records)
90     Size += Sym.length();
91   return Size;
92 }
93 
94 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
95   GSIHashHeader Header;
96   Header.VerSignature = GSIHashHeader::HdrSignature;
97   Header.VerHdr = GSIHashHeader::HdrVersion;
98   Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
99   Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
100 
101   if (auto EC = Writer.writeObject(Header))
102     return EC;
103 
104   if (auto EC = Writer.writeArray(makeArrayRef(HashRecords)))
105     return EC;
106   if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap)))
107     return EC;
108   if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets)))
109     return EC;
110   return Error::success();
111 }
112 
113 static bool isAsciiString(StringRef S) {
114   return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
115 }
116 
117 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
118 static bool gsiRecordLess(StringRef S1, StringRef S2) {
119   size_t LS = S1.size();
120   size_t RS = S2.size();
121   // Shorter strings always compare less than longer strings.
122   if (LS != RS)
123     return LS < RS;
124 
125   // If either string contains non ascii characters, memcmp them.
126   if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
127     return memcmp(S1.data(), S2.data(), LS) < 0;
128 
129   // Both strings are ascii, perform a case-insenstive comparison.
130   return S1.compare_lower(S2.data()) < 0;
131 }
132 
133 void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
134   std::array<std::vector<std::pair<StringRef, PSHashRecord>>, IPHR_HASH + 1>
135       TmpBuckets;
136   uint32_t SymOffset = RecordZeroOffset;
137   for (const CVSymbol &Sym : Records) {
138     PSHashRecord HR;
139     // Add one when writing symbol offsets to disk. See GSI1::fixSymRecs.
140     HR.Off = SymOffset + 1;
141     HR.CRef = 1; // Always use a refcount of 1.
142 
143     // Hash the name to figure out which bucket this goes into.
144     StringRef Name = getSymbolName(Sym);
145     size_t BucketIdx = hashStringV1(Name) % IPHR_HASH;
146     TmpBuckets[BucketIdx].push_back(std::make_pair(Name, HR));
147     SymOffset += Sym.length();
148   }
149 
150   // Compute the three tables: the hash records in bucket and chain order, the
151   // bucket presence bitmap, and the bucket chain start offsets.
152   HashRecords.reserve(Records.size());
153   for (ulittle32_t &Word : HashBitmap)
154     Word = 0;
155   for (size_t BucketIdx = 0; BucketIdx < IPHR_HASH + 1; ++BucketIdx) {
156     auto &Bucket = TmpBuckets[BucketIdx];
157     if (Bucket.empty())
158       continue;
159     HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32);
160 
161     // Calculate what the offset of the first hash record in the chain would
162     // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
163     // each record would be 12 bytes. See HROffsetCalc in gsi.h.
164     const int SizeOfHROffsetCalc = 12;
165     ulittle32_t ChainStartOff =
166         ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
167     HashBuckets.push_back(ChainStartOff);
168 
169     // Sort each bucket by memcmp of the symbol's name.  It's important that
170     // we use the same sorting algorithm as is used by the reference
171     // implementation to ensure that the search for a record within a bucket
172     // can properly early-out when it detects the record won't be found.  The
173     // algorithm used here corredsponds to the function
174     // caseInsensitiveComparePchPchCchCch in the reference implementation.
175     llvm::sort(Bucket, [](const std::pair<StringRef, PSHashRecord> &Left,
176                           const std::pair<StringRef, PSHashRecord> &Right) {
177       return gsiRecordLess(Left.first, Right.first);
178     });
179 
180     for (const auto &Entry : Bucket)
181       HashRecords.push_back(Entry.second);
182   }
183 }
184 
185 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
186     : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
187       GSH(std::make_unique<GSIHashStreamBuilder>()) {}
188 
189 GSIStreamBuilder::~GSIStreamBuilder() {}
190 
191 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
192   uint32_t Size = 0;
193   Size += sizeof(PublicsStreamHeader);
194   Size += PSH->calculateSerializedLength();
195   Size += PSH->Records.size() * sizeof(uint32_t); // AddrMap
196   // FIXME: Add thunk map and section offsets for incremental linking.
197 
198   return Size;
199 }
200 
201 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
202   return GSH->calculateSerializedLength();
203 }
204 
205 Error GSIStreamBuilder::finalizeMsfLayout() {
206   // First we write public symbol records, then we write global symbol records.
207   uint32_t PSHZero = 0;
208   uint32_t GSHZero = PSH->calculateRecordByteSize();
209 
210   PSH->finalizeBuckets(PSHZero);
211   GSH->finalizeBuckets(GSHZero);
212 
213   Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
214   if (!Idx)
215     return Idx.takeError();
216   GSH->StreamIndex = *Idx;
217   Idx = Msf.addStream(calculatePublicsHashStreamSize());
218   if (!Idx)
219     return Idx.takeError();
220   PSH->StreamIndex = *Idx;
221 
222   uint32_t RecordBytes =
223       GSH->calculateRecordByteSize() + PSH->calculateRecordByteSize();
224 
225   Idx = Msf.addStream(RecordBytes);
226   if (!Idx)
227     return Idx.takeError();
228   RecordStreamIdx = *Idx;
229   return Error::success();
230 }
231 
232 static bool comparePubSymByAddrAndName(
233     const std::pair<const CVSymbol *, const PublicSym32 *> &LS,
234     const std::pair<const CVSymbol *, const PublicSym32 *> &RS) {
235   if (LS.second->Segment != RS.second->Segment)
236     return LS.second->Segment < RS.second->Segment;
237   if (LS.second->Offset != RS.second->Offset)
238     return LS.second->Offset < RS.second->Offset;
239 
240   return LS.second->Name < RS.second->Name;
241 }
242 
243 /// Compute the address map. The address map is an array of symbol offsets
244 /// sorted so that it can be binary searched by address.
245 static std::vector<ulittle32_t> computeAddrMap(ArrayRef<CVSymbol> Records) {
246   // Make a vector of pointers to the symbols so we can sort it by address.
247   // Also gather the symbol offsets while we're at it.
248 
249   std::vector<PublicSym32> DeserializedPublics;
250   std::vector<std::pair<const CVSymbol *, const PublicSym32 *>> PublicsByAddr;
251   std::vector<uint32_t> SymOffsets;
252   DeserializedPublics.reserve(Records.size());
253   PublicsByAddr.reserve(Records.size());
254   SymOffsets.reserve(Records.size());
255 
256   uint32_t SymOffset = 0;
257   for (const CVSymbol &Sym : Records) {
258     assert(Sym.kind() == SymbolKind::S_PUB32);
259     DeserializedPublics.push_back(
260         cantFail(SymbolDeserializer::deserializeAs<PublicSym32>(Sym)));
261     PublicsByAddr.emplace_back(&Sym, &DeserializedPublics.back());
262     SymOffsets.push_back(SymOffset);
263     SymOffset += Sym.length();
264   }
265   llvm::stable_sort(PublicsByAddr, comparePubSymByAddrAndName);
266 
267   // Fill in the symbol offsets in the appropriate order.
268   std::vector<ulittle32_t> AddrMap;
269   AddrMap.reserve(Records.size());
270   for (auto &Sym : PublicsByAddr) {
271     ptrdiff_t Idx = std::distance(Records.data(), Sym.first);
272     assert(Idx >= 0 && size_t(Idx) < Records.size());
273     AddrMap.push_back(ulittle32_t(SymOffsets[Idx]));
274   }
275   return AddrMap;
276 }
277 
278 uint32_t GSIStreamBuilder::getPublicsStreamIndex() const {
279   return PSH->StreamIndex;
280 }
281 
282 uint32_t GSIStreamBuilder::getGlobalsStreamIndex() const {
283   return GSH->StreamIndex;
284 }
285 
286 void GSIStreamBuilder::addPublicSymbol(const PublicSym32 &Pub) {
287   PSH->addSymbol(Pub, Msf);
288 }
289 
290 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
291   GSH->addSymbol(Sym, Msf);
292 }
293 
294 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
295   GSH->addSymbol(Sym, Msf);
296 }
297 
298 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
299   GSH->addSymbol(Sym, Msf);
300 }
301 
302 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) {
303   GSH->addSymbol(Sym);
304 }
305 
306 static Error writeRecords(BinaryStreamWriter &Writer,
307                           ArrayRef<CVSymbol> Records) {
308   BinaryItemStream<CVSymbol> ItemStream(support::endianness::little);
309   ItemStream.setItems(Records);
310   BinaryStreamRef RecordsRef(ItemStream);
311   return Writer.writeStreamRef(RecordsRef);
312 }
313 
314 Error GSIStreamBuilder::commitSymbolRecordStream(
315     WritableBinaryStreamRef Stream) {
316   BinaryStreamWriter Writer(Stream);
317 
318   // Write public symbol records first, followed by global symbol records.  This
319   // must match the order that we assume in finalizeMsfLayout when computing
320   // PSHZero and GSHZero.
321   if (auto EC = writeRecords(Writer, PSH->Records))
322     return EC;
323   if (auto EC = writeRecords(Writer, GSH->Records))
324     return EC;
325 
326   return Error::success();
327 }
328 
329 Error GSIStreamBuilder::commitPublicsHashStream(
330     WritableBinaryStreamRef Stream) {
331   BinaryStreamWriter Writer(Stream);
332   PublicsStreamHeader Header;
333 
334   // FIXME: Fill these in. They are for incremental linking.
335   Header.SymHash = PSH->calculateSerializedLength();
336   Header.AddrMap = PSH->Records.size() * 4;
337   Header.NumThunks = 0;
338   Header.SizeOfThunk = 0;
339   Header.ISectThunkTable = 0;
340   memset(Header.Padding, 0, sizeof(Header.Padding));
341   Header.OffThunkTable = 0;
342   Header.NumSections = 0;
343   if (auto EC = Writer.writeObject(Header))
344     return EC;
345 
346   if (auto EC = PSH->commit(Writer))
347     return EC;
348 
349   std::vector<ulittle32_t> AddrMap = computeAddrMap(PSH->Records);
350   if (auto EC = Writer.writeArray(makeArrayRef(AddrMap)))
351     return EC;
352 
353   return Error::success();
354 }
355 
356 Error GSIStreamBuilder::commitGlobalsHashStream(
357     WritableBinaryStreamRef Stream) {
358   BinaryStreamWriter Writer(Stream);
359   return GSH->commit(Writer);
360 }
361 
362 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
363                                WritableBinaryStreamRef Buffer) {
364   auto GS = WritableMappedBlockStream::createIndexedStream(
365       Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
366   auto PS = WritableMappedBlockStream::createIndexedStream(
367       Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
368   auto PRS = WritableMappedBlockStream::createIndexedStream(
369       Layout, Buffer, getRecordStreamIdx(), Msf.getAllocator());
370 
371   if (auto EC = commitSymbolRecordStream(*PRS))
372     return EC;
373   if (auto EC = commitGlobalsHashStream(*GS))
374     return EC;
375   if (auto EC = commitPublicsHashStream(*PS))
376     return EC;
377   return Error::success();
378 }
379