1 //===- GsymCreator.cpp ----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //===----------------------------------------------------------------------===//
7 
8 #include "llvm/DebugInfo/GSYM/GsymCreator.h"
9 #include "llvm/DebugInfo/GSYM/FileWriter.h"
10 #include "llvm/DebugInfo/GSYM/Header.h"
11 #include "llvm/DebugInfo/GSYM/LineTable.h"
12 #include "llvm/MC/StringTableBuilder.h"
13 #include "llvm/Support/raw_ostream.h"
14 
15 #include <algorithm>
16 #include <cassert>
17 #include <functional>
18 #include <vector>
19 
20 using namespace llvm;
21 using namespace gsym;
22 
23 GsymCreator::GsymCreator(bool Quiet)
24     : StrTab(StringTableBuilder::ELF), Quiet(Quiet) {
25   insertFile(StringRef());
26 }
27 
28 uint32_t GsymCreator::insertFile(StringRef Path, llvm::sys::path::Style Style) {
29   llvm::StringRef directory = llvm::sys::path::parent_path(Path, Style);
30   llvm::StringRef filename = llvm::sys::path::filename(Path, Style);
31   // We must insert the strings first, then call the FileEntry constructor.
32   // If we inline the insertString() function call into the constructor, the
33   // call order is undefined due to parameter lists not having any ordering
34   // requirements.
35   const uint32_t Dir = insertString(directory);
36   const uint32_t Base = insertString(filename);
37   return insertFileEntry(FileEntry(Dir, Base));
38 }
39 
40 uint32_t GsymCreator::insertFileEntry(FileEntry FE) {
41   std::lock_guard<std::mutex> Guard(Mutex);
42   const auto NextIndex = Files.size();
43   // Find FE in hash map and insert if not present.
44   auto R = FileEntryToIndex.insert(std::make_pair(FE, NextIndex));
45   if (R.second)
46     Files.emplace_back(FE);
47   return R.first->second;
48 }
49 
50 uint32_t GsymCreator::copyFile(const GsymCreator &SrcGC, uint32_t FileIdx) {
51   // File index zero is reserved for a FileEntry with no directory and no
52   // filename. Any other file and we need to copy the strings for the directory
53   // and filename.
54   if (FileIdx == 0)
55     return 0;
56   const FileEntry SrcFE = SrcGC.Files[FileIdx];
57   // Copy the strings for the file and then add the newly converted file entry.
58   uint32_t Dir = StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Dir)->second);
59   uint32_t Base = StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Base)->second);
60   FileEntry DstFE(Dir, Base);
61   return insertFileEntry(DstFE);
62 }
63 
64 llvm::Error GsymCreator::save(StringRef Path, llvm::endianness ByteOrder,
65                               std::optional<uint64_t> SegmentSize) const {
66   if (SegmentSize)
67     return saveSegments(Path, ByteOrder, *SegmentSize);
68   std::error_code EC;
69   raw_fd_ostream OutStrm(Path, EC);
70   if (EC)
71     return llvm::errorCodeToError(EC);
72   FileWriter O(OutStrm, ByteOrder);
73   return encode(O);
74 }
75 
76 llvm::Error GsymCreator::encode(FileWriter &O) const {
77   std::lock_guard<std::mutex> Guard(Mutex);
78   if (Funcs.empty())
79     return createStringError(std::errc::invalid_argument,
80                              "no functions to encode");
81   if (!Finalized)
82     return createStringError(std::errc::invalid_argument,
83                              "GsymCreator wasn't finalized prior to encoding");
84 
85   if (Funcs.size() > UINT32_MAX)
86     return createStringError(std::errc::invalid_argument,
87                              "too many FunctionInfos");
88 
89   std::optional<uint64_t> BaseAddress = getBaseAddress();
90   // Base address should be valid if we have any functions.
91   if (!BaseAddress)
92     return createStringError(std::errc::invalid_argument,
93                              "invalid base address");
94   Header Hdr;
95   Hdr.Magic = GSYM_MAGIC;
96   Hdr.Version = GSYM_VERSION;
97   Hdr.AddrOffSize = getAddressOffsetSize();
98   Hdr.UUIDSize = static_cast<uint8_t>(UUID.size());
99   Hdr.BaseAddress = *BaseAddress;
100   Hdr.NumAddresses = static_cast<uint32_t>(Funcs.size());
101   Hdr.StrtabOffset = 0; // We will fix this up later.
102   Hdr.StrtabSize = 0;   // We will fix this up later.
103   memset(Hdr.UUID, 0, sizeof(Hdr.UUID));
104   if (UUID.size() > sizeof(Hdr.UUID))
105     return createStringError(std::errc::invalid_argument,
106                              "invalid UUID size %u", (uint32_t)UUID.size());
107   // Copy the UUID value if we have one.
108   if (UUID.size() > 0)
109     memcpy(Hdr.UUID, UUID.data(), UUID.size());
110   // Write out the header.
111   llvm::Error Err = Hdr.encode(O);
112   if (Err)
113     return Err;
114 
115   const uint64_t MaxAddressOffset = getMaxAddressOffset();
116   // Write out the address offsets.
117   O.alignTo(Hdr.AddrOffSize);
118   for (const auto &FuncInfo : Funcs) {
119     uint64_t AddrOffset = FuncInfo.startAddress() - Hdr.BaseAddress;
120     // Make sure we calculated the address offsets byte size correctly by
121     // verifying the current address offset is within ranges. We have seen bugs
122     // introduced when the code changes that can cause problems here so it is
123     // good to catch this during testing.
124     assert(AddrOffset <= MaxAddressOffset);
125     (void)MaxAddressOffset;
126     switch (Hdr.AddrOffSize) {
127     case 1:
128       O.writeU8(static_cast<uint8_t>(AddrOffset));
129       break;
130     case 2:
131       O.writeU16(static_cast<uint16_t>(AddrOffset));
132       break;
133     case 4:
134       O.writeU32(static_cast<uint32_t>(AddrOffset));
135       break;
136     case 8:
137       O.writeU64(AddrOffset);
138       break;
139     }
140   }
141 
142   // Write out all zeros for the AddrInfoOffsets.
143   O.alignTo(4);
144   const off_t AddrInfoOffsetsOffset = O.tell();
145   for (size_t i = 0, n = Funcs.size(); i < n; ++i)
146     O.writeU32(0);
147 
148   // Write out the file table
149   O.alignTo(4);
150   assert(!Files.empty());
151   assert(Files[0].Dir == 0);
152   assert(Files[0].Base == 0);
153   size_t NumFiles = Files.size();
154   if (NumFiles > UINT32_MAX)
155     return createStringError(std::errc::invalid_argument, "too many files");
156   O.writeU32(static_cast<uint32_t>(NumFiles));
157   for (auto File : Files) {
158     O.writeU32(File.Dir);
159     O.writeU32(File.Base);
160   }
161 
162   // Write out the string table.
163   const off_t StrtabOffset = O.tell();
164   StrTab.write(O.get_stream());
165   const off_t StrtabSize = O.tell() - StrtabOffset;
166   std::vector<uint32_t> AddrInfoOffsets;
167 
168   // Write out the address infos for each function info.
169   for (const auto &FuncInfo : Funcs) {
170     if (Expected<uint64_t> OffsetOrErr = FuncInfo.encode(O))
171       AddrInfoOffsets.push_back(OffsetOrErr.get());
172     else
173       return OffsetOrErr.takeError();
174   }
175   // Fixup the string table offset and size in the header
176   O.fixup32((uint32_t)StrtabOffset, offsetof(Header, StrtabOffset));
177   O.fixup32((uint32_t)StrtabSize, offsetof(Header, StrtabSize));
178 
179   // Fixup all address info offsets
180   uint64_t Offset = 0;
181   for (auto AddrInfoOffset : AddrInfoOffsets) {
182     O.fixup32(AddrInfoOffset, AddrInfoOffsetsOffset + Offset);
183     Offset += 4;
184   }
185   return ErrorSuccess();
186 }
187 
188 llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) {
189   std::lock_guard<std::mutex> Guard(Mutex);
190   if (Finalized)
191     return createStringError(std::errc::invalid_argument, "already finalized");
192   Finalized = true;
193 
194   // Don't let the string table indexes change by finalizing in order.
195   StrTab.finalizeInOrder();
196 
197   // Remove duplicates function infos that have both entries from debug info
198   // (DWARF or Breakpad) and entries from the SymbolTable.
199   //
200   // Also handle overlapping function. Usually there shouldn't be any, but they
201   // can and do happen in some rare cases.
202   //
203   // (a)          (b)         (c)
204   //     ^  ^       ^            ^
205   //     |X |Y      |X ^         |X
206   //     |  |       |  |Y        |  ^
207   //     |  |       |  v         v  |Y
208   //     v  v       v               v
209   //
210   // In (a) and (b), Y is ignored and X will be reported for the full range.
211   // In (c), both functions will be included in the result and lookups for an
212   // address in the intersection will return Y because of binary search.
213   //
214   // Note that in case of (b), we cannot include Y in the result because then
215   // we wouldn't find any function for range (end of Y, end of X)
216   // with binary search
217 
218   const auto NumBefore = Funcs.size();
219   // Only sort and unique if this isn't a segment. If this is a segment we
220   // already finalized the main GsymCreator with all of the function infos
221   // and then the already sorted and uniqued function infos were added to this
222   // object.
223   if (!IsSegment) {
224     if (NumBefore > 1) {
225       // Sort function infos so we can emit sorted functions.
226       llvm::sort(Funcs);
227       std::vector<FunctionInfo> FinalizedFuncs;
228       FinalizedFuncs.reserve(Funcs.size());
229       FinalizedFuncs.emplace_back(std::move(Funcs.front()));
230       for (size_t Idx=1; Idx < NumBefore; ++Idx) {
231         FunctionInfo &Prev = FinalizedFuncs.back();
232         FunctionInfo &Curr = Funcs[Idx];
233         // Empty ranges won't intersect, but we still need to
234         // catch the case where we have multiple symbols at the
235         // same address and coalesce them.
236         const bool ranges_equal = Prev.Range == Curr.Range;
237         if (ranges_equal || Prev.Range.intersects(Curr.Range)) {
238           // Overlapping ranges or empty identical ranges.
239           if (ranges_equal) {
240             // Same address range. Check if one is from debug
241             // info and the other is from a symbol table. If
242             // so, then keep the one with debug info. Our
243             // sorting guarantees that entries with matching
244             // address ranges that have debug info are last in
245             // the sort.
246             if (!(Prev == Curr)) {
247               if (Prev.hasRichInfo() && Curr.hasRichInfo()) {
248                 if (!Quiet) {
249                   OS << "warning: same address range contains "
250                         "different debug "
251                     << "info. Removing:\n"
252                     << Prev << "\nIn favor of this one:\n"
253                     << Curr << "\n";
254                 }
255               }
256               // We want to swap the current entry with the previous since
257               // later entries with the same range always have more debug info
258               // or different debug info.
259               std::swap(Prev, Curr);
260             }
261           } else {
262             if (!Quiet) { // print warnings about overlaps
263               OS << "warning: function ranges overlap:\n"
264                 << Prev << "\n"
265                 << Curr << "\n";
266             }
267             FinalizedFuncs.emplace_back(std::move(Curr));
268           }
269         } else {
270           if (Prev.Range.size() == 0 && Curr.Range.contains(Prev.Range.start())) {
271             // Symbols on macOS don't have address ranges, so if the range
272             // doesn't match and the size is zero, then we replace the empty
273             // symbol function info with the current one.
274             std::swap(Prev, Curr);
275           } else {
276             FinalizedFuncs.emplace_back(std::move(Curr));
277           }
278         }
279       }
280       std::swap(Funcs, FinalizedFuncs);
281     }
282     // If our last function info entry doesn't have a size and if we have valid
283     // text ranges, we should set the size of the last entry since any search for
284     // a high address might match our last entry. By fixing up this size, we can
285     // help ensure we don't cause lookups to always return the last symbol that
286     // has no size when doing lookups.
287     if (!Funcs.empty() && Funcs.back().Range.size() == 0 && ValidTextRanges) {
288       if (auto Range =
289               ValidTextRanges->getRangeThatContains(Funcs.back().Range.start())) {
290         Funcs.back().Range = {Funcs.back().Range.start(), Range->end()};
291       }
292     }
293     OS << "Pruned " << NumBefore - Funcs.size() << " functions, ended with "
294       << Funcs.size() << " total\n";
295   }
296   return Error::success();
297 }
298 
299 uint32_t GsymCreator::copyString(const GsymCreator &SrcGC, uint32_t StrOff) {
300   // String offset at zero is always the empty string, no copying needed.
301   if (StrOff == 0)
302     return 0;
303   return StrTab.add(SrcGC.StringOffsetMap.find(StrOff)->second);
304 }
305 
306 uint32_t GsymCreator::insertString(StringRef S, bool Copy) {
307   if (S.empty())
308     return 0;
309 
310   // The hash can be calculated outside the lock.
311   CachedHashStringRef CHStr(S);
312   std::lock_guard<std::mutex> Guard(Mutex);
313   if (Copy) {
314     // We need to provide backing storage for the string if requested
315     // since StringTableBuilder stores references to strings. Any string
316     // that comes from a section in an object file doesn't need to be
317     // copied, but any string created by code will need to be copied.
318     // This allows GsymCreator to be really fast when parsing DWARF and
319     // other object files as most strings don't need to be copied.
320     if (!StrTab.contains(CHStr))
321       CHStr = CachedHashStringRef{StringStorage.insert(S).first->getKey(),
322                                   CHStr.hash()};
323   }
324   const uint32_t StrOff = StrTab.add(CHStr);
325   // Save a mapping of string offsets to the cached string reference in case
326   // we need to segment the GSYM file and copy string from one string table to
327   // another.
328   if (StringOffsetMap.count(StrOff) == 0)
329     StringOffsetMap.insert(std::make_pair(StrOff, CHStr));
330   return StrOff;
331 }
332 
333 void GsymCreator::addFunctionInfo(FunctionInfo &&FI) {
334   std::lock_guard<std::mutex> Guard(Mutex);
335   Funcs.emplace_back(std::move(FI));
336 }
337 
338 void GsymCreator::forEachFunctionInfo(
339     std::function<bool(FunctionInfo &)> const &Callback) {
340   std::lock_guard<std::mutex> Guard(Mutex);
341   for (auto &FI : Funcs) {
342     if (!Callback(FI))
343       break;
344   }
345 }
346 
347 void GsymCreator::forEachFunctionInfo(
348     std::function<bool(const FunctionInfo &)> const &Callback) const {
349   std::lock_guard<std::mutex> Guard(Mutex);
350   for (const auto &FI : Funcs) {
351     if (!Callback(FI))
352       break;
353   }
354 }
355 
356 size_t GsymCreator::getNumFunctionInfos() const {
357   std::lock_guard<std::mutex> Guard(Mutex);
358   return Funcs.size();
359 }
360 
361 bool GsymCreator::IsValidTextAddress(uint64_t Addr) const {
362   if (ValidTextRanges)
363     return ValidTextRanges->contains(Addr);
364   return true; // No valid text ranges has been set, so accept all ranges.
365 }
366 
367 std::optional<uint64_t> GsymCreator::getFirstFunctionAddress() const {
368   // If we have finalized then Funcs are sorted. If we are a segment then
369   // Funcs will be sorted as well since function infos get added from an
370   // already finalized GsymCreator object where its functions were sorted and
371   // uniqued.
372   if ((Finalized || IsSegment) && !Funcs.empty())
373     return std::optional<uint64_t>(Funcs.front().startAddress());
374   return std::nullopt;
375 }
376 
377 std::optional<uint64_t> GsymCreator::getLastFunctionAddress() const {
378   // If we have finalized then Funcs are sorted. If we are a segment then
379   // Funcs will be sorted as well since function infos get added from an
380   // already finalized GsymCreator object where its functions were sorted and
381   // uniqued.
382   if ((Finalized || IsSegment) && !Funcs.empty())
383     return std::optional<uint64_t>(Funcs.back().startAddress());
384   return std::nullopt;
385 }
386 
387 std::optional<uint64_t> GsymCreator::getBaseAddress() const {
388   if (BaseAddress)
389     return BaseAddress;
390   return getFirstFunctionAddress();
391 }
392 
393 uint64_t GsymCreator::getMaxAddressOffset() const {
394   switch (getAddressOffsetSize()) {
395     case 1: return UINT8_MAX;
396     case 2: return UINT16_MAX;
397     case 4: return UINT32_MAX;
398     case 8: return UINT64_MAX;
399   }
400   llvm_unreachable("invalid address offset");
401 }
402 
403 uint8_t GsymCreator::getAddressOffsetSize() const {
404   const std::optional<uint64_t> BaseAddress = getBaseAddress();
405   const std::optional<uint64_t> LastFuncAddr = getLastFunctionAddress();
406   if (BaseAddress && LastFuncAddr) {
407     const uint64_t AddrDelta = *LastFuncAddr - *BaseAddress;
408     if (AddrDelta <= UINT8_MAX)
409       return 1;
410     else if (AddrDelta <= UINT16_MAX)
411       return 2;
412     else if (AddrDelta <= UINT32_MAX)
413       return 4;
414     return 8;
415   }
416   return 1;
417 }
418 
419 uint64_t GsymCreator::calculateHeaderAndTableSize() const {
420   uint64_t Size = sizeof(Header);
421   const size_t NumFuncs = Funcs.size();
422   // Add size of address offset table
423   Size += NumFuncs * getAddressOffsetSize();
424   // Add size of address info offsets which are 32 bit integers in version 1.
425   Size += NumFuncs * sizeof(uint32_t);
426   // Add file table size
427   Size += Files.size() * sizeof(FileEntry);
428   // Add string table size
429   Size += StrTab.getSize();
430 
431   return Size;
432 }
433 
434 // This function takes a InlineInfo class that was copy constructed from an
435 // InlineInfo from the \a SrcGC and updates all members that point to strings
436 // and files to point to strings and files from this GsymCreator.
437 void GsymCreator::fixupInlineInfo(const GsymCreator &SrcGC, InlineInfo &II) {
438   II.Name = copyString(SrcGC, II.Name);
439   II.CallFile = copyFile(SrcGC, II.CallFile);
440   for (auto &ChildII: II.Children)
441     fixupInlineInfo(SrcGC, ChildII);
442 }
443 
444 uint64_t GsymCreator::copyFunctionInfo(const GsymCreator &SrcGC, size_t FuncIdx) {
445   // To copy a function info we need to copy any files and strings over into
446   // this GsymCreator and then copy the function info and update the string
447   // table offsets to match the new offsets.
448   const FunctionInfo &SrcFI = SrcGC.Funcs[FuncIdx];
449 
450   FunctionInfo DstFI;
451   DstFI.Range = SrcFI.Range;
452   DstFI.Name = copyString(SrcGC, SrcFI.Name);
453   // Copy the line table if there is one.
454   if (SrcFI.OptLineTable) {
455     // Copy the entire line table.
456     DstFI.OptLineTable = LineTable(SrcFI.OptLineTable.value());
457     // Fixup all LineEntry::File entries which are indexes in the the file table
458     // from SrcGC and must be converted to file indexes from this GsymCreator.
459     LineTable &DstLT = DstFI.OptLineTable.value();
460     const size_t NumLines = DstLT.size();
461     for (size_t I=0; I<NumLines; ++I) {
462       LineEntry &LE = DstLT.get(I);
463       LE.File = copyFile(SrcGC, LE.File);
464     }
465   }
466   // Copy the inline information if needed.
467   if (SrcFI.Inline) {
468     // Make a copy of the source inline information.
469     DstFI.Inline = SrcFI.Inline.value();
470     // Fixup all strings and files in the copied inline information.
471     fixupInlineInfo(SrcGC, *DstFI.Inline);
472   }
473   std::lock_guard<std::mutex> Guard(Mutex);
474   Funcs.emplace_back(DstFI);
475   return Funcs.back().cacheEncoding();
476 }
477 
478 llvm::Error GsymCreator::saveSegments(StringRef Path,
479                                       llvm::endianness ByteOrder,
480                                       uint64_t SegmentSize) const {
481   if (SegmentSize == 0)
482     return createStringError(std::errc::invalid_argument,
483                              "invalid segment size zero");
484 
485   size_t FuncIdx = 0;
486   const size_t NumFuncs = Funcs.size();
487   while (FuncIdx < NumFuncs) {
488     llvm::Expected<std::unique_ptr<GsymCreator>> ExpectedGC =
489         createSegment(SegmentSize, FuncIdx);
490     if (ExpectedGC) {
491       GsymCreator *GC = ExpectedGC->get();
492       if (GC == NULL)
493         break; // We had not more functions to encode.
494       raw_null_ostream ErrorStrm;
495       llvm::Error Err = GC->finalize(ErrorStrm);
496       if (Err)
497         return Err;
498       std::string SegmentedGsymPath;
499       raw_string_ostream SGP(SegmentedGsymPath);
500       std::optional<uint64_t> FirstFuncAddr = GC->getFirstFunctionAddress();
501       if (FirstFuncAddr) {
502         SGP << Path << "-" << llvm::format_hex(*FirstFuncAddr, 1);
503         SGP.flush();
504         Err = GC->save(SegmentedGsymPath, ByteOrder, std::nullopt);
505         if (Err)
506           return Err;
507       }
508     } else {
509       return ExpectedGC.takeError();
510     }
511   }
512   return Error::success();
513 }
514 
515 llvm::Expected<std::unique_ptr<GsymCreator>>
516 GsymCreator::createSegment(uint64_t SegmentSize, size_t &FuncIdx) const {
517   // No function entries, return empty unique pointer
518   if (FuncIdx >= Funcs.size())
519     return std::unique_ptr<GsymCreator>();
520 
521   std::unique_ptr<GsymCreator> GC(new GsymCreator(/*Quiet=*/true));
522 
523   // Tell the creator that this is a segment.
524   GC->setIsSegment();
525 
526   // Set the base address if there is one.
527   if (BaseAddress)
528     GC->setBaseAddress(*BaseAddress);
529   // Copy the UUID value from this object into the new creator.
530   GC->setUUID(UUID);
531   const size_t NumFuncs = Funcs.size();
532   // Track how big the function infos are for the current segment so we can
533   // emit segments that are close to the requested size. It is quick math to
534   // determine the current header and tables sizes, so we can do that each loop.
535   uint64_t SegmentFuncInfosSize = 0;
536   for (; FuncIdx < NumFuncs; ++FuncIdx) {
537     const uint64_t HeaderAndTableSize = GC->calculateHeaderAndTableSize();
538     if (HeaderAndTableSize + SegmentFuncInfosSize >= SegmentSize) {
539       if (SegmentFuncInfosSize == 0)
540         return createStringError(std::errc::invalid_argument,
541                                  "a segment size of %" PRIu64 " is to small to "
542                                  "fit any function infos, specify a larger value",
543                                  SegmentSize);
544 
545       break;
546     }
547     SegmentFuncInfosSize += alignTo(GC->copyFunctionInfo(*this, FuncIdx), 4);
548   }
549   return std::move(GC);
550 }
551