1 //===- GsymCreator.cpp ----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //===----------------------------------------------------------------------===//
7 
8 #include "llvm/DebugInfo/GSYM/GsymCreator.h"
9 #include "llvm/DebugInfo/GSYM/FileWriter.h"
10 #include "llvm/DebugInfo/GSYM/Header.h"
11 #include "llvm/DebugInfo/GSYM/LineTable.h"
12 #include "llvm/MC/StringTableBuilder.h"
13 #include "llvm/Support/raw_ostream.h"
14 
15 #include <algorithm>
16 #include <cassert>
17 #include <functional>
18 #include <vector>
19 
20 using namespace llvm;
21 using namespace gsym;
22 
23 GsymCreator::GsymCreator(bool Quiet)
24     : StrTab(StringTableBuilder::ELF), Quiet(Quiet) {
25   insertFile(StringRef());
26 }
27 
28 uint32_t GsymCreator::insertFile(StringRef Path, llvm::sys::path::Style Style) {
29   llvm::StringRef directory = llvm::sys::path::parent_path(Path, Style);
30   llvm::StringRef filename = llvm::sys::path::filename(Path, Style);
31   // We must insert the strings first, then call the FileEntry constructor.
32   // If we inline the insertString() function call into the constructor, the
33   // call order is undefined due to parameter lists not having any ordering
34   // requirements.
35   const uint32_t Dir = insertString(directory);
36   const uint32_t Base = insertString(filename);
37   return insertFileEntry(FileEntry(Dir, Base));
38 }
39 
40 uint32_t GsymCreator::insertFileEntry(FileEntry FE) {
41   std::lock_guard<std::mutex> Guard(Mutex);
42   const auto NextIndex = Files.size();
43   // Find FE in hash map and insert if not present.
44   auto R = FileEntryToIndex.insert(std::make_pair(FE, NextIndex));
45   if (R.second)
46     Files.emplace_back(FE);
47   return R.first->second;
48 }
49 
50 uint32_t GsymCreator::copyFile(const GsymCreator &SrcGC, uint32_t FileIdx) {
51   // File index zero is reserved for a FileEntry with no directory and no
52   // filename. Any other file and we need to copy the strings for the directory
53   // and filename.
54   if (FileIdx == 0)
55     return 0;
56   const FileEntry SrcFE = SrcGC.Files[FileIdx];
57   // Copy the strings for the file and then add the newly converted file entry.
58   uint32_t Dir = StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Dir)->second);
59   uint32_t Base = StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Base)->second);
60   FileEntry DstFE(Dir, Base);
61   return insertFileEntry(DstFE);
62 }
63 
64 
65 llvm::Error GsymCreator::save(StringRef Path,
66                               llvm::support::endianness ByteOrder,
67                               std::optional<uint64_t> SegmentSize) const {
68   if (SegmentSize)
69     return saveSegments(Path, ByteOrder, *SegmentSize);
70   std::error_code EC;
71   raw_fd_ostream OutStrm(Path, EC);
72   if (EC)
73     return llvm::errorCodeToError(EC);
74   FileWriter O(OutStrm, ByteOrder);
75   return encode(O);
76 }
77 
78 llvm::Error GsymCreator::encode(FileWriter &O) const {
79   std::lock_guard<std::mutex> Guard(Mutex);
80   if (Funcs.empty())
81     return createStringError(std::errc::invalid_argument,
82                              "no functions to encode");
83   if (!Finalized)
84     return createStringError(std::errc::invalid_argument,
85                              "GsymCreator wasn't finalized prior to encoding");
86 
87   if (Funcs.size() > UINT32_MAX)
88     return createStringError(std::errc::invalid_argument,
89                              "too many FunctionInfos");
90 
91   std::optional<uint64_t> BaseAddress = getBaseAddress();
92   // Base address should be valid if we have any functions.
93   if (!BaseAddress)
94     return createStringError(std::errc::invalid_argument,
95                              "invalid base address");
96   Header Hdr;
97   Hdr.Magic = GSYM_MAGIC;
98   Hdr.Version = GSYM_VERSION;
99   Hdr.AddrOffSize = getAddressOffsetSize();
100   Hdr.UUIDSize = static_cast<uint8_t>(UUID.size());
101   Hdr.BaseAddress = *BaseAddress;
102   Hdr.NumAddresses = static_cast<uint32_t>(Funcs.size());
103   Hdr.StrtabOffset = 0; // We will fix this up later.
104   Hdr.StrtabSize = 0;   // We will fix this up later.
105   memset(Hdr.UUID, 0, sizeof(Hdr.UUID));
106   if (UUID.size() > sizeof(Hdr.UUID))
107     return createStringError(std::errc::invalid_argument,
108                              "invalid UUID size %u", (uint32_t)UUID.size());
109   // Copy the UUID value if we have one.
110   if (UUID.size() > 0)
111     memcpy(Hdr.UUID, UUID.data(), UUID.size());
112   // Write out the header.
113   llvm::Error Err = Hdr.encode(O);
114   if (Err)
115     return Err;
116 
117   const uint64_t MaxAddressOffset = getMaxAddressOffset();
118   // Write out the address offsets.
119   O.alignTo(Hdr.AddrOffSize);
120   for (const auto &FuncInfo : Funcs) {
121     uint64_t AddrOffset = FuncInfo.startAddress() - Hdr.BaseAddress;
122     // Make sure we calculated the address offsets byte size correctly by
123     // verifying the current address offset is within ranges. We have seen bugs
124     // introduced when the code changes that can cause problems here so it is
125     // good to catch this during testing.
126     assert(AddrOffset <= MaxAddressOffset);
127     (void)MaxAddressOffset;
128     switch (Hdr.AddrOffSize) {
129     case 1:
130       O.writeU8(static_cast<uint8_t>(AddrOffset));
131       break;
132     case 2:
133       O.writeU16(static_cast<uint16_t>(AddrOffset));
134       break;
135     case 4:
136       O.writeU32(static_cast<uint32_t>(AddrOffset));
137       break;
138     case 8:
139       O.writeU64(AddrOffset);
140       break;
141     }
142   }
143 
144   // Write out all zeros for the AddrInfoOffsets.
145   O.alignTo(4);
146   const off_t AddrInfoOffsetsOffset = O.tell();
147   for (size_t i = 0, n = Funcs.size(); i < n; ++i)
148     O.writeU32(0);
149 
150   // Write out the file table
151   O.alignTo(4);
152   assert(!Files.empty());
153   assert(Files[0].Dir == 0);
154   assert(Files[0].Base == 0);
155   size_t NumFiles = Files.size();
156   if (NumFiles > UINT32_MAX)
157     return createStringError(std::errc::invalid_argument, "too many files");
158   O.writeU32(static_cast<uint32_t>(NumFiles));
159   for (auto File : Files) {
160     O.writeU32(File.Dir);
161     O.writeU32(File.Base);
162   }
163 
164   // Write out the string table.
165   const off_t StrtabOffset = O.tell();
166   StrTab.write(O.get_stream());
167   const off_t StrtabSize = O.tell() - StrtabOffset;
168   std::vector<uint32_t> AddrInfoOffsets;
169 
170   // Write out the address infos for each function info.
171   for (const auto &FuncInfo : Funcs) {
172     if (Expected<uint64_t> OffsetOrErr = FuncInfo.encode(O))
173       AddrInfoOffsets.push_back(OffsetOrErr.get());
174     else
175       return OffsetOrErr.takeError();
176   }
177   // Fixup the string table offset and size in the header
178   O.fixup32((uint32_t)StrtabOffset, offsetof(Header, StrtabOffset));
179   O.fixup32((uint32_t)StrtabSize, offsetof(Header, StrtabSize));
180 
181   // Fixup all address info offsets
182   uint64_t Offset = 0;
183   for (auto AddrInfoOffset : AddrInfoOffsets) {
184     O.fixup32(AddrInfoOffset, AddrInfoOffsetsOffset + Offset);
185     Offset += 4;
186   }
187   return ErrorSuccess();
188 }
189 
// Compacts the range [FirstIt, LastIt) using a binary predicate that is passed
// the most recently kept element (Prev) and the element being visited (Curr).
//
// When Pred(Prev, Curr) returns true, Prev is dropped and Curr takes its
// place, so Curr becomes the "previous" element for the next comparison. When
// it returns false both elements are kept. Like std::remove_if, the kept
// elements are moved to the front of the range in their original relative
// order and the returned iterator is the new logical end; the caller is
// expected to erase everything from that iterator to LastIt.
//
// The previous implementation dropped Curr instead of Prev and, after the
// first removal, kept comparing against the dropped element. That contradicts
// the only caller in this file, whose predicate returns true to mean "remove
// Prev in favor of Curr".
template <class ForwardIt, class BinaryPredicate>
static ForwardIt removeIfBinary(ForwardIt FirstIt, ForwardIt LastIt,
                                BinaryPredicate Pred) {
  if (FirstIt == LastIt)
    return LastIt;

  // KeptIt always points at the last element that has been kept so far.
  ForwardIt KeptIt = FirstIt;
  for (ForwardIt CurrIt = FirstIt; ++CurrIt != LastIt;) {
    // Prev survives: advance so Curr is stored in the next free slot.
    // Prev is dropped: stay put so Curr overwrites it.
    if (!Pred(*KeptIt, *CurrIt))
      ++KeptIt;
    // Avoid a self move when nothing has been removed yet.
    if (KeptIt != CurrIt)
      *KeptIt = std::move(*CurrIt);
  }
  return ++KeptIt;
}
209 
210 llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) {
211   std::lock_guard<std::mutex> Guard(Mutex);
212   if (Finalized)
213     return createStringError(std::errc::invalid_argument, "already finalized");
214   Finalized = true;
215 
216   // Sort function infos so we can emit sorted functions.
217   llvm::sort(Funcs);
218 
219   // Don't let the string table indexes change by finalizing in order.
220   StrTab.finalizeInOrder();
221 
222   // Remove duplicates function infos that have both entries from debug info
223   // (DWARF or Breakpad) and entries from the SymbolTable.
224   //
225   // Also handle overlapping function. Usually there shouldn't be any, but they
226   // can and do happen in some rare cases.
227   //
228   // (a)          (b)         (c)
229   //     ^  ^       ^            ^
230   //     |X |Y      |X ^         |X
231   //     |  |       |  |Y        |  ^
232   //     |  |       |  v         v  |Y
233   //     v  v       v               v
234   //
235   // In (a) and (b), Y is ignored and X will be reported for the full range.
236   // In (c), both functions will be included in the result and lookups for an
237   // address in the intersection will return Y because of binary search.
238   //
239   // Note that in case of (b), we cannot include Y in the result because then
240   // we wouldn't find any function for range (end of Y, end of X)
241   // with binary search
242   auto NumBefore = Funcs.size();
243   Funcs.erase(
244       removeIfBinary(Funcs.begin(), Funcs.end(),
245                      [&](const auto &Prev, const auto &Curr) {
246                        // Empty ranges won't intersect, but we still need to
247                        // catch the case where we have multiple symbols at the
248                        // same address and coalesce them.
249                        const bool ranges_equal = Prev.Range == Curr.Range;
250                        if (ranges_equal || Prev.Range.intersects(Curr.Range)) {
251                          // Overlapping ranges or empty identical ranges.
252                          if (ranges_equal) {
253                            // Same address range. Check if one is from debug
254                            // info and the other is from a symbol table. If
255                            // so, then keep the one with debug info. Our
256                            // sorting guarantees that entries with matching
257                            // address ranges that have debug info are last in
258                            // the sort.
259                            if (Prev == Curr) {
260                              // FunctionInfo entries match exactly (range,
261                              // lines, inlines)
262 
263                              // We used to output a warning here, but this was
264                              // so frequent on some binaries, in particular
265                              // when those were built with GCC, that it slowed
266                              // down processing extremely.
267                              return true;
268                            } else {
269                              if (!Prev.hasRichInfo() && Curr.hasRichInfo()) {
270                                // Same address range, one with no debug info
271                                // (symbol) and the next with debug info. Keep
272                                // the latter.
273                                return true;
274                              } else {
275                                if (!Quiet) {
276                                  OS << "warning: same address range contains "
277                                        "different debug "
278                                     << "info. Removing:\n"
279                                     << Prev << "\nIn favor of this one:\n"
280                                     << Curr << "\n";
281                                }
282                                return true;
283                              }
284                            }
285                          } else {
286                            if (!Quiet) { // print warnings about overlaps
287                              OS << "warning: function ranges overlap:\n"
288                                 << Prev << "\n"
289                                 << Curr << "\n";
290                            }
291                          }
292                        } else if (Prev.Range.size() == 0 &&
293                                   Curr.Range.contains(Prev.Range.start())) {
294                          if (!Quiet) {
295                            OS << "warning: removing symbol:\n"
296                               << Prev << "\nKeeping:\n"
297                               << Curr << "\n";
298                          }
299                          return true;
300                        }
301 
302                        return false;
303                      }),
304       Funcs.end());
305 
306   // If our last function info entry doesn't have a size and if we have valid
307   // text ranges, we should set the size of the last entry since any search for
308   // a high address might match our last entry. By fixing up this size, we can
309   // help ensure we don't cause lookups to always return the last symbol that
310   // has no size when doing lookups.
311   if (!Funcs.empty() && Funcs.back().Range.size() == 0 && ValidTextRanges) {
312     if (auto Range =
313             ValidTextRanges->getRangeThatContains(Funcs.back().Range.start())) {
314       Funcs.back().Range = {Funcs.back().Range.start(), Range->end()};
315     }
316   }
317   OS << "Pruned " << NumBefore - Funcs.size() << " functions, ended with "
318      << Funcs.size() << " total\n";
319   return Error::success();
320 }
321 
322 uint32_t GsymCreator::copyString(const GsymCreator &SrcGC, uint32_t StrOff) {
323   // String offset at zero is always the empty string, no copying needed.
324   if (StrOff == 0)
325     return 0;
326   return StrTab.add(SrcGC.StringOffsetMap.find(StrOff)->second);
327 }
328 
329 uint32_t GsymCreator::insertString(StringRef S, bool Copy) {
330   if (S.empty())
331     return 0;
332 
333   // The hash can be calculated outside the lock.
334   CachedHashStringRef CHStr(S);
335   std::lock_guard<std::mutex> Guard(Mutex);
336   if (Copy) {
337     // We need to provide backing storage for the string if requested
338     // since StringTableBuilder stores references to strings. Any string
339     // that comes from a section in an object file doesn't need to be
340     // copied, but any string created by code will need to be copied.
341     // This allows GsymCreator to be really fast when parsing DWARF and
342     // other object files as most strings don't need to be copied.
343     if (!StrTab.contains(CHStr))
344       CHStr = CachedHashStringRef{StringStorage.insert(S).first->getKey(),
345                                   CHStr.hash()};
346   }
347   const uint32_t StrOff = StrTab.add(CHStr);
348   // Save a mapping of string offsets to the cached string reference in case
349   // we need to segment the GSYM file and copy string from one string table to
350   // another.
351   if (StringOffsetMap.count(StrOff) == 0)
352     StringOffsetMap.insert(std::make_pair(StrOff, CHStr));
353   return StrOff;
354 }
355 
356 void GsymCreator::addFunctionInfo(FunctionInfo &&FI) {
357   std::lock_guard<std::mutex> Guard(Mutex);
358   Ranges.insert(FI.Range);
359   Funcs.emplace_back(std::move(FI));
360 }
361 
362 void GsymCreator::forEachFunctionInfo(
363     std::function<bool(FunctionInfo &)> const &Callback) {
364   std::lock_guard<std::mutex> Guard(Mutex);
365   for (auto &FI : Funcs) {
366     if (!Callback(FI))
367       break;
368   }
369 }
370 
371 void GsymCreator::forEachFunctionInfo(
372     std::function<bool(const FunctionInfo &)> const &Callback) const {
373   std::lock_guard<std::mutex> Guard(Mutex);
374   for (const auto &FI : Funcs) {
375     if (!Callback(FI))
376       break;
377   }
378 }
379 
380 size_t GsymCreator::getNumFunctionInfos() const {
381   std::lock_guard<std::mutex> Guard(Mutex);
382   return Funcs.size();
383 }
384 
385 bool GsymCreator::IsValidTextAddress(uint64_t Addr) const {
386   if (ValidTextRanges)
387     return ValidTextRanges->contains(Addr);
388   return true; // No valid text ranges has been set, so accept all ranges.
389 }
390 
391 bool GsymCreator::hasFunctionInfoForAddress(uint64_t Addr) const {
392   std::lock_guard<std::mutex> Guard(Mutex);
393   return Ranges.contains(Addr);
394 }
395 
396 std::optional<uint64_t> GsymCreator::getFirstFunctionAddress() const {
397   if (Finalized && !Funcs.empty())
398     return std::optional<uint64_t>(Funcs.front().startAddress());
399   // This code gets used by the segmentation of GSYM files to help determine the
400   // size of the GSYM header while continually adding new FunctionInfo objects
401   // to this object, so we haven't finalized this object yet.
402   if (Ranges.empty())
403     return std::nullopt;
404   return std::optional<uint64_t>(Ranges.begin()->start());
405 }
406 
407 std::optional<uint64_t> GsymCreator::getLastFunctionAddress() const {
408   if (Finalized && !Funcs.empty())
409     return std::optional<uint64_t>(Funcs.back().startAddress());
410   // This code gets used by the segmentation of GSYM files to help determine the
411   // size of the GSYM header while continually adding new FunctionInfo objects
412   // to this object, so we haven't finalized this object yet.
413   if (Ranges.empty())
414     return std::nullopt;
415   return std::optional<uint64_t>((Ranges.end() - 1)->end());
416 }
417 
418 std::optional<uint64_t> GsymCreator::getBaseAddress() const {
419   if (BaseAddress)
420     return BaseAddress;
421   return getFirstFunctionAddress();
422 }
423 
424 uint64_t GsymCreator::getMaxAddressOffset() const {
425   switch (getAddressOffsetSize()) {
426     case 1: return UINT8_MAX;
427     case 2: return UINT16_MAX;
428     case 4: return UINT32_MAX;
429     case 8: return UINT64_MAX;
430   }
431   llvm_unreachable("invalid address offset");
432 }
433 
434 uint8_t GsymCreator::getAddressOffsetSize() const {
435   const std::optional<uint64_t> BaseAddress = getBaseAddress();
436   const std::optional<uint64_t> LastFuncAddr = getLastFunctionAddress();
437   if (BaseAddress && LastFuncAddr) {
438     const uint64_t AddrDelta = *LastFuncAddr - *BaseAddress;
439     if (AddrDelta <= UINT8_MAX)
440       return 1;
441     else if (AddrDelta <= UINT16_MAX)
442       return 2;
443     else if (AddrDelta <= UINT32_MAX)
444       return 4;
445     return 8;
446   }
447   return 1;
448 }
449 
450 uint64_t GsymCreator::calculateHeaderAndTableSize() const {
451   uint64_t Size = sizeof(Header);
452   const size_t NumFuncs = Funcs.size();
453   // Add size of address offset table
454   Size += NumFuncs * getAddressOffsetSize();
455   // Add size of address info offsets which are 32 bit integers in version 1.
456   Size += NumFuncs * sizeof(uint32_t);
457   // Add file table size
458   Size += Files.size() * sizeof(FileEntry);
459   // Add string table size
460   Size += StrTab.getSize();
461 
462   return Size;
463 }
464 
465 // This function takes a InlineInfo class that was copy constructed from an
466 // InlineInfo from the \a SrcGC and updates all members that point to strings
467 // and files to point to strings and files from this GsymCreator.
468 void GsymCreator::fixupInlineInfo(const GsymCreator &SrcGC, InlineInfo &II) {
469   II.Name = copyString(SrcGC, II.Name);
470   II.CallFile = copyFile(SrcGC, II.CallFile);
471   for (auto &ChildII: II.Children)
472     fixupInlineInfo(SrcGC, ChildII);
473 }
474 
475 uint64_t GsymCreator::copyFunctionInfo(const GsymCreator &SrcGC, size_t FuncIdx) {
476   // To copy a function info we need to copy any files and strings over into
477   // this GsymCreator and then copy the function info and update the string
478   // table offsets to match the new offsets.
479   const FunctionInfo &SrcFI = SrcGC.Funcs[FuncIdx];
480   Ranges.insert(SrcFI.Range);
481 
482   FunctionInfo DstFI;
483   DstFI.Range = SrcFI.Range;
484   DstFI.Name = copyString(SrcGC, SrcFI.Name);
485   // Copy the line table if there is one.
486   if (SrcFI.OptLineTable) {
487     // Copy the entire line table.
488     DstFI.OptLineTable = LineTable(SrcFI.OptLineTable.value());
489     // Fixup all LineEntry::File entries which are indexes in the the file table
490     // from SrcGC and must be converted to file indexes from this GsymCreator.
491     LineTable &DstLT = DstFI.OptLineTable.value();
492     const size_t NumLines = DstLT.size();
493     for (size_t I=0; I<NumLines; ++I) {
494       LineEntry &LE = DstLT.get(I);
495       LE.File = copyFile(SrcGC, LE.File);
496     }
497   }
498   // Copy the inline information if needed.
499   if (SrcFI.Inline) {
500     // Make a copy of the source inline information.
501     DstFI.Inline = SrcFI.Inline.value();
502     // Fixup all strings and files in the copied inline information.
503     fixupInlineInfo(SrcGC, *DstFI.Inline);
504   }
505   std::lock_guard<std::mutex> Guard(Mutex);
506   Funcs.push_back(DstFI);
507   return Funcs.back().cacheEncoding();
508 }
509 
510 llvm::Error GsymCreator::saveSegments(StringRef Path,
511                                       llvm::support::endianness ByteOrder,
512                                       uint64_t SegmentSize) const {
513   if (SegmentSize == 0)
514     return createStringError(std::errc::invalid_argument,
515                              "invalid segment size zero");
516 
517   size_t FuncIdx = 0;
518   const size_t NumFuncs = Funcs.size();
519   while (FuncIdx < NumFuncs) {
520     llvm::Expected<std::unique_ptr<GsymCreator>> ExpectedGC =
521         createSegment(SegmentSize, FuncIdx);
522     if (ExpectedGC) {
523       GsymCreator *GC = ExpectedGC->get();
524       if (GC == NULL)
525         break; // We had not more functions to encode.
526       raw_null_ostream ErrorStrm;
527       llvm::Error Err = GC->finalize(ErrorStrm);
528       if (Err)
529         return Err;
530       std::string SegmentedGsymPath;
531       raw_string_ostream SGP(SegmentedGsymPath);
532       std::optional<uint64_t> FirstFuncAddr = GC->getFirstFunctionAddress();
533       if (FirstFuncAddr) {
534         SGP << Path << "-" << llvm::format_hex(*FirstFuncAddr, 1);
535         SGP.flush();
536         Err = GC->save(SegmentedGsymPath, ByteOrder, std::nullopt);
537         if (Err)
538           return Err;
539       }
540     } else {
541       return ExpectedGC.takeError();
542     }
543   }
544   return Error::success();
545 }
546 
547 llvm::Expected<std::unique_ptr<GsymCreator>>
548 GsymCreator::createSegment(uint64_t SegmentSize, size_t &FuncIdx) const {
549   // No function entries, return empty unique pointer
550   if (FuncIdx >= Funcs.size())
551     return std::unique_ptr<GsymCreator>();
552 
553   std::unique_ptr<GsymCreator> GC(new GsymCreator(/*Quiet=*/true));
554   // Set the base address if there is one.
555   if (BaseAddress)
556     GC->setBaseAddress(*BaseAddress);
557   // Copy the UUID value from this object into the new creator.
558   GC->setUUID(UUID);
559   const size_t NumFuncs = Funcs.size();
560   // Track how big the function infos are for the current segment so we can
561   // emit segments that are close to the requested size. It is quick math to
562   // determine the current header and tables sizes, so we can do that each loop.
563   uint64_t SegmentFuncInfosSize = 0;
564   for (; FuncIdx < NumFuncs; ++FuncIdx) {
565     const uint64_t HeaderAndTableSize = GC->calculateHeaderAndTableSize();
566     if (HeaderAndTableSize + SegmentFuncInfosSize >= SegmentSize) {
567       if (SegmentFuncInfosSize == 0)
568         return createStringError(std::errc::invalid_argument,
569                                  "a segment size of %" PRIu64 " is to small to "
570                                  "fit any function infos, specify a larger value",
571                                  SegmentSize);
572 
573       break;
574     }
575     SegmentFuncInfosSize += alignTo(GC->copyFunctionInfo(*this, FuncIdx), 4);
576   }
577   return std::move(GC);
578 }
579