//===- RawMemProfReader.cpp - Instrumented memory profiling reader --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains support for reading MemProf profiling data.
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cstdint>
#include <memory>
#include <type_traits>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/BuildID.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/MemProf.h"
#include "llvm/ProfileData/MemProfData.inc"
#include "llvm/ProfileData/RawMemProfReader.h"
#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"

#define DEBUG_TYPE "memprof"

namespace llvm {
namespace memprof {
namespace {
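// Aligned read of a value of type T from Ptr. Used to sanity check that raw
// profile buffers are allocated with sufficient alignment before reading.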
template <class T = uint64_t> inline T alignedRead(const char *Ptr) {
  static_assert(std::is_standard_layout<T>::value && std::is_trivial<T>::value,
                "Not a POD type.");
  assert(reinterpret_cast<uintptr_t>(Ptr) % sizeof(T) == 0 && "Unaligned Read");
  return *reinterpret_cast<const T *>(Ptr);
}

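// Sanity check the raw profile buffer: verify the magic, that the buffer is
// non-empty and large enough to hold a header, that each serialized dump has
// a supported version, and that the per-dump sizes sum to the buffer size.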
Error checkBuffer(const MemoryBuffer &Buffer) {
  if (!RawMemProfReader::hasFormat(Buffer))
    return make_error<InstrProfError>(instrprof_error::bad_magic);

  if (Buffer.getBufferSize() == 0)
    return make_error<InstrProfError>(instrprof_error::empty_raw_profile);

  if (Buffer.getBufferSize() < sizeof(Header)) {
    return make_error<InstrProfError>(instrprof_error::truncated);
  }

  // The size of the buffer can be larger than a single header's TotalSize
  // since we allow repeated serialization of memprof profiles to the same
  // file.
  uint64_t TotalSize = 0;
  const char *Next = Buffer.getBufferStart();
  while (Next < Buffer.getBufferEnd()) {
    auto *H = reinterpret_cast<const Header *>(Next);
    if (H->Version != MEMPROF_RAW_VERSION) {
      return make_error<InstrProfError>(instrprof_error::unsupported_version);
    }

    TotalSize += H->TotalSize;
    Next += H->TotalSize;
  }

  if (Buffer.getBufferSize() != TotalSize) {
    return make_error<InstrProfError>(instrprof_error::malformed);
  }
  return Error::success();
}

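// Read the serialized segment section of a raw profile dump: a count followed
// by that many SegmentEntry records.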
llvm::SmallVector<SegmentEntry> readSegmentEntries(const char *Ptr) {
  using namespace support;

  const uint64_t NumItemsToRead =
      endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
  llvm::SmallVector<SegmentEntry> Items;
  for (uint64_t I = 0; I < NumItemsToRead; I++) {
    Items.push_back(*reinterpret_cast<const SegmentEntry *>(
        Ptr + I * sizeof(SegmentEntry)));
  }
  return Items;
}

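// Read the serialized MIB section of a raw profile dump: a count followed by
// (stack id, MemInfoBlock) pairs.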
llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>>
readMemInfoBlocks(const char *Ptr) {
  using namespace support;

  const uint64_t NumItemsToRead =
      endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
  llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>> Items;
  for (uint64_t I = 0; I < NumItemsToRead; I++) {
    const uint64_t Id =
        endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
    const MemInfoBlock MIB = *reinterpret_cast<const MemInfoBlock *>(Ptr);
    Items.push_back({Id, MIB});
    // Only increment by size of MIB since readNext implicitly increments.
    Ptr += sizeof(MemInfoBlock);
  }
  return Items;
}

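// Read the serialized stack section of a raw profile dump: a count followed
// by, for each stack, a stack id and the list of program counter addresses
// for the frames in that stack.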
CallStackMap readStackInfo(const char *Ptr) {
  using namespace support;

  const uint64_t NumItemsToRead =
      endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
  CallStackMap Items;

  for (uint64_t I = 0; I < NumItemsToRead; I++) {
    const uint64_t StackId =
        endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
    const uint64_t NumPCs =
        endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);

    SmallVector<uint64_t> CallStack;
    for (uint64_t J = 0; J < NumPCs; J++) {
      CallStack.push_back(
          endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr));
    }

    Items[StackId] = CallStack;
  }
  return Items;
}

// Merges the contents of stack information in \p From into \p To. Returns
// true if any stack ids observed previously map to a different set of program
// counter addresses.
bool mergeStackMap(const CallStackMap &From, CallStackMap &To) {
  for (const auto &IdStack : From) {
    auto I = To.find(IdStack.first);
    if (I == To.end()) {
      To[IdStack.first] = IdStack.second;
    } else {
      // Check that the PCs are the same (in order).
      if (IdStack.second != I->second)
        return true;
    }
  }
  return false;
}

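// Wrap the error \p E with a message describing the context in which it
// occurred.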
Error report(Error E, const StringRef Context) {
  return joinErrors(createStringError(inconvertibleErrorCode(), Context),
                    std::move(E));
}

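// Returns true if \p Path names a source file in the memprof runtime. Frames
// from these files are dropped during symbolization since they belong to the
// allocation interceptors rather than to the profiled program.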
bool isRuntimePath(const StringRef Path) {
  const StringRef Filename = llvm::sys::path::filename(Path);
  // This list should be updated in case new files with additional interceptors
  // are added to the memprof runtime.
  return Filename == "memprof_malloc_linux.cpp" ||
         Filename == "memprof_interceptors.cpp" ||
         Filename == "memprof_new_delete.cpp";
}

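// Pretty-print the build id of \p Entry as a hex string.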
std::string getBuildIdString(const SegmentEntry &Entry) {
  // If the build id is unset, print a helpful string instead of all zeros.
  if (Entry.BuildIdSize == 0)
    return "<None>";

  std::string Str;
  raw_string_ostream OS(Str);
  for (size_t I = 0; I < Entry.BuildIdSize; I++) {
    OS << format_hex_no_prefix(Entry.BuildId[I], 2);
  }
  return OS.str();
}
} // namespace

Expected<std::unique_ptr<RawMemProfReader>>
RawMemProfReader::create(const Twine &Path, const StringRef ProfiledBinary,
                         bool KeepName) {
  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path);
  if (std::error_code EC = BufferOr.getError())
    return report(errorCodeToError(EC), Path.getSingleStringRef());

  std::unique_ptr<MemoryBuffer> Buffer(BufferOr.get().release());
  return create(std::move(Buffer), ProfiledBinary, KeepName);
}

Expected<std::unique_ptr<RawMemProfReader>>
RawMemProfReader::create(std::unique_ptr<MemoryBuffer> Buffer,
                         const StringRef ProfiledBinary, bool KeepName) {
  if (Error E = checkBuffer(*Buffer))
    return report(std::move(E), Buffer->getBufferIdentifier());

  if (ProfiledBinary.empty()) {
    // Peek the build ids to print a helpful error message.
    const std::vector<std::string> BuildIds = peekBuildIds(Buffer.get());
    std::string ErrorMessage(
        R"(Path to profiled binary is empty, expected binary with one of the following build ids:
)");
    for (const auto &Id : BuildIds) {
      ErrorMessage += "\n BuildId: ";
      ErrorMessage += Id;
    }
    return report(
        make_error<StringError>(ErrorMessage, inconvertibleErrorCode()),
        /*Context=*/"");
  }

  auto BinaryOr = llvm::object::createBinary(ProfiledBinary);
  if (!BinaryOr) {
    return report(BinaryOr.takeError(), ProfiledBinary);
  }

  // Use new here since the constructor is private.
  std::unique_ptr<RawMemProfReader> Reader(
      new RawMemProfReader(std::move(BinaryOr.get()), KeepName));
  if (Error E = Reader->initialize(std::move(Buffer))) {
    return std::move(E);
  }
  return std::move(Reader);
}

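// Check whether the file at \p Path starts with the raw memprof magic.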
bool RawMemProfReader::hasFormat(const StringRef Path) {
  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path);
  if (!BufferOr)
    return false;

  std::unique_ptr<MemoryBuffer> Buffer(BufferOr.get().release());
  return hasFormat(*Buffer);
}

bool RawMemProfReader::hasFormat(const MemoryBuffer &Buffer) {
  if (Buffer.getBufferSize() < sizeof(uint64_t))
    return false;
  // Aligned read to sanity check that the buffer was allocated with at least
  // 8-byte alignment.
  const uint64_t Magic = alignedRead(Buffer.getBufferStart());
  return Magic == MEMPROF_RAW_MAGIC_64;
}

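// Print a YAML representation of the profile: a summary, the segment
// information recorded at collection time, and the merged per-function
// records.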
void RawMemProfReader::printYAML(raw_ostream &OS) {
  uint64_t NumAllocFunctions = 0, NumMibInfo = 0;
  for (const auto &KV : FunctionProfileData) {
    const size_t NumAllocSites = KV.second.AllocSites.size();
    if (NumAllocSites > 0) {
      NumAllocFunctions++;
      NumMibInfo += NumAllocSites;
    }
  }

  OS << "MemprofProfile:\n";
  OS << "  Summary:\n";
  OS << "    Version: " << MEMPROF_RAW_VERSION << "\n";
  OS << "    NumSegments: " << SegmentInfo.size() << "\n";
  OS << "    NumMibInfo: " << NumMibInfo << "\n";
  OS << "    NumAllocFunctions: " << NumAllocFunctions << "\n";
  OS << "    NumStackOffsets: " << StackMap.size() << "\n";
  // Print out the segment information.
  OS << "  Segments:\n";
  for (const auto &Entry : SegmentInfo) {
    OS << "  -\n";
    OS << "    BuildId: " << getBuildIdString(Entry) << "\n";
    OS << "    Start: 0x" << llvm::utohexstr(Entry.Start) << "\n";
    OS << "    End: 0x" << llvm::utohexstr(Entry.End) << "\n";
    OS << "    Offset: 0x" << llvm::utohexstr(Entry.Offset) << "\n";
  }
  // Print out the merged contents of the profiles.
  OS << "  Records:\n";
  for (const auto &Entry : *this) {
    OS << "  -\n";
    OS << "    FunctionGUID: " << Entry.first << "\n";
    Entry.second.print(OS);
  }
}

Error RawMemProfReader::initialize(std::unique_ptr<MemoryBuffer> DataBuffer) {
  const StringRef FileName = Binary.getBinary()->getFileName();

  auto *ElfObject = dyn_cast<object::ELFObjectFileBase>(Binary.getBinary());
  if (!ElfObject) {
    return report(make_error<StringError>(Twine("Not an ELF file: "),
                                          inconvertibleErrorCode()),
                  FileName);
  }

  // Check whether the profiled binary was built with position independent code
  // (PIC). Perform sanity checks for assumptions we rely on to simplify
  // symbolization.
  auto *Elf64LEObject = llvm::cast<llvm::object::ELF64LEObjectFile>(ElfObject);
  const llvm::object::ELF64LEFile &ElfFile = Elf64LEObject->getELFFile();
  auto PHdrsOr = ElfFile.program_headers();
  if (!PHdrsOr)
    return report(
        make_error<StringError>(Twine("Could not read program headers: "),
                                inconvertibleErrorCode()),
        FileName);

  int NumExecutableSegments = 0;
  for (const auto &Phdr : *PHdrsOr) {
    if (Phdr.p_type == ELF::PT_LOAD) {
      if (Phdr.p_flags & ELF::PF_X) {
        // We assume only one text segment in the main binary for simplicity
        // and to reduce the overhead of checking multiple ranges during
        // symbolization.
        if (++NumExecutableSegments > 1) {
          return report(
              make_error<StringError>(
                  "Expect only one executable load segment in the binary",
                  inconvertibleErrorCode()),
              FileName);
        }
        // The segment will always be loaded at a page boundary, so expect it
        // to be aligned already. Assume a 4K page size for the machine from
        // which the profile was collected. This should be fine for now; in
        // case we want to support other page sizes, the page size can be
        // recorded in the raw profile during collection.
        PreferredTextSegmentAddress = Phdr.p_vaddr;
        assert(Phdr.p_vaddr == (Phdr.p_vaddr & ~(0x1000 - 1U)) &&
               "Expect p_vaddr to always be page aligned");
        assert(Phdr.p_offset == 0 && "Expect p_offset = 0 for symbolization.");
      }
    }
  }

  auto Triple = ElfObject->makeTriple();
  if (!Triple.isX86())
    return report(make_error<StringError>(Twine("Unsupported target: ") +
                                              Triple.getArchName(),
                                          inconvertibleErrorCode()),
                  FileName);

  // Process the raw profile.
  if (Error E = readRawProfile(std::move(DataBuffer)))
    return E;

  if (Error E = setupForSymbolization())
    return E;

  auto *Object = cast<object::ObjectFile>(Binary.getBinary());
  std::unique_ptr<DIContext> Context = DWARFContext::create(
      *Object, DWARFContext::ProcessDebugRelocations::Process);

  auto SOFOr = symbolize::SymbolizableObjectFile::create(
      Object, std::move(Context), /*UntagAddresses=*/false);
  if (!SOFOr)
    return report(SOFOr.takeError(), FileName);
  auto Symbolizer = std::move(SOFOr.get());

  // Ownership of the symbolizer is moved into symbolizeAndFilterStackFrames so
  // that it is freed automatically at the end, when it is no longer used. This
  // reduces peak memory since the symbolizer won't be live while the raw
  // profile is mapped into records afterwards.
  if (Error E = symbolizeAndFilterStackFrames(std::move(Symbolizer)))
    return E;

  return mapRawProfileToRecords();
}

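// Match the build id of the profiled binary against the segment entries from
// the raw profile to determine the profiled text segment address range.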
Error RawMemProfReader::setupForSymbolization() {
  auto *Object = cast<object::ObjectFile>(Binary.getBinary());
  object::BuildIDRef BinaryId = object::getBuildID(Object);
  if (BinaryId.empty())
    return make_error<StringError>(Twine("No build id found in binary ") +
                                       Binary.getBinary()->getFileName(),
                                   inconvertibleErrorCode());

  int NumMatched = 0;
  for (const auto &Entry : SegmentInfo) {
    llvm::ArrayRef<uint8_t> SegmentId(Entry.BuildId, Entry.BuildIdSize);
    if (BinaryId == SegmentId) {
      // We assume only one text segment in the main binary for simplicity and
      // to reduce the overhead of checking multiple ranges during
      // symbolization.
      if (++NumMatched > 1) {
        return make_error<StringError>(
            "We expect only one executable segment in the profiled binary",
            inconvertibleErrorCode());
      }
      ProfiledTextSegmentStart = Entry.Start;
      ProfiledTextSegmentEnd = Entry.End;
    }
  }
  assert(NumMatched != 0 && "No matching executable segments in segment info.");
  assert((PreferredTextSegmentAddress == 0 ||
          (PreferredTextSegmentAddress == ProfiledTextSegmentStart)) &&
         "Expect text segment address to be 0 or equal to profiled text "
         "segment start.");
  return Error::success();
}

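// Convert the symbolized callstacks and MemInfoBlocks into per-function
// IndexedMemProfRecords: each allocation context is attached to the function
// containing the allocation site and to any functions it was inlined into,
// and the callsite locations observed within each function are recorded.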
Error RawMemProfReader::mapRawProfileToRecords() {
  // Hold a mapping from function to each callsite location we encounter within
  // it that is part of some dynamic allocation context. The location is stored
  // as a pointer to a symbolized list of inline frames.
  using LocationPtr = const llvm::SmallVector<FrameId> *;
  llvm::MapVector<GlobalValue::GUID, llvm::SetVector<LocationPtr>>
      PerFunctionCallSites;

  // Convert the raw profile callstack data into memprof records. While doing
  // so, keep track of related contexts so that we can fill these in later.
  for (const auto &Entry : CallstackProfileData) {
    const uint64_t StackId = Entry.first;

    auto It = StackMap.find(StackId);
    if (It == StackMap.end())
      return make_error<InstrProfError>(
          instrprof_error::malformed,
          "memprof callstack record does not contain id: " + Twine(StackId));

    // Construct the symbolized callstack.
    llvm::SmallVector<FrameId> Callstack;
    Callstack.reserve(It->getSecond().size());

    llvm::ArrayRef<uint64_t> Addresses = It->getSecond();
    for (size_t I = 0; I < Addresses.size(); I++) {
      const uint64_t Address = Addresses[I];
      assert(SymbolizedFrame.count(Address) > 0 &&
             "Address not found in SymbolizedFrame map");
      const SmallVector<FrameId> &Frames = SymbolizedFrame[Address];

      assert(!idToFrame(Frames.back()).IsInlineFrame &&
             "The last frame should not be inlined");

      // Record the callsites for each function. Skip the first frame of the
      // first address since it is the allocation site itself, which is
      // recorded separately as an alloc site.
      for (size_t J = 0; J < Frames.size(); J++) {
        if (I == 0 && J == 0)
          continue;
        // We attach the entire bottom-up frame sequence here for the callsite
        // even though we only need the frames up to and including the frame
        // for Frames[J].Function. This will enable better deduplication for
        // compression in the future.
        const GlobalValue::GUID Guid = idToFrame(Frames[J]).Function;
        PerFunctionCallSites[Guid].insert(&Frames);
      }

      // Add all the frames to the current allocation callstack.
      Callstack.append(Frames.begin(), Frames.end());
    }

    // We attach the memprof record to each function bottom-up, up to and
    // including the first non-inline frame.
    for (size_t I = 0; /*Break out using the condition below*/; I++) {
      const Frame &F = idToFrame(Callstack[I]);
      auto Result =
          FunctionProfileData.insert({F.Function, IndexedMemProfRecord()});
      IndexedMemProfRecord &Record = Result.first->second;
      Record.AllocSites.emplace_back(Callstack, Entry.second);

      if (!F.IsInlineFrame)
        break;
    }
  }

  // Fill in the related callsites per function.
  for (const auto &[Id, Locs] : PerFunctionCallSites) {
    // Some functions may have only callsite data and no allocation data. Here
    // we insert a new entry for callsite data if we need to.
    auto Result = FunctionProfileData.insert({Id, IndexedMemProfRecord()});
    IndexedMemProfRecord &Record = Result.first->second;
    for (LocationPtr Loc : Locs) {
      Record.CallSites.push_back(*Loc);
    }
  }

  return Error::success();
}

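// Symbolize each unique address in the stack map, caching the resulting
// inline frame ids, and drop addresses (and, if emptied, whole call stacks)
// that cannot be symbolized or that belong to the memprof runtime.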
Error RawMemProfReader::symbolizeAndFilterStackFrames(
    std::unique_ptr<llvm::symbolize::SymbolizableModule> Symbolizer) {
  // The specifier to use when symbolization is requested.
  const DILineInfoSpecifier Specifier(
      DILineInfoSpecifier::FileLineInfoKind::RawValue,
      DILineInfoSpecifier::FunctionNameKind::LinkageName);

  // For entries where all PCs in the callstack are discarded, we erase the
  // entry from the stack map.
  llvm::SmallVector<uint64_t> EntriesToErase;
  // We keep track of all prior discarded entries so that we can avoid invoking
  // the symbolizer for such entries.
  llvm::DenseSet<uint64_t> AllVAddrsToDiscard;
  for (auto &Entry : StackMap) {
    for (const uint64_t VAddr : Entry.getSecond()) {
      // Check if we have already symbolized and cached the result or if we
      // don't want to attempt symbolization since we know this address is bad.
      // In this case the address is also removed from the current callstack.
      if (SymbolizedFrame.count(VAddr) > 0 ||
          AllVAddrsToDiscard.contains(VAddr))
        continue;

      Expected<DIInliningInfo> DIOr = Symbolizer->symbolizeInlinedCode(
          getModuleOffset(VAddr), Specifier, /*UseSymbolTable=*/false);
      if (!DIOr)
        return DIOr.takeError();
      DIInliningInfo DI = DIOr.get();

      // Drop frames which we can't symbolize or if they belong to the runtime.
      if (DI.getFrame(0).FunctionName == DILineInfo::BadString ||
          isRuntimePath(DI.getFrame(0).FileName)) {
        AllVAddrsToDiscard.insert(VAddr);
        continue;
      }

      for (size_t I = 0, NumFrames = DI.getNumberOfFrames(); I < NumFrames;
           I++) {
        const auto &DIFrame = DI.getFrame(I);
        const uint64_t Guid =
            IndexedMemProfRecord::getGUID(DIFrame.FunctionName);
        const Frame F(Guid, DIFrame.Line - DIFrame.StartLine, DIFrame.Column,
                      // Only the last entry is not an inlined location.
                      I != NumFrames - 1);
        // Here we retain a mapping from the GUID to canonical symbol name
        // instead of adding it to the frame object directly to reduce memory
        // overhead. This is because there can be many unique frames,
        // particularly for callsite frames.
        if (KeepSymbolName) {
          StringRef CanonicalName =
              sampleprof::FunctionSamples::getCanonicalFnName(
                  DIFrame.FunctionName);
          GuidToSymbolName.insert({Guid, CanonicalName.str()});
        }

        const FrameId Hash = F.hash();
        IdToFrame.insert({Hash, F});
        SymbolizedFrame[VAddr].push_back(Hash);
      }
    }

    auto &CallStack = Entry.getSecond();
    llvm::erase_if(CallStack, [&AllVAddrsToDiscard](const uint64_t A) {
      return AllVAddrsToDiscard.contains(A);
    });
    if (CallStack.empty())
      EntriesToErase.push_back(Entry.getFirst());
  }

  // Drop the entries where the callstack is empty.
  for (const uint64_t Id : EntriesToErase) {
    StackMap.erase(Id);
    CallstackProfileData.erase(Id);
  }

  if (StackMap.empty())
    return make_error<InstrProfError>(
        instrprof_error::malformed,
        "no entries in callstack map after symbolization");

  return Error::success();
}

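// Scan the raw profile for the build ids recorded in its segment information
// without parsing the rest of the profile.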
std::vector<std::string>
RawMemProfReader::peekBuildIds(MemoryBuffer *DataBuffer) {
  const char *Next = DataBuffer->getBufferStart();
  // Use a set + vector since a profile file may contain multiple raw profile
  // dumps, each with segment information. We want them unique and in the
  // order they were stored in the profile; the profiled binary should be the
  // first entry. The runtime uses dl_iterate_phdr and the "... first object
  // visited by callback is the main program."
  // https://man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html
  std::vector<std::string> BuildIds;
  llvm::SmallSet<std::string, 10> BuildIdsSet;
  while (Next < DataBuffer->getBufferEnd()) {
    auto *Header = reinterpret_cast<const memprof::Header *>(Next);

    const llvm::SmallVector<SegmentEntry> Entries =
        readSegmentEntries(Next + Header->SegmentOffset);

    for (const auto &Entry : Entries) {
      const std::string Id = getBuildIdString(Entry);
      if (BuildIdsSet.contains(Id))
        continue;
      BuildIds.push_back(Id);
      BuildIdsSet.insert(Id);
    }

    Next += Header->TotalSize;
  }
  return BuildIds;
}

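// Read all the raw profile dumps serialized into the buffer, verifying that
// the segment information matches across dumps and merging MemInfoBlocks and
// call stacks by stack id.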
Error RawMemProfReader::readRawProfile(
    std::unique_ptr<MemoryBuffer> DataBuffer) {
  const char *Next = DataBuffer->getBufferStart();

  while (Next < DataBuffer->getBufferEnd()) {
    auto *Header = reinterpret_cast<const memprof::Header *>(Next);

    // Read in the segment information and check whether it is the same across
    // all profiles in this binary file.
    const llvm::SmallVector<SegmentEntry> Entries =
        readSegmentEntries(Next + Header->SegmentOffset);
    if (!SegmentInfo.empty() && SegmentInfo != Entries) {
      // We do not expect segment information to change when deserializing from
      // the same binary profile file. This can happen if dynamic libraries are
      // loaded/unloaded between profile dumps.
      return make_error<InstrProfError>(
          instrprof_error::malformed,
          "memprof raw profile has different segment information");
    }
    SegmentInfo.assign(Entries.begin(), Entries.end());

    // Read in the MemInfoBlocks. Merge them based on stack id; we assume that
    // raw profiles in the same binary file are from the same process, so the
    // stack depot ids are the same.
    for (const auto &Value : readMemInfoBlocks(Next + Header->MIBOffset)) {
      if (CallstackProfileData.count(Value.first)) {
        CallstackProfileData[Value.first].Merge(Value.second);
      } else {
        CallstackProfileData[Value.first] = Value.second;
      }
    }

    // Read in the callstack for each id. For multiple raw profiles in the
    // same file, we expect that the callstack is the same for a unique id.
    const CallStackMap CSM = readStackInfo(Next + Header->StackOffset);
    if (StackMap.empty()) {
      StackMap = CSM;
    } else {
      if (mergeStackMap(CSM, StackMap))
        return make_error<InstrProfError>(
            instrprof_error::malformed,
            "memprof raw profile got different call stack for same id");
    }

    Next += Header->TotalSize;
  }

  return Error::success();
}

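// Translate a virtual address from the profile into an address that can be
// symbolized against the binary, adjusting for the PIE load offset.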
object::SectionedAddress
RawMemProfReader::getModuleOffset(const uint64_t VirtualAddress) {
  if (VirtualAddress > ProfiledTextSegmentStart &&
      VirtualAddress <= ProfiledTextSegmentEnd) {
    // For PIE binaries, the preferred address is zero and we adjust the
    // virtual address by the start of the profiled segment, assuming that the
    // offset of the segment in the binary is zero. For non-PIE binaries the
    // preferred and profiled segment addresses should be equal and this is a
    // no-op.
    const uint64_t AdjustedAddress =
        VirtualAddress + PreferredTextSegmentAddress - ProfiledTextSegmentStart;
    return object::SectionedAddress{AdjustedAddress};
  }
  // Addresses which do not originate from the profiled text segment in the
  // binary are not adjusted. These will fail symbolization and be filtered out
  // during processing.
  return object::SectionedAddress{VirtualAddress};
}

Error RawMemProfReader::readNextRecord(
    GuidMemProfRecordPair &GuidRecord,
    std::function<const Frame(const FrameId)> Callback) {
  // Create a new callback for the RawMemProfRecord iterator so that we can
  // provide the symbol name if the reader was initialized with KeepSymbolName =
  // true. This is useful for debugging and testing.
  auto IdToFrameCallback = [this](const FrameId Id) {
    Frame F = this->idToFrame(Id);
    if (!this->KeepSymbolName)
      return F;
    auto Iter = this->GuidToSymbolName.find(F.Function);
    assert(Iter != this->GuidToSymbolName.end());
    F.SymbolName = Iter->getSecond();
    return F;
  };
  return MemProfReader::readNextRecord(GuidRecord, IdToFrameCallback);
}
} // namespace memprof
} // namespace llvm