1 //===- SyntheticSections.h -------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLD_MACHO_SYNTHETIC_SECTIONS_H
10 #define LLD_MACHO_SYNTHETIC_SECTIONS_H
11 
12 #include "Config.h"
13 #include "ExportTrie.h"
14 #include "InputSection.h"
15 #include "OutputSection.h"
16 #include "OutputSegment.h"
17 #include "Target.h"
18 #include "Writer.h"
19 
20 #include "llvm/ADT/DenseMap.h"
21 #include "llvm/ADT/Hashing.h"
22 #include "llvm/ADT/Optional.h"
23 #include "llvm/ADT/SetVector.h"
24 #include "llvm/MC/StringTableBuilder.h"
25 #include "llvm/Support/MathExtras.h"
26 #include "llvm/Support/raw_ostream.h"
27 
28 #include <unordered_map>
29 
30 namespace llvm {
31 class DWARFUnit;
32 } // namespace llvm
33 
34 namespace lld {
35 namespace macho {
36 
37 class Defined;
38 class DylibSymbol;
39 class LoadCommand;
40 class ObjFile;
41 class UnwindInfoSection;
42 
43 class SyntheticSection : public OutputSection {
44 public:
45   SyntheticSection(const char *segname, const char *name);
46   virtual ~SyntheticSection() = default;
47 
48   static bool classof(const OutputSection *sec) {
49     return sec->kind() == SyntheticKind;
50   }
51 
52   StringRef segname;
53   // This fake InputSection makes it easier for us to write code that applies
54   // generically to both user inputs and synthetics.
55   InputSection *isec;
56 };
57 
58 // All sections in __LINKEDIT should inherit from this.
59 class LinkEditSection : public SyntheticSection {
60 public:
61   LinkEditSection(const char *segname, const char *name)
62       : SyntheticSection(segname, name) {
63     align = target->wordSize;
64   }
65 
66   // Implementations of this method can assume that the regular (non-__LINKEDIT)
67   // sections already have their addresses assigned.
68   virtual void finalizeContents() {}
69 
70   // Sections in __LINKEDIT are special: their offsets are recorded in the
71   // load commands like LC_DYLD_INFO_ONLY and LC_SYMTAB, instead of in section
72   // headers.
73   bool isHidden() const final { return true; }
74 
75   virtual uint64_t getRawSize() const = 0;
76 
77   // codesign (or more specifically libstuff) checks that each section in
78   // __LINKEDIT ends where the next one starts -- no gaps are permitted. We
79   // therefore align every section's start and end points to WordSize.
80   //
81   // NOTE: This assumes that the extra bytes required for alignment can be
82   // zero-valued bytes.
83   uint64_t getSize() const final { return llvm::alignTo(getRawSize(), align); }
84 };
85 
86 // The header of the Mach-O file, which must have a file offset of zero.
87 class MachHeaderSection final : public SyntheticSection {
88 public:
89   MachHeaderSection();
90   bool isHidden() const override { return true; }
91   uint64_t getSize() const override;
92   void writeTo(uint8_t *buf) const override;
93 
94   void addLoadCommand(LoadCommand *);
95 
96 protected:
97   std::vector<LoadCommand *> loadCommands;
98   uint32_t sizeOfCmds = 0;
99 };
100 
101 // A hidden section that exists solely for the purpose of creating the
102 // __PAGEZERO segment, which is used to catch null pointer dereferences.
103 class PageZeroSection final : public SyntheticSection {
104 public:
105   PageZeroSection();
106   bool isHidden() const override { return true; }
107   bool isNeeded() const override { return target->pageZeroSize != 0; }
108   uint64_t getSize() const override { return target->pageZeroSize; }
109   uint64_t getFileSize() const override { return 0; }
110   void writeTo(uint8_t *buf) const override {}
111 };
112 
113 // This is the base class for the GOT and TLVPointer sections, which are nearly
114 // functionally identical -- they will both be populated by dyld with addresses
115 // to non-lazily-loaded dylib symbols. The main difference is that the
116 // TLVPointerSection stores references to thread-local variables.
117 class NonLazyPointerSectionBase : public SyntheticSection {
118 public:
119   NonLazyPointerSectionBase(const char *segname, const char *name);
120   const llvm::SetVector<const Symbol *> &getEntries() const { return entries; }
121   bool isNeeded() const override { return !entries.empty(); }
122   uint64_t getSize() const override {
123     return entries.size() * target->wordSize;
124   }
125   void writeTo(uint8_t *buf) const override;
126   void addEntry(Symbol *sym);
127   uint64_t getVA(uint32_t gotIndex) const {
128     return addr + gotIndex * target->wordSize;
129   }
130 
131 private:
132   llvm::SetVector<const Symbol *> entries;
133 };
134 
135 class GotSection final : public NonLazyPointerSectionBase {
136 public:
137   GotSection();
138 };
139 
140 class TlvPointerSection final : public NonLazyPointerSectionBase {
141 public:
142   TlvPointerSection();
143 };
144 
145 struct Location {
146   const InputSection *isec;
147   uint64_t offset;
148 
149   Location(const InputSection *isec, uint64_t offset)
150       : isec(isec), offset(offset) {}
151   uint64_t getVA() const { return isec->getVA(offset); }
152 };
153 
154 // Stores rebase opcodes, which tell dyld where absolute addresses have been
155 // encoded in the binary. If the binary is not loaded at its preferred address,
156 // dyld has to rebase these addresses by adding an offset to them.
157 class RebaseSection final : public LinkEditSection {
158 public:
159   RebaseSection();
160   void finalizeContents() override;
161   uint64_t getRawSize() const override { return contents.size(); }
162   bool isNeeded() const override { return !locations.empty(); }
163   void writeTo(uint8_t *buf) const override;
164 
165   void addEntry(const InputSection *isec, uint64_t offset) {
166     if (config->isPic)
167       locations.push_back({isec, offset});
168   }
169 
170 private:
171   std::vector<Location> locations;
172   SmallVector<char, 128> contents;
173 };
174 
175 struct BindingEntry {
176   int64_t addend;
177   Location target;
178   BindingEntry(int64_t addend, Location target)
179       : addend(addend), target(std::move(target)) {}
180 };
181 
182 template <class Sym>
183 using BindingsMap = llvm::DenseMap<Sym, std::vector<BindingEntry>>;
184 
185 // Stores bind opcodes for telling dyld which symbols to load non-lazily.
186 class BindingSection final : public LinkEditSection {
187 public:
188   BindingSection();
189   void finalizeContents() override;
190   uint64_t getRawSize() const override { return contents.size(); }
191   bool isNeeded() const override { return !bindingsMap.empty(); }
192   void writeTo(uint8_t *buf) const override;
193 
194   void addEntry(const Symbol *dysym, const InputSection *isec, uint64_t offset,
195                 int64_t addend = 0) {
196     bindingsMap[dysym].emplace_back(addend, Location(isec, offset));
197   }
198 
199 private:
200   BindingsMap<const Symbol *> bindingsMap;
201   SmallVector<char, 128> contents;
202 };
203 
204 // Stores bind opcodes for telling dyld which weak symbols need coalescing.
205 // There are two types of entries in this section:
206 //
207 //   1) Non-weak definitions: This is a symbol definition that weak symbols in
208 //   other dylibs should coalesce to.
209 //
210 //   2) Weak bindings: These tell dyld that a given symbol reference should
211 //   coalesce to a non-weak definition if one is found. Note that unlike the
212 //   entries in the BindingSection, the bindings here only refer to these
213 //   symbols by name, but do not specify which dylib to load them from.
214 class WeakBindingSection final : public LinkEditSection {
215 public:
216   WeakBindingSection();
217   void finalizeContents() override;
218   uint64_t getRawSize() const override { return contents.size(); }
219   bool isNeeded() const override {
220     return !bindingsMap.empty() || !definitions.empty();
221   }
222 
223   void writeTo(uint8_t *buf) const override;
224 
225   void addEntry(const Symbol *symbol, const InputSection *isec, uint64_t offset,
226                 int64_t addend = 0) {
227     bindingsMap[symbol].emplace_back(addend, Location(isec, offset));
228   }
229 
230   bool hasEntry() const { return !bindingsMap.empty(); }
231 
232   void addNonWeakDefinition(const Defined *defined) {
233     definitions.emplace_back(defined);
234   }
235 
236   bool hasNonWeakDefinition() const { return !definitions.empty(); }
237 
238 private:
239   BindingsMap<const Symbol *> bindingsMap;
240   std::vector<const Defined *> definitions;
241   SmallVector<char, 128> contents;
242 };
243 
244 // The following sections implement lazy symbol binding -- very similar to the
245 // PLT mechanism in ELF.
246 //
247 // ELF's .plt section is broken up into two sections in Mach-O: StubsSection
248 // and StubHelperSection. Calls to functions in dylibs will end up calling into
249 // StubsSection, which contains indirect jumps to addresses stored in the
250 // LazyPointerSection (the counterpart to ELF's .plt.got).
251 //
252 // We will first describe how non-weak symbols are handled.
253 //
254 // At program start, the LazyPointerSection contains addresses that point into
255 // one of the entry points in the middle of the StubHelperSection. The code in
256 // StubHelperSection will push on the stack an offset into the
257 // LazyBindingSection. The push is followed by a jump to the beginning of the
258 // StubHelperSection (similar to PLT0), which then calls into dyld_stub_binder.
259 // dyld_stub_binder is a non-lazily-bound symbol, so this call looks it up in
260 // the GOT.
261 //
262 // The stub binder will look up the bind opcodes in the LazyBindingSection at
263 // the given offset. The bind opcodes will tell the binder to update the
264 // address in the LazyPointerSection to point to the symbol, so that subsequent
265 // calls don't have to redo the symbol resolution. The binder will then jump to
266 // the resolved symbol.
267 //
268 // With weak symbols, the situation is slightly different. Since there is no
269 // "weak lazy" lookup, function calls to weak symbols are always non-lazily
270 // bound. We emit both regular non-lazy bindings as well as weak bindings, in
271 // order that the weak bindings may overwrite the non-lazy bindings if an
272 // appropriate symbol is found at runtime. However, the bound addresses will
273 // still be written (non-lazily) into the LazyPointerSection.
274 
275 class StubsSection final : public SyntheticSection {
276 public:
277   StubsSection();
278   uint64_t getSize() const override;
279   bool isNeeded() const override { return !entries.empty(); }
280   void finalize() override;
281   void writeTo(uint8_t *buf) const override;
282   const llvm::SetVector<Symbol *> &getEntries() const { return entries; }
283   // Returns whether the symbol was added. Note that every stubs entry will
284   // have a corresponding entry in the LazyPointerSection.
285   bool addEntry(Symbol *);
286   uint64_t getVA(uint32_t stubsIndex) const {
287     assert(isFinal || target->usesThunks());
288     // ConcatOutputSection::finalize() can seek the address of a
289     // stub before its address is assigned. Before __stubs is
290     // finalized, return a contrived out-of-range address.
291     return isFinal ? addr + stubsIndex * target->stubSize
292                    : TargetInfo::outOfRangeVA;
293   }
294 
295   bool isFinal = false; // is address assigned?
296 
297 private:
298   llvm::SetVector<Symbol *> entries;
299 };
300 
301 class StubHelperSection final : public SyntheticSection {
302 public:
303   StubHelperSection();
304   uint64_t getSize() const override;
305   bool isNeeded() const override;
306   void writeTo(uint8_t *buf) const override;
307 
308   void setup();
309 
310   DylibSymbol *stubBinder = nullptr;
311   Defined *dyldPrivate = nullptr;
312 };
313 
314 // Note that this section may also be targeted by non-lazy bindings. In
315 // particular, this happens when branch relocations target weak symbols.
316 class LazyPointerSection final : public SyntheticSection {
317 public:
318   LazyPointerSection();
319   uint64_t getSize() const override;
320   bool isNeeded() const override;
321   void writeTo(uint8_t *buf) const override;
322 };
323 
324 class LazyBindingSection final : public LinkEditSection {
325 public:
326   LazyBindingSection();
327   void finalizeContents() override;
328   uint64_t getRawSize() const override { return contents.size(); }
329   bool isNeeded() const override { return !entries.empty(); }
330   void writeTo(uint8_t *buf) const override;
331   // Note that every entry here will by referenced by a corresponding entry in
332   // the StubHelperSection.
333   void addEntry(Symbol *dysym);
334   const llvm::SetVector<Symbol *> &getEntries() const { return entries; }
335 
336 private:
337   uint32_t encode(const Symbol &);
338 
339   llvm::SetVector<Symbol *> entries;
340   SmallVector<char, 128> contents;
341   llvm::raw_svector_ostream os{contents};
342 };
343 
344 // Stores a trie that describes the set of exported symbols.
345 class ExportSection final : public LinkEditSection {
346 public:
347   ExportSection();
348   void finalizeContents() override;
349   uint64_t getRawSize() const override { return size; }
350   bool isNeeded() const override { return size; }
351   void writeTo(uint8_t *buf) const override;
352 
353   bool hasWeakSymbol = false;
354 
355 private:
356   TrieBuilder trieBuilder;
357   size_t size = 0;
358 };
359 
360 // Stores 'data in code' entries that describe the locations of
361 // data regions inside code sections.
362 class DataInCodeSection final : public LinkEditSection {
363 public:
364   DataInCodeSection();
365   void finalizeContents() override;
366   uint64_t getRawSize() const override {
367     return sizeof(llvm::MachO::data_in_code_entry) * entries.size();
368   }
369   void writeTo(uint8_t *buf) const override;
370 
371 private:
372   std::vector<llvm::MachO::data_in_code_entry> entries;
373 };
374 
375 // Stores ULEB128 delta encoded addresses of functions.
376 class FunctionStartsSection final : public LinkEditSection {
377 public:
378   FunctionStartsSection();
379   void finalizeContents() override;
380   uint64_t getRawSize() const override { return contents.size(); }
381   void writeTo(uint8_t *buf) const override;
382 
383 private:
384   SmallVector<char, 128> contents;
385 };
386 
387 // Stores the strings referenced by the symbol table.
388 class StringTableSection final : public LinkEditSection {
389 public:
390   StringTableSection();
391   // Returns the start offset of the added string.
392   uint32_t addString(StringRef);
393   uint64_t getRawSize() const override { return size; }
394   void writeTo(uint8_t *buf) const override;
395 
396   static constexpr size_t emptyStringIndex = 1;
397 
398 private:
399   // ld64 emits string tables which start with a space and a zero byte. We
400   // match its behavior here since some tools depend on it.
401   // Consequently, the empty string will be at index 1, not zero.
402   std::vector<StringRef> strings{" "};
403   size_t size = 2;
404 };
405 
406 struct SymtabEntry {
407   Symbol *sym;
408   size_t strx;
409 };
410 
411 struct StabsEntry {
412   uint8_t type = 0;
413   uint32_t strx = StringTableSection::emptyStringIndex;
414   uint8_t sect = 0;
415   uint16_t desc = 0;
416   uint64_t value = 0;
417 
418   StabsEntry() = default;
419   explicit StabsEntry(uint8_t type) : type(type) {}
420 };
421 
422 // Symbols of the same type must be laid out contiguously: we choose to emit
423 // all local symbols first, then external symbols, and finally undefined
424 // symbols. For each symbol type, the LC_DYSYMTAB load command will record the
425 // range (start index and total number) of those symbols in the symbol table.
426 class SymtabSection : public LinkEditSection {
427 public:
428   void finalizeContents() override;
429   uint32_t getNumSymbols() const;
430   uint32_t getNumLocalSymbols() const {
431     return stabs.size() + localSymbols.size();
432   }
433   uint32_t getNumExternalSymbols() const { return externalSymbols.size(); }
434   uint32_t getNumUndefinedSymbols() const { return undefinedSymbols.size(); }
435 
436 private:
437   void emitBeginSourceStab(StringRef);
438   void emitEndSourceStab();
439   void emitObjectFileStab(ObjFile *);
440   void emitEndFunStab(Defined *);
441   void emitStabs();
442 
443 protected:
444   SymtabSection(StringTableSection &);
445 
446   StringTableSection &stringTableSection;
447   // STABS symbols are always local symbols, but we represent them with special
448   // entries because they may use fields like n_sect and n_desc differently.
449   std::vector<StabsEntry> stabs;
450   std::vector<SymtabEntry> localSymbols;
451   std::vector<SymtabEntry> externalSymbols;
452   std::vector<SymtabEntry> undefinedSymbols;
453 };
454 
455 template <class LP> SymtabSection *makeSymtabSection(StringTableSection &);
456 
457 // The indirect symbol table is a list of 32-bit integers that serve as indices
458 // into the (actual) symbol table. The indirect symbol table is a
459 // concatenation of several sub-arrays of indices, each sub-array belonging to
460 // a separate section. The starting offset of each sub-array is stored in the
461 // reserved1 header field of the respective section.
462 //
463 // These sub-arrays provide symbol information for sections that store
464 // contiguous sequences of symbol references. These references can be pointers
465 // (e.g. those in the GOT and TLVP sections) or assembly sequences (e.g.
466 // function stubs).
467 class IndirectSymtabSection final : public LinkEditSection {
468 public:
469   IndirectSymtabSection();
470   void finalizeContents() override;
471   uint32_t getNumSymbols() const;
472   uint64_t getRawSize() const override {
473     return getNumSymbols() * sizeof(uint32_t);
474   }
475   bool isNeeded() const override;
476   void writeTo(uint8_t *buf) const override;
477 };
478 
479 // The code signature comes at the very end of the linked output file.
480 class CodeSignatureSection final : public LinkEditSection {
481 public:
482   // NOTE: These values are duplicated in llvm-objcopy's MachO/Object.h file
483   // and any changes here, should be repeated there.
484   static constexpr uint8_t blockSizeShift = 12;
485   static constexpr size_t blockSize = (1 << blockSizeShift); // 4 KiB
486   static constexpr size_t hashSize = 256 / 8;
487   static constexpr size_t blobHeadersSize = llvm::alignTo<8>(
488       sizeof(llvm::MachO::CS_SuperBlob) + sizeof(llvm::MachO::CS_BlobIndex));
489   static constexpr uint32_t fixedHeadersSize =
490       blobHeadersSize + sizeof(llvm::MachO::CS_CodeDirectory);
491 
492   uint32_t fileNamePad = 0;
493   uint32_t allHeadersSize = 0;
494   StringRef fileName;
495 
496   CodeSignatureSection();
497   uint64_t getRawSize() const override;
498   bool isNeeded() const override { return true; }
499   void writeTo(uint8_t *buf) const override;
500   uint32_t getBlockCount() const;
501   void writeHashes(uint8_t *buf) const;
502 };
503 
504 class BitcodeBundleSection final : public SyntheticSection {
505 public:
506   BitcodeBundleSection();
507   uint64_t getSize() const override { return xarSize; }
508   void finalize() override;
509   void writeTo(uint8_t *buf) const override;
510 
511 private:
512   llvm::SmallString<261> xarPath;
513   uint64_t xarSize;
514 };
515 
516 class CStringSection : public SyntheticSection {
517 public:
518   CStringSection();
519   void addInput(CStringInputSection *);
520   uint64_t getSize() const override { return size; }
521   virtual void finalizeContents();
522   bool isNeeded() const override { return !inputs.empty(); }
523   void writeTo(uint8_t *buf) const override;
524 
525   std::vector<CStringInputSection *> inputs;
526 
527 private:
528   uint64_t size;
529 };
530 
531 class DeduplicatedCStringSection final : public CStringSection {
532 public:
533   uint64_t getSize() const override { return size; }
534   void finalizeContents() override;
535   void writeTo(uint8_t *buf) const override;
536 
537 private:
538   struct StringOffset {
539     uint8_t trailingZeros;
540     uint64_t outSecOff = UINT64_MAX;
541 
542     explicit StringOffset(uint8_t zeros) : trailingZeros(zeros) {}
543   };
544   llvm::DenseMap<llvm::CachedHashStringRef, StringOffset> stringOffsetMap;
545   size_t size = 0;
546 };
547 
548 /*
549  * This section contains deduplicated literal values. The 16-byte values are
550  * laid out first, followed by the 8- and then the 4-byte ones.
551  */
552 class WordLiteralSection final : public SyntheticSection {
553 public:
554   using UInt128 = std::pair<uint64_t, uint64_t>;
555   // I don't think the standard guarantees the size of a pair, so let's make
556   // sure it's exact -- that way we can construct it via `mmap`.
557   static_assert(sizeof(UInt128) == 16, "");
558 
559   WordLiteralSection();
560   void addInput(WordLiteralInputSection *);
561   void finalizeContents();
562   void writeTo(uint8_t *buf) const override;
563 
564   uint64_t getSize() const override {
565     return literal16Map.size() * 16 + literal8Map.size() * 8 +
566            literal4Map.size() * 4;
567   }
568 
569   bool isNeeded() const override {
570     return !literal16Map.empty() || !literal4Map.empty() ||
571            !literal8Map.empty();
572   }
573 
574   uint64_t getLiteral16Offset(uintptr_t buf) const {
575     return literal16Map.at(*reinterpret_cast<const UInt128 *>(buf)) * 16;
576   }
577 
578   uint64_t getLiteral8Offset(uintptr_t buf) const {
579     return literal16Map.size() * 16 +
580            literal8Map.at(*reinterpret_cast<const uint64_t *>(buf)) * 8;
581   }
582 
583   uint64_t getLiteral4Offset(uintptr_t buf) const {
584     return literal16Map.size() * 16 + literal8Map.size() * 8 +
585            literal4Map.at(*reinterpret_cast<const uint32_t *>(buf)) * 4;
586   }
587 
588 private:
589   std::vector<WordLiteralInputSection *> inputs;
590 
591   template <class T> struct Hasher {
592     llvm::hash_code operator()(T v) const { return llvm::hash_value(v); }
593   };
594   // We're using unordered_map instead of DenseMap here because we need to
595   // support all possible integer values -- there are no suitable tombstone
596   // values for DenseMap.
597   std::unordered_map<UInt128, uint64_t, Hasher<UInt128>> literal16Map;
598   std::unordered_map<uint64_t, uint64_t> literal8Map;
599   std::unordered_map<uint32_t, uint64_t> literal4Map;
600 };
601 
602 class ObjCImageInfoSection final : public SyntheticSection {
603 public:
604   ObjCImageInfoSection();
605   bool isNeeded() const override { return !files.empty(); }
606   uint64_t getSize() const override { return 8; }
607   void addFile(const InputFile *file) {
608     assert(!file->objCImageInfo.empty());
609     files.push_back(file);
610   }
611   void finalizeContents();
612   void writeTo(uint8_t *buf) const override;
613 
614 private:
615   struct ImageInfo {
616     uint8_t swiftVersion = 0;
617     bool hasCategoryClassProperties = false;
618   } info;
619   static ImageInfo parseImageInfo(const InputFile *);
620   std::vector<const InputFile *> files; // files with image info
621 };
622 
623 struct InStruct {
624   const uint8_t *bufferStart = nullptr;
625   MachHeaderSection *header = nullptr;
626   CStringSection *cStringSection = nullptr;
627   WordLiteralSection *wordLiteralSection = nullptr;
628   RebaseSection *rebase = nullptr;
629   BindingSection *binding = nullptr;
630   WeakBindingSection *weakBinding = nullptr;
631   LazyBindingSection *lazyBinding = nullptr;
632   ExportSection *exports = nullptr;
633   GotSection *got = nullptr;
634   TlvPointerSection *tlvPointers = nullptr;
635   LazyPointerSection *lazyPointers = nullptr;
636   StubsSection *stubs = nullptr;
637   StubHelperSection *stubHelper = nullptr;
638   UnwindInfoSection *unwindInfo = nullptr;
639   ObjCImageInfoSection *objCImageInfo = nullptr;
640   ConcatInputSection *imageLoaderCache = nullptr;
641 };
642 
643 extern InStruct in;
644 extern std::vector<SyntheticSection *> syntheticSections;
645 
646 void createSyntheticSymbols();
647 
648 } // namespace macho
649 } // namespace lld
650 
651 #endif
652