1 //===- ExportTrie.cpp -----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is a partial implementation of the Mach-O export trie format. It's
10 // essentially a symbol table encoded as a compressed prefix trie, meaning that
11 // the common prefixes of each symbol name are shared for a more compact
12 // representation. The prefixes are stored on the edges of the trie, and one
13 // edge can represent multiple characters. For example, given two exported
14 // symbols _bar and _baz, we will have a trie like this (terminal nodes are
15 // marked with an asterisk):
16 //
17 //              +-+-+
18 //              |   | // root node
19 //              +-+-+
20 //                |
21 //                | _ba
22 //                |
23 //              +-+-+
24 //              |   |
25 //              +-+-+
26 //           r /     \ z
27 //            /       \
28 //        +-+-+       +-+-+
29 //        | * |       | * |
30 //        +-+-+       +-+-+
31 //
32 // More documentation of the format can be found in
33 // llvm/tools/obj2yaml/macho2yaml.cpp.
34 //
35 //===----------------------------------------------------------------------===//
36 
37 #include "ExportTrie.h"
38 #include "Symbols.h"
39 
40 #include "lld/Common/ErrorHandler.h"
41 #include "lld/Common/Memory.h"
42 #include "llvm/ADT/Optional.h"
43 #include "llvm/BinaryFormat/MachO.h"
44 #include "llvm/Support/LEB128.h"
45 
46 using namespace llvm;
47 using namespace lld;
48 using namespace lld::macho;
49 
50 namespace {
51 
52 struct Edge {
53   Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
54 
55   StringRef substring;
56   struct TrieNode *child;
57 };
58 
59 struct ExportInfo {
60   uint64_t address;
61   uint8_t flags = 0;
62   ExportInfo(const Symbol &sym, uint64_t imageBase)
63       : address(sym.getVA() - imageBase) {
64     using namespace llvm::MachO;
65     // Set the symbol type.
66     if (sym.isWeakDef())
67       flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
68     // TODO: Add proper support for re-exports & stub-and-resolver flags.
69 
70     // Set the symbol kind.
71     if (sym.isTlv()) {
72       flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
73     } else if (auto *defined = dyn_cast<Defined>(&sym)) {
74       if (defined->isAbsolute())
75         flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
76     }
77   }
78 };
79 
80 } // namespace
81 
82 struct macho::TrieNode {
83   std::vector<Edge> edges;
84   Optional<ExportInfo> info;
85   // Estimated offset from the start of the serialized trie to the current node.
86   // This will converge to the true offset when updateOffset() is run to a
87   // fixpoint.
88   size_t offset = 0;
89 
90   // Returns whether the new estimated offset differs from the old one.
91   bool updateOffset(size_t &nextOffset);
92   void writeTo(uint8_t *buf) const;
93 };
94 
95 bool TrieNode::updateOffset(size_t &nextOffset) {
96   // Size of the whole node (including the terminalSize and the outgoing edges.)
97   // In contrast, terminalSize only records the size of the other data in the
98   // node.
99   size_t nodeSize;
100   if (info) {
101     uint32_t terminalSize =
102         getULEB128Size(info->flags) + getULEB128Size(info->address);
103     // Overall node size so far is the uleb128 size of the length of the symbol
104     // info + the symbol info itself.
105     nodeSize = terminalSize + getULEB128Size(terminalSize);
106   } else {
107     nodeSize = 1; // Size of terminalSize (which has a value of 0)
108   }
109   // Compute size of all child edges.
110   ++nodeSize; // Byte for number of children.
111   for (const Edge &edge : edges) {
112     nodeSize += edge.substring.size() + 1             // String length.
113                 + getULEB128Size(edge.child->offset); // Offset len.
114   }
115   // On input, 'nextOffset' is the new preferred location for this node.
116   bool result = (offset != nextOffset);
117   // Store new location in node object for use by parents.
118   offset = nextOffset;
119   nextOffset += nodeSize;
120   return result;
121 }
122 
123 void TrieNode::writeTo(uint8_t *buf) const {
124   buf += offset;
125   if (info) {
126     // TrieNodes with Symbol info: size, flags address
127     uint32_t terminalSize =
128         getULEB128Size(info->flags) + getULEB128Size(info->address);
129     buf += encodeULEB128(terminalSize, buf);
130     buf += encodeULEB128(info->flags, buf);
131     buf += encodeULEB128(info->address, buf);
132   } else {
133     // TrieNode with no Symbol info.
134     *buf++ = 0; // terminalSize
135   }
136   // Add number of children. TODO: Handle case where we have more than 256.
137   assert(edges.size() < 256);
138   *buf++ = edges.size();
139   // Append each child edge substring and node offset.
140   for (const Edge &edge : edges) {
141     memcpy(buf, edge.substring.data(), edge.substring.size());
142     buf += edge.substring.size();
143     *buf++ = '\0';
144     buf += encodeULEB128(edge.child->offset, buf);
145   }
146 }
147 
148 TrieNode *TrieBuilder::makeNode() {
149   auto *node = make<TrieNode>();
150   nodes.emplace_back(node);
151   return node;
152 }
153 
154 static int charAt(const Symbol *sym, size_t pos) {
155   StringRef str = sym->getName();
156   if (pos >= str.size())
157     return -1;
158   return str[pos];
159 }
160 
161 // Build the trie by performing a three-way radix quicksort: We start by sorting
162 // the strings by their first characters, then sort the strings with the same
163 // first characters by their second characters, and so on recursively. Each
164 // time the prefixes diverge, we add a node to the trie.
165 //
166 // node:    The most recently created node along this path in the trie (i.e.
167 //          the furthest from the root.)
168 // lastPos: The prefix length of the most recently created node, i.e. the number
169 //          of characters along its path from the root.
170 // pos:     The string index we are currently sorting on. Note that each symbol
171 //          S contained in vec has the same prefix S[0...pos).
172 void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
173                                TrieNode *node, size_t lastPos, size_t pos) {
174 tailcall:
175   if (vec.empty())
176     return;
177 
178   // Partition items so that items in [0, i) are less than the pivot,
179   // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
180   // the pivot.
181   const Symbol *pivotSymbol = vec[vec.size() / 2];
182   int pivot = charAt(pivotSymbol, pos);
183   size_t i = 0;
184   size_t j = vec.size();
185   for (size_t k = 0; k < j;) {
186     int c = charAt(vec[k], pos);
187     if (c < pivot)
188       std::swap(vec[i++], vec[k++]);
189     else if (c > pivot)
190       std::swap(vec[--j], vec[k]);
191     else
192       k++;
193   }
194 
195   bool isTerminal = pivot == -1;
196   bool prefixesDiverge = i != 0 || j != vec.size();
197   if (lastPos != pos && (isTerminal || prefixesDiverge)) {
198     TrieNode *newNode = makeNode();
199     node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
200                              newNode);
201     node = newNode;
202     lastPos = pos;
203   }
204 
205   sortAndBuild(vec.slice(0, i), node, lastPos, pos);
206   sortAndBuild(vec.slice(j), node, lastPos, pos);
207 
208   if (isTerminal) {
209     assert(j - i == 1); // no duplicate symbols
210     node->info = ExportInfo(*pivotSymbol, imageBase);
211   } else {
212     // This is the tail-call-optimized version of the following:
213     // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
214     vec = vec.slice(i, j - i);
215     ++pos;
216     goto tailcall;
217   }
218 }
219 
220 size_t TrieBuilder::build() {
221   if (exported.empty())
222     return 0;
223 
224   TrieNode *root = makeNode();
225   sortAndBuild(exported, root, 0, 0);
226 
227   // Assign each node in the vector an offset in the trie stream, iterating
228   // until all uleb128 sizes have stabilized.
229   size_t offset;
230   bool more;
231   do {
232     offset = 0;
233     more = false;
234     for (TrieNode *node : nodes)
235       more |= node->updateOffset(offset);
236   } while (more);
237 
238   return offset;
239 }
240 
241 void TrieBuilder::writeTo(uint8_t *buf) const {
242   for (TrieNode *node : nodes)
243     node->writeTo(buf);
244 }
245 
246 namespace {
247 
248 // Parse a serialized trie and invoke a callback for each entry.
249 class TrieParser {
250 public:
251   TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
252       : start(buf), end(start + size), callback(callback) {}
253 
254   void parse(const uint8_t *buf, const Twine &cumulativeString);
255 
256   void parse() { parse(start, ""); }
257 
258   const uint8_t *start;
259   const uint8_t *end;
260   const TrieEntryCallback &callback;
261 };
262 
263 } // namespace
264 
265 void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
266   if (buf >= end)
267     fatal("Node offset points outside export section");
268 
269   unsigned ulebSize;
270   uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
271   buf += ulebSize;
272   uint64_t flags = 0;
273   size_t offset;
274   if (terminalSize != 0) {
275     flags = decodeULEB128(buf, &ulebSize);
276     callback(cumulativeString, flags);
277   }
278   buf += terminalSize;
279   uint8_t numEdges = *buf++;
280   for (uint8_t i = 0; i < numEdges; ++i) {
281     const char *cbuf = reinterpret_cast<const char *>(buf);
282     StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
283     buf += substring.size() + 1;
284     offset = decodeULEB128(buf, &ulebSize);
285     buf += ulebSize;
286     parse(start + offset, cumulativeString + substring);
287   }
288 }
289 
290 void macho::parseTrie(const uint8_t *buf, size_t size,
291                       const TrieEntryCallback &callback) {
292   if (size == 0)
293     return;
294 
295   TrieParser(buf, size, callback).parse();
296 }
297