1 //===- ExportTrie.cpp -----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is a partial implementation of the Mach-O export trie format. It's
10 // essentially a symbol table encoded as a compressed prefix trie, meaning that
11 // the common prefixes of each symbol name are shared for a more compact
12 // representation. The prefixes are stored on the edges of the trie, and one
13 // edge can represent multiple characters. For example, given two exported
14 // symbols _bar and _baz, we will have a trie like this (terminal nodes are
15 // marked with an asterisk):
16 //
17 //              +-+-+
18 //              |   | // root node
19 //              +-+-+
20 //                |
21 //                | _ba
22 //                |
23 //              +-+-+
24 //              |   |
25 //              +-+-+
26 //           r /     \ z
27 //            /       \
28 //        +-+-+       +-+-+
29 //        | * |       | * |
30 //        +-+-+       +-+-+
31 //
32 // More documentation of the format can be found in
33 // llvm/tools/obj2yaml/macho2yaml.cpp.
34 //
35 //===----------------------------------------------------------------------===//
36 
37 #include "ExportTrie.h"
38 #include "Symbols.h"
39 
40 #include "lld/Common/ErrorHandler.h"
41 #include "lld/Common/Memory.h"
42 #include "llvm/ADT/Optional.h"
43 #include "llvm/BinaryFormat/MachO.h"
44 #include "llvm/Support/LEB128.h"
45 
46 using namespace llvm;
47 using namespace lld;
48 using namespace lld::macho;
49 
50 namespace {
51 
52 struct Edge {
53   Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
54 
55   StringRef substring;
56   struct TrieNode *child;
57 };
58 
59 struct ExportInfo {
60   uint64_t address;
61   uint8_t flags = 0;
62   ExportInfo(const Symbol &sym, uint64_t imageBase)
63       : address(sym.getVA() - imageBase) {
64     using namespace llvm::MachO;
65     // Set the symbol type.
66     if (sym.isWeakDef())
67       flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
68     // TODO: Add proper support for re-exports & stub-and-resolver flags.
69 
70     // Set the symbol kind.
71     if (sym.isTlv()) {
72       flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
73     } else if (auto *defined = dyn_cast<Defined>(&sym)) {
74       if (defined->isAbsolute())
75         flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
76     }
77   }
78 };
79 
80 } // namespace
81 
82 struct macho::TrieNode {
83   std::vector<Edge> edges;
84   Optional<ExportInfo> info;
85   // Estimated offset from the start of the serialized trie to the current node.
86   // This will converge to the true offset when updateOffset() is run to a
87   // fixpoint.
88   size_t offset = 0;
89 
90   // Returns whether the new estimated offset differs from the old one.
91   bool updateOffset(size_t &nextOffset);
92   void writeTo(uint8_t *buf) const;
93 };
94 
95 bool TrieNode::updateOffset(size_t &nextOffset) {
96   // Size of the whole node (including the terminalSize and the outgoing edges.)
97   // In contrast, terminalSize only records the size of the other data in the
98   // node.
99   size_t nodeSize;
100   if (info) {
101     uint32_t terminalSize =
102         getULEB128Size(info->flags) + getULEB128Size(info->address);
103     // Overall node size so far is the uleb128 size of the length of the symbol
104     // info + the symbol info itself.
105     nodeSize = terminalSize + getULEB128Size(terminalSize);
106   } else {
107     nodeSize = 1; // Size of terminalSize (which has a value of 0)
108   }
109   // Compute size of all child edges.
110   ++nodeSize; // Byte for number of children.
111   for (const Edge &edge : edges) {
112     nodeSize += edge.substring.size() + 1             // String length.
113                 + getULEB128Size(edge.child->offset); // Offset len.
114   }
115   // On input, 'nextOffset' is the new preferred location for this node.
116   bool result = (offset != nextOffset);
117   // Store new location in node object for use by parents.
118   offset = nextOffset;
119   nextOffset += nodeSize;
120   return result;
121 }
122 
123 void TrieNode::writeTo(uint8_t *buf) const {
124   buf += offset;
125   if (info) {
126     // TrieNodes with Symbol info: size, flags address
127     uint32_t terminalSize =
128         getULEB128Size(info->flags) + getULEB128Size(info->address);
129     buf += encodeULEB128(terminalSize, buf);
130     buf += encodeULEB128(info->flags, buf);
131     buf += encodeULEB128(info->address, buf);
132   } else {
133     // TrieNode with no Symbol info.
134     *buf++ = 0; // terminalSize
135   }
136   // Add number of children. TODO: Handle case where we have more than 256.
137   assert(edges.size() < 256);
138   *buf++ = edges.size();
139   // Append each child edge substring and node offset.
140   for (const Edge &edge : edges) {
141     memcpy(buf, edge.substring.data(), edge.substring.size());
142     buf += edge.substring.size();
143     *buf++ = '\0';
144     buf += encodeULEB128(edge.child->offset, buf);
145   }
146 }
147 
148 TrieBuilder::~TrieBuilder() {
149   for (TrieNode *node : nodes)
150     delete node;
151 }
152 
153 TrieNode *TrieBuilder::makeNode() {
154   auto *node = new TrieNode();
155   nodes.emplace_back(node);
156   return node;
157 }
158 
159 static int charAt(const Symbol *sym, size_t pos) {
160   StringRef str = sym->getName();
161   if (pos >= str.size())
162     return -1;
163   return str[pos];
164 }
165 
166 // Build the trie by performing a three-way radix quicksort: We start by sorting
167 // the strings by their first characters, then sort the strings with the same
168 // first characters by their second characters, and so on recursively. Each
169 // time the prefixes diverge, we add a node to the trie.
170 //
171 // node:    The most recently created node along this path in the trie (i.e.
172 //          the furthest from the root.)
173 // lastPos: The prefix length of the most recently created node, i.e. the number
174 //          of characters along its path from the root.
175 // pos:     The string index we are currently sorting on. Note that each symbol
176 //          S contained in vec has the same prefix S[0...pos).
177 void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
178                                TrieNode *node, size_t lastPos, size_t pos) {
179 tailcall:
180   if (vec.empty())
181     return;
182 
183   // Partition items so that items in [0, i) are less than the pivot,
184   // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
185   // the pivot.
186   const Symbol *pivotSymbol = vec[vec.size() / 2];
187   int pivot = charAt(pivotSymbol, pos);
188   size_t i = 0;
189   size_t j = vec.size();
190   for (size_t k = 0; k < j;) {
191     int c = charAt(vec[k], pos);
192     if (c < pivot)
193       std::swap(vec[i++], vec[k++]);
194     else if (c > pivot)
195       std::swap(vec[--j], vec[k]);
196     else
197       k++;
198   }
199 
200   bool isTerminal = pivot == -1;
201   bool prefixesDiverge = i != 0 || j != vec.size();
202   if (lastPos != pos && (isTerminal || prefixesDiverge)) {
203     TrieNode *newNode = makeNode();
204     node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
205                              newNode);
206     node = newNode;
207     lastPos = pos;
208   }
209 
210   sortAndBuild(vec.slice(0, i), node, lastPos, pos);
211   sortAndBuild(vec.slice(j), node, lastPos, pos);
212 
213   if (isTerminal) {
214     assert(j - i == 1); // no duplicate symbols
215     node->info = ExportInfo(*pivotSymbol, imageBase);
216   } else {
217     // This is the tail-call-optimized version of the following:
218     // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
219     vec = vec.slice(i, j - i);
220     ++pos;
221     goto tailcall;
222   }
223 }
224 
225 size_t TrieBuilder::build() {
226   if (exported.empty())
227     return 0;
228 
229   TrieNode *root = makeNode();
230   sortAndBuild(exported, root, 0, 0);
231 
232   // Assign each node in the vector an offset in the trie stream, iterating
233   // until all uleb128 sizes have stabilized.
234   size_t offset;
235   bool more;
236   do {
237     offset = 0;
238     more = false;
239     for (TrieNode *node : nodes)
240       more |= node->updateOffset(offset);
241   } while (more);
242 
243   return offset;
244 }
245 
246 void TrieBuilder::writeTo(uint8_t *buf) const {
247   for (TrieNode *node : nodes)
248     node->writeTo(buf);
249 }
250 
251 namespace {
252 
253 // Parse a serialized trie and invoke a callback for each entry.
254 class TrieParser {
255 public:
256   TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
257       : start(buf), end(start + size), callback(callback) {}
258 
259   void parse(const uint8_t *buf, const Twine &cumulativeString);
260 
261   void parse() { parse(start, ""); }
262 
263   const uint8_t *start;
264   const uint8_t *end;
265   const TrieEntryCallback &callback;
266 };
267 
268 } // namespace
269 
270 void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
271   if (buf >= end)
272     fatal("Node offset points outside export section");
273 
274   unsigned ulebSize;
275   uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
276   buf += ulebSize;
277   uint64_t flags = 0;
278   size_t offset;
279   if (terminalSize != 0) {
280     flags = decodeULEB128(buf, &ulebSize);
281     callback(cumulativeString, flags);
282   }
283   buf += terminalSize;
284   uint8_t numEdges = *buf++;
285   for (uint8_t i = 0; i < numEdges; ++i) {
286     const char *cbuf = reinterpret_cast<const char *>(buf);
287     StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
288     buf += substring.size() + 1;
289     offset = decodeULEB128(buf, &ulebSize);
290     buf += ulebSize;
291     parse(start + offset, cumulativeString + substring);
292   }
293 }
294 
295 void macho::parseTrie(const uint8_t *buf, size_t size,
296                       const TrieEntryCallback &callback) {
297   if (size == 0)
298     return;
299 
300   TrieParser(buf, size, callback).parse();
301 }
302