1 //===- ExportTrie.cpp -----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is a partial implementation of the Mach-O export trie format. It's
10 // essentially a symbol table encoded as a compressed prefix trie, meaning that
11 // the common prefixes of each symbol name are shared for a more compact
12 // representation. The prefixes are stored on the edges of the trie, and one
13 // edge can represent multiple characters. For example, given two exported
14 // symbols _bar and _baz, we will have a trie like this (terminal nodes are
15 // marked with an asterisk):
16 //
17 // +-+-+
18 // | | // root node
19 // +-+-+
20 // |
21 // | _ba
22 // |
23 // +-+-+
24 // | |
25 // +-+-+
26 // r / \ z
27 // / \
28 // +-+-+ +-+-+
29 // | * | | * |
30 // +-+-+ +-+-+
31 //
32 // More documentation of the format can be found in
33 // llvm/tools/obj2yaml/macho2yaml.cpp.
34 //
35 //===----------------------------------------------------------------------===//
36
37 #include "ExportTrie.h"
38 #include "Symbols.h"
39
40 #include "lld/Common/ErrorHandler.h"
41 #include "lld/Common/Memory.h"
42 #include "llvm/ADT/Optional.h"
43 #include "llvm/BinaryFormat/MachO.h"
44 #include "llvm/Support/LEB128.h"
45
46 using namespace llvm;
47 using namespace llvm::MachO;
48 using namespace lld;
49 using namespace lld::macho;
50
51 namespace {
52
53 struct Edge {
Edge__anonbdcd48ea0111::Edge54 Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
55
56 StringRef substring;
57 struct TrieNode *child;
58 };
59
60 struct ExportInfo {
61 uint64_t address;
62 // TODO: Add proper support for re-exports & stub-and-resolver flags.
63 };
64
65 } // namespace
66
67 struct macho::TrieNode {
68 std::vector<Edge> edges;
69 Optional<ExportInfo> info;
70 // Estimated offset from the start of the serialized trie to the current node.
71 // This will converge to the true offset when updateOffset() is run to a
72 // fixpoint.
73 size_t offset = 0;
74
75 // Returns whether the new estimated offset differs from the old one.
76 bool updateOffset(size_t &nextOffset);
77 void writeTo(uint8_t *buf) const;
78 };
79
updateOffset(size_t & nextOffset)80 bool TrieNode::updateOffset(size_t &nextOffset) {
81 // Size of the whole node (including the terminalSize and the outgoing edges.)
82 // In contrast, terminalSize only records the size of the other data in the
83 // node.
84 size_t nodeSize;
85 if (info) {
86 uint64_t flags = 0;
87 uint32_t terminalSize =
88 getULEB128Size(flags) + getULEB128Size(info->address);
89 // Overall node size so far is the uleb128 size of the length of the symbol
90 // info + the symbol info itself.
91 nodeSize = terminalSize + getULEB128Size(terminalSize);
92 } else {
93 nodeSize = 1; // Size of terminalSize (which has a value of 0)
94 }
95 // Compute size of all child edges.
96 ++nodeSize; // Byte for number of children.
97 for (Edge &edge : edges) {
98 nodeSize += edge.substring.size() + 1 // String length.
99 + getULEB128Size(edge.child->offset); // Offset len.
100 }
101 // On input, 'nextOffset' is the new preferred location for this node.
102 bool result = (offset != nextOffset);
103 // Store new location in node object for use by parents.
104 offset = nextOffset;
105 nextOffset += nodeSize;
106 return result;
107 }
108
writeTo(uint8_t * buf) const109 void TrieNode::writeTo(uint8_t *buf) const {
110 buf += offset;
111 if (info) {
112 // TrieNodes with Symbol info: size, flags address
113 uint64_t flags = 0; // TODO: emit proper flags
114 uint32_t terminalSize =
115 getULEB128Size(flags) + getULEB128Size(info->address);
116 buf += encodeULEB128(terminalSize, buf);
117 buf += encodeULEB128(flags, buf);
118 buf += encodeULEB128(info->address, buf);
119 } else {
120 // TrieNode with no Symbol info.
121 *buf++ = 0; // terminalSize
122 }
123 // Add number of children. TODO: Handle case where we have more than 256.
124 assert(edges.size() < 256);
125 *buf++ = edges.size();
126 // Append each child edge substring and node offset.
127 for (const Edge &edge : edges) {
128 memcpy(buf, edge.substring.data(), edge.substring.size());
129 buf += edge.substring.size();
130 *buf++ = '\0';
131 buf += encodeULEB128(edge.child->offset, buf);
132 }
133 }
134
makeNode()135 TrieNode *TrieBuilder::makeNode() {
136 auto *node = make<TrieNode>();
137 nodes.emplace_back(node);
138 return node;
139 }
140
charAt(const Symbol * sym,size_t pos)141 static int charAt(const Symbol *sym, size_t pos) {
142 StringRef str = sym->getName();
143 if (pos >= str.size())
144 return -1;
145 return str[pos];
146 }
147
148 // Build the trie by performing a three-way radix quicksort: We start by sorting
149 // the strings by their first characters, then sort the strings with the same
150 // first characters by their second characters, and so on recursively. Each
151 // time the prefixes diverge, we add a node to the trie.
152 //
153 // node: The most recently created node along this path in the trie (i.e.
154 // the furthest from the root.)
155 // lastPos: The prefix length of the most recently created node, i.e. the number
156 // of characters along its path from the root.
157 // pos: The string index we are currently sorting on. Note that each symbol
158 // S contained in vec has the same prefix S[0...pos).
sortAndBuild(MutableArrayRef<const Symbol * > vec,TrieNode * node,size_t lastPos,size_t pos)159 void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
160 TrieNode *node, size_t lastPos, size_t pos) {
161 tailcall:
162 if (vec.empty())
163 return;
164
165 // Partition items so that items in [0, i) are less than the pivot,
166 // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
167 // the pivot.
168 const Symbol *pivotSymbol = vec[vec.size() / 2];
169 int pivot = charAt(pivotSymbol, pos);
170 size_t i = 0;
171 size_t j = vec.size();
172 for (size_t k = 0; k < j;) {
173 int c = charAt(vec[k], pos);
174 if (c < pivot)
175 std::swap(vec[i++], vec[k++]);
176 else if (c > pivot)
177 std::swap(vec[--j], vec[k]);
178 else
179 k++;
180 }
181
182 bool isTerminal = pivot == -1;
183 bool prefixesDiverge = i != 0 || j != vec.size();
184 if (lastPos != pos && (isTerminal || prefixesDiverge)) {
185 TrieNode *newNode = makeNode();
186 node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
187 newNode);
188 node = newNode;
189 lastPos = pos;
190 }
191
192 sortAndBuild(vec.slice(0, i), node, lastPos, pos);
193 sortAndBuild(vec.slice(j), node, lastPos, pos);
194
195 if (isTerminal) {
196 assert(j - i == 1); // no duplicate symbols
197 node->info = {pivotSymbol->getVA()};
198 } else {
199 // This is the tail-call-optimized version of the following:
200 // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
201 vec = vec.slice(i, j - i);
202 ++pos;
203 goto tailcall;
204 }
205 }
206
build()207 size_t TrieBuilder::build() {
208 if (exported.empty())
209 return 0;
210
211 TrieNode *root = makeNode();
212 sortAndBuild(exported, root, 0, 0);
213
214 // Assign each node in the vector an offset in the trie stream, iterating
215 // until all uleb128 sizes have stabilized.
216 size_t offset;
217 bool more;
218 do {
219 offset = 0;
220 more = false;
221 for (TrieNode *node : nodes)
222 more |= node->updateOffset(offset);
223 } while (more);
224
225 return offset;
226 }
227
writeTo(uint8_t * buf) const228 void TrieBuilder::writeTo(uint8_t *buf) const {
229 for (TrieNode *node : nodes)
230 node->writeTo(buf);
231 }
232
233 namespace {
234
235 // Parse a serialized trie and invoke a callback for each entry.
236 class TrieParser {
237 public:
TrieParser(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)238 TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
239 : start(buf), end(start + size), callback(callback) {}
240
241 void parse(const uint8_t *buf, const Twine &cumulativeString);
242
parse()243 void parse() { parse(start, ""); }
244
245 const uint8_t *start;
246 const uint8_t *end;
247 const TrieEntryCallback &callback;
248 };
249
250 } // namespace
251
parse(const uint8_t * buf,const Twine & cumulativeString)252 void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
253 if (buf >= end)
254 fatal("Node offset points outside export section");
255
256 unsigned ulebSize;
257 uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
258 buf += ulebSize;
259 uint64_t flags = 0;
260 size_t offset;
261 if (terminalSize != 0) {
262 flags = decodeULEB128(buf, &ulebSize);
263 callback(cumulativeString, flags);
264 }
265 buf += terminalSize;
266 uint8_t numEdges = *buf++;
267 for (uint8_t i = 0; i < numEdges; ++i) {
268 const char *cbuf = reinterpret_cast<const char *>(buf);
269 StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
270 buf += substring.size() + 1;
271 offset = decodeULEB128(buf, &ulebSize);
272 buf += ulebSize;
273 parse(start + offset, cumulativeString + substring);
274 }
275 }
276
parseTrie(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)277 void macho::parseTrie(const uint8_t *buf, size_t size,
278 const TrieEntryCallback &callback) {
279 if (size == 0)
280 return;
281
282 TrieParser(buf, size, callback).parse();
283 }
284