1 //===- ExportTrie.cpp -----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is a partial implementation of the Mach-O export trie format. It's
10 // essentially a symbol table encoded as a compressed prefix trie, meaning that
11 // the common prefixes of each symbol name are shared for a more compact
12 // representation. The prefixes are stored on the edges of the trie, and one
13 // edge can represent multiple characters. For example, given two exported
14 // symbols _bar and _baz, we will have a trie like this (terminal nodes are
15 // marked with an asterisk):
16 //
17 //              +-+-+
18 //              |   | // root node
19 //              +-+-+
20 //                |
21 //                | _ba
22 //                |
23 //              +-+-+
24 //              |   |
25 //              +-+-+
26 //           r /     \ z
27 //            /       \
28 //        +-+-+       +-+-+
29 //        | * |       | * |
30 //        +-+-+       +-+-+
31 //
32 // More documentation of the format can be found in
33 // llvm/tools/obj2yaml/macho2yaml.cpp.
34 //
35 //===----------------------------------------------------------------------===//
36 
37 #include "ExportTrie.h"
38 #include "Symbols.h"
39 
40 #include "lld/Common/ErrorHandler.h"
41 #include "lld/Common/Memory.h"
42 #include "llvm/ADT/Optional.h"
43 #include "llvm/BinaryFormat/MachO.h"
44 #include "llvm/Support/LEB128.h"
45 
46 using namespace llvm;
47 using namespace llvm::MachO;
48 using namespace lld;
49 using namespace lld::macho;
50 
51 namespace {
52 
53 struct Edge {
54   Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
55 
56   StringRef substring;
57   struct TrieNode *child;
58 };
59 
60 struct ExportInfo {
61   uint64_t address;
62   // TODO: Add proper support for re-exports & stub-and-resolver flags.
63 };
64 
65 } // namespace
66 
67 struct macho::TrieNode {
68   std::vector<Edge> edges;
69   Optional<ExportInfo> info;
70   // Estimated offset from the start of the serialized trie to the current node.
71   // This will converge to the true offset when updateOffset() is run to a
72   // fixpoint.
73   size_t offset = 0;
74 
75   // Returns whether the new estimated offset differs from the old one.
76   bool updateOffset(size_t &nextOffset);
77   void writeTo(uint8_t *buf) const;
78 };
79 
80 bool TrieNode::updateOffset(size_t &nextOffset) {
81   // Size of the whole node (including the terminalSize and the outgoing edges.)
82   // In contrast, terminalSize only records the size of the other data in the
83   // node.
84   size_t nodeSize;
85   if (info) {
86     uint64_t flags = 0;
87     uint32_t terminalSize =
88         getULEB128Size(flags) + getULEB128Size(info->address);
89     // Overall node size so far is the uleb128 size of the length of the symbol
90     // info + the symbol info itself.
91     nodeSize = terminalSize + getULEB128Size(terminalSize);
92   } else {
93     nodeSize = 1; // Size of terminalSize (which has a value of 0)
94   }
95   // Compute size of all child edges.
96   ++nodeSize; // Byte for number of children.
97   for (Edge &edge : edges) {
98     nodeSize += edge.substring.size() + 1             // String length.
99                 + getULEB128Size(edge.child->offset); // Offset len.
100   }
101   // On input, 'nextOffset' is the new preferred location for this node.
102   bool result = (offset != nextOffset);
103   // Store new location in node object for use by parents.
104   offset = nextOffset;
105   nextOffset += nodeSize;
106   return result;
107 }
108 
109 void TrieNode::writeTo(uint8_t *buf) const {
110   buf += offset;
111   if (info) {
112     // TrieNodes with Symbol info: size, flags address
113     uint64_t flags = 0; // TODO: emit proper flags
114     uint32_t terminalSize =
115         getULEB128Size(flags) + getULEB128Size(info->address);
116     buf += encodeULEB128(terminalSize, buf);
117     buf += encodeULEB128(flags, buf);
118     buf += encodeULEB128(info->address, buf);
119   } else {
120     // TrieNode with no Symbol info.
121     *buf++ = 0; // terminalSize
122   }
123   // Add number of children. TODO: Handle case where we have more than 256.
124   assert(edges.size() < 256);
125   *buf++ = edges.size();
126   // Append each child edge substring and node offset.
127   for (const Edge &edge : edges) {
128     memcpy(buf, edge.substring.data(), edge.substring.size());
129     buf += edge.substring.size();
130     *buf++ = '\0';
131     buf += encodeULEB128(edge.child->offset, buf);
132   }
133 }
134 
135 TrieNode *TrieBuilder::makeNode() {
136   auto *node = make<TrieNode>();
137   nodes.emplace_back(node);
138   return node;
139 }
140 
141 static int charAt(const Symbol *sym, size_t pos) {
142   StringRef str = sym->getName();
143   if (pos >= str.size())
144     return -1;
145   return str[pos];
146 }
147 
148 // Build the trie by performing a three-way radix quicksort: We start by sorting
149 // the strings by their first characters, then sort the strings with the same
150 // first characters by their second characters, and so on recursively. Each
151 // time the prefixes diverge, we add a node to the trie.
152 //
153 // node:    The most recently created node along this path in the trie (i.e.
154 //          the furthest from the root.)
155 // lastPos: The prefix length of the most recently created node, i.e. the number
156 //          of characters along its path from the root.
157 // pos:     The string index we are currently sorting on. Note that each symbol
158 //          S contained in vec has the same prefix S[0...pos).
159 void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
160                                TrieNode *node, size_t lastPos, size_t pos) {
161 tailcall:
162   if (vec.empty())
163     return;
164 
165   // Partition items so that items in [0, i) are less than the pivot,
166   // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
167   // the pivot.
168   const Symbol *pivotSymbol = vec[vec.size() / 2];
169   int pivot = charAt(pivotSymbol, pos);
170   size_t i = 0;
171   size_t j = vec.size();
172   for (size_t k = 0; k < j;) {
173     int c = charAt(vec[k], pos);
174     if (c < pivot)
175       std::swap(vec[i++], vec[k++]);
176     else if (c > pivot)
177       std::swap(vec[--j], vec[k]);
178     else
179       k++;
180   }
181 
182   bool isTerminal = pivot == -1;
183   bool prefixesDiverge = i != 0 || j != vec.size();
184   if (lastPos != pos && (isTerminal || prefixesDiverge)) {
185     TrieNode *newNode = makeNode();
186     node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
187                              newNode);
188     node = newNode;
189     lastPos = pos;
190   }
191 
192   sortAndBuild(vec.slice(0, i), node, lastPos, pos);
193   sortAndBuild(vec.slice(j), node, lastPos, pos);
194 
195   if (isTerminal) {
196     assert(j - i == 1); // no duplicate symbols
197     node->info = {pivotSymbol->getVA()};
198   } else {
199     // This is the tail-call-optimized version of the following:
200     // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
201     vec = vec.slice(i, j - i);
202     ++pos;
203     goto tailcall;
204   }
205 }
206 
207 size_t TrieBuilder::build() {
208   if (exported.empty())
209     return 0;
210 
211   TrieNode *root = makeNode();
212   sortAndBuild(exported, root, 0, 0);
213 
214   // Assign each node in the vector an offset in the trie stream, iterating
215   // until all uleb128 sizes have stabilized.
216   size_t offset;
217   bool more;
218   do {
219     offset = 0;
220     more = false;
221     for (TrieNode *node : nodes)
222       more |= node->updateOffset(offset);
223   } while (more);
224 
225   return offset;
226 }
227 
228 void TrieBuilder::writeTo(uint8_t *buf) const {
229   for (TrieNode *node : nodes)
230     node->writeTo(buf);
231 }
232 
233 namespace {
234 
235 // Parse a serialized trie and invoke a callback for each entry.
236 class TrieParser {
237 public:
238   TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
239       : start(buf), end(start + size), callback(callback) {}
240 
241   void parse(const uint8_t *buf, const Twine &cumulativeString);
242 
243   void parse() { parse(start, ""); }
244 
245   const uint8_t *start;
246   const uint8_t *end;
247   const TrieEntryCallback &callback;
248 };
249 
250 } // namespace
251 
252 void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
253   if (buf >= end)
254     fatal("Node offset points outside export section");
255 
256   unsigned ulebSize;
257   uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
258   buf += ulebSize;
259   uint64_t flags = 0;
260   size_t offset;
261   if (terminalSize != 0) {
262     flags = decodeULEB128(buf, &ulebSize);
263     callback(cumulativeString, flags);
264   }
265   buf += terminalSize;
266   uint8_t numEdges = *buf++;
267   for (uint8_t i = 0; i < numEdges; ++i) {
268     const char *cbuf = reinterpret_cast<const char *>(buf);
269     StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
270     buf += substring.size() + 1;
271     offset = decodeULEB128(buf, &ulebSize);
272     buf += ulebSize;
273     parse(start + offset, cumulativeString + substring);
274   }
275 }
276 
277 void macho::parseTrie(const uint8_t *buf, size_t size,
278                       const TrieEntryCallback &callback) {
279   if (size == 0)
280     return;
281 
282   TrieParser(buf, size, callback).parse();
283 }
284