1 //===-- Serialization.cpp - Binary serialization of index data ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Serialization.h"
10 #include "Headers.h"
11 #include "RIFF.h"
12 #include "SymbolLocation.h"
13 #include "SymbolOrigin.h"
14 #include "dex/Dex.h"
15 #include "support/Logger.h"
16 #include "support/Trace.h"
17 #include "clang/Tooling/CompilationDatabase.h"
18 #include "llvm/ADT/StringRef.h"
19 #include "llvm/Support/Compiler.h"
20 #include "llvm/Support/Compression.h"
21 #include "llvm/Support/Endian.h"
22 #include "llvm/Support/Error.h"
23 #include "llvm/Support/raw_ostream.h"
24 #include <cstdint>
25 #include <vector>
26 
27 namespace clang {
28 namespace clangd {
29 namespace {
30 
31 // IO PRIMITIVES
32 // We use little-endian 32 bit ints, sometimes with variable-length encoding.
33 //
34 // Variable-length int encoding (varint) uses the bottom 7 bits of each byte
35 // to encode the number, and the top bit to indicate whether more bytes follow.
36 // e.g. 9a 2f means [0x1a and keep reading, 0x2f and stop].
37 // This represents 0x1a | 0x2f<<7 = 6042.
38 // A 32-bit integer takes 1-5 bytes to encode; small numbers are more compact.
39 
40 // Reads binary data from a StringRef, and keeps track of position.
41 class Reader {
42   const char *Begin, *End;
43   bool Err = false;
44 
45 public:
Reader(llvm::StringRef Data)46   Reader(llvm::StringRef Data) : Begin(Data.begin()), End(Data.end()) {}
47   // The "error" bit is set by reading past EOF or reading invalid data.
48   // When in an error state, reads may return zero values: callers should check.
err() const49   bool err() const { return Err; }
50   // Did we read all the data, or encounter an error?
eof() const51   bool eof() const { return Begin == End || Err; }
52   // All the data we didn't read yet.
rest() const53   llvm::StringRef rest() const { return llvm::StringRef(Begin, End - Begin); }
54 
consume8()55   uint8_t consume8() {
56     if (LLVM_UNLIKELY(Begin == End)) {
57       Err = true;
58       return 0;
59     }
60     return *Begin++;
61   }
62 
consume32()63   uint32_t consume32() {
64     if (LLVM_UNLIKELY(Begin + 4 > End)) {
65       Err = true;
66       return 0;
67     }
68     auto Ret = llvm::support::endian::read32le(Begin);
69     Begin += 4;
70     return Ret;
71   }
72 
consume(int N)73   llvm::StringRef consume(int N) {
74     if (LLVM_UNLIKELY(Begin + N > End)) {
75       Err = true;
76       return llvm::StringRef();
77     }
78     llvm::StringRef Ret(Begin, N);
79     Begin += N;
80     return Ret;
81   }
82 
consumeVar()83   uint32_t consumeVar() {
84     constexpr static uint8_t More = 1 << 7;
85 
86     // Use a 32 bit unsigned here to prevent promotion to signed int (unless int
87     // is wider than 32 bits).
88     uint32_t B = consume8();
89     if (LLVM_LIKELY(!(B & More)))
90       return B;
91     uint32_t Val = B & ~More;
92     for (int Shift = 7; B & More && Shift < 32; Shift += 7) {
93       B = consume8();
94       // 5th byte of a varint can only have lowest 4 bits set.
95       assert((Shift != 28 || B == (B & 0x0f)) && "Invalid varint encoding");
96       Val |= (B & ~More) << Shift;
97     }
98     return Val;
99   }
100 
consumeString(llvm::ArrayRef<llvm::StringRef> Strings)101   llvm::StringRef consumeString(llvm::ArrayRef<llvm::StringRef> Strings) {
102     auto StringIndex = consumeVar();
103     if (LLVM_UNLIKELY(StringIndex >= Strings.size())) {
104       Err = true;
105       return llvm::StringRef();
106     }
107     return Strings[StringIndex];
108   }
109 
consumeID()110   SymbolID consumeID() {
111     llvm::StringRef Raw = consume(SymbolID::RawSize); // short if truncated.
112     return LLVM_UNLIKELY(err()) ? SymbolID() : SymbolID::fromRaw(Raw);
113   }
114 
115   // Read a varint (as consumeVar) and resize the container accordingly.
116   // If the size is invalid, return false and mark an error.
117   // (The caller should abort in this case).
consumeSize(T & Container)118   template <typename T> LLVM_NODISCARD bool consumeSize(T &Container) {
119     auto Size = consumeVar();
120     // Conservatively assume each element is at least one byte.
121     if (Size > (size_t)(End - Begin)) {
122       Err = true;
123       return false;
124     }
125     Container.resize(Size);
126     return true;
127   }
128 };
129 
write32(uint32_t I,llvm::raw_ostream & OS)130 void write32(uint32_t I, llvm::raw_ostream &OS) {
131   char Buf[4];
132   llvm::support::endian::write32le(Buf, I);
133   OS.write(Buf, sizeof(Buf));
134 }
135 
writeVar(uint32_t I,llvm::raw_ostream & OS)136 void writeVar(uint32_t I, llvm::raw_ostream &OS) {
137   constexpr static uint8_t More = 1 << 7;
138   if (LLVM_LIKELY(I < 1 << 7)) {
139     OS.write(I);
140     return;
141   }
142   for (;;) {
143     OS.write(I | More);
144     I >>= 7;
145     if (I < 1 << 7) {
146       OS.write(I);
147       return;
148     }
149   }
150 }
151 
152 // STRING TABLE ENCODING
153 // Index data has many string fields, and many strings are identical.
154 // We store each string once, and refer to them by index.
155 //
156 // The string table's format is:
157 //   - UncompressedSize : uint32 (or 0 for no compression)
158 //   - CompressedData   : byte[CompressedSize]
159 //
160 // CompressedData is a zlib-compressed byte[UncompressedSize].
161 // It contains a sequence of null-terminated strings, e.g. "foo\0bar\0".
162 // These are sorted to improve compression.
163 
164 // Maps each string to a canonical representation.
165 // Strings remain owned externally (e.g. by SymbolSlab).
166 class StringTableOut {
167   llvm::DenseSet<llvm::StringRef> Unique;
168   std::vector<llvm::StringRef> Sorted;
169   // Since strings are interned, look up can be by pointer.
170   llvm::DenseMap<std::pair<const char *, size_t>, unsigned> Index;
171 
172 public:
StringTableOut()173   StringTableOut() {
174     // Ensure there's at least one string in the table.
175     // Table size zero is reserved to indicate no compression.
176     Unique.insert("");
177   }
178   // Add a string to the table. Overwrites S if an identical string exists.
intern(llvm::StringRef & S)179   void intern(llvm::StringRef &S) { S = *Unique.insert(S).first; };
180   // Finalize the table and write it to OS. No more strings may be added.
finalize(llvm::raw_ostream & OS)181   void finalize(llvm::raw_ostream &OS) {
182     Sorted = {Unique.begin(), Unique.end()};
183     llvm::sort(Sorted);
184     for (unsigned I = 0; I < Sorted.size(); ++I)
185       Index.try_emplace({Sorted[I].data(), Sorted[I].size()}, I);
186 
187     std::string RawTable;
188     for (llvm::StringRef S : Sorted) {
189       RawTable.append(std::string(S));
190       RawTable.push_back(0);
191     }
192     if (llvm::zlib::isAvailable()) {
193       llvm::SmallString<1> Compressed;
194       llvm::cantFail(llvm::zlib::compress(RawTable, Compressed));
195       write32(RawTable.size(), OS);
196       OS << Compressed;
197     } else {
198       write32(0, OS); // No compression.
199       OS << RawTable;
200     }
201   }
202   // Get the ID of an string, which must be interned. Table must be finalized.
index(llvm::StringRef S) const203   unsigned index(llvm::StringRef S) const {
204     assert(!Sorted.empty() && "table not finalized");
205     assert(Index.count({S.data(), S.size()}) && "string not interned");
206     return Index.find({S.data(), S.size()})->second;
207   }
208 };
209 
210 struct StringTableIn {
211   llvm::BumpPtrAllocator Arena;
212   std::vector<llvm::StringRef> Strings;
213 };
214 
readStringTable(llvm::StringRef Data)215 llvm::Expected<StringTableIn> readStringTable(llvm::StringRef Data) {
216   Reader R(Data);
217   size_t UncompressedSize = R.consume32();
218   if (R.err())
219     return error("Truncated string table");
220 
221   llvm::StringRef Uncompressed;
222   llvm::SmallString<1> UncompressedStorage;
223   if (UncompressedSize == 0) // No compression
224     Uncompressed = R.rest();
225   else if (llvm::zlib::isAvailable()) {
226     // Don't allocate a massive buffer if UncompressedSize was corrupted
227     // This is effective for sharded index, but not big monolithic ones, as
228     // once compressed size reaches 4MB nothing can be ruled out.
229     // Theoretical max ratio from https://zlib.net/zlib_tech.html
230     constexpr int MaxCompressionRatio = 1032;
231     if (UncompressedSize / MaxCompressionRatio > R.rest().size())
232       return error("Bad stri table: uncompress {0} -> {1} bytes is implausible",
233                    R.rest().size(), UncompressedSize);
234 
235     if (llvm::Error E = llvm::zlib::uncompress(R.rest(), UncompressedStorage,
236                                                UncompressedSize))
237       return std::move(E);
238     Uncompressed = UncompressedStorage;
239   } else
240     return error("Compressed string table, but zlib is unavailable");
241 
242   StringTableIn Table;
243   llvm::StringSaver Saver(Table.Arena);
244   R = Reader(Uncompressed);
245   for (Reader R(Uncompressed); !R.eof();) {
246     auto Len = R.rest().find(0);
247     if (Len == llvm::StringRef::npos)
248       return error("Bad string table: not null terminated");
249     Table.Strings.push_back(Saver.save(R.consume(Len)));
250     R.consume8();
251   }
252   if (R.err())
253     return error("Truncated string table");
254   return std::move(Table);
255 }
256 
257 // SYMBOL ENCODING
258 // Each field of clangd::Symbol is encoded in turn (see implementation).
259 //  - StringRef fields encode as varint (index into the string table)
260 //  - enums encode as the underlying type
261 //  - most numbers encode as varint
262 
writeLocation(const SymbolLocation & Loc,const StringTableOut & Strings,llvm::raw_ostream & OS)263 void writeLocation(const SymbolLocation &Loc, const StringTableOut &Strings,
264                    llvm::raw_ostream &OS) {
265   writeVar(Strings.index(Loc.FileURI), OS);
266   for (const auto &Endpoint : {Loc.Start, Loc.End}) {
267     writeVar(Endpoint.line(), OS);
268     writeVar(Endpoint.column(), OS);
269   }
270 }
271 
readLocation(Reader & Data,llvm::ArrayRef<llvm::StringRef> Strings)272 SymbolLocation readLocation(Reader &Data,
273                             llvm::ArrayRef<llvm::StringRef> Strings) {
274   SymbolLocation Loc;
275   Loc.FileURI = Data.consumeString(Strings).data();
276   for (auto *Endpoint : {&Loc.Start, &Loc.End}) {
277     Endpoint->setLine(Data.consumeVar());
278     Endpoint->setColumn(Data.consumeVar());
279   }
280   return Loc;
281 }
282 
readIncludeGraphNode(Reader & Data,llvm::ArrayRef<llvm::StringRef> Strings)283 IncludeGraphNode readIncludeGraphNode(Reader &Data,
284                                       llvm::ArrayRef<llvm::StringRef> Strings) {
285   IncludeGraphNode IGN;
286   IGN.Flags = static_cast<IncludeGraphNode::SourceFlag>(Data.consume8());
287   IGN.URI = Data.consumeString(Strings);
288   llvm::StringRef Digest = Data.consume(IGN.Digest.size());
289   std::copy(Digest.bytes_begin(), Digest.bytes_end(), IGN.Digest.begin());
290   if (!Data.consumeSize(IGN.DirectIncludes))
291     return IGN;
292   for (llvm::StringRef &Include : IGN.DirectIncludes)
293     Include = Data.consumeString(Strings);
294   return IGN;
295 }
296 
writeIncludeGraphNode(const IncludeGraphNode & IGN,const StringTableOut & Strings,llvm::raw_ostream & OS)297 void writeIncludeGraphNode(const IncludeGraphNode &IGN,
298                            const StringTableOut &Strings,
299                            llvm::raw_ostream &OS) {
300   OS.write(static_cast<uint8_t>(IGN.Flags));
301   writeVar(Strings.index(IGN.URI), OS);
302   llvm::StringRef Hash(reinterpret_cast<const char *>(IGN.Digest.data()),
303                        IGN.Digest.size());
304   OS << Hash;
305   writeVar(IGN.DirectIncludes.size(), OS);
306   for (llvm::StringRef Include : IGN.DirectIncludes)
307     writeVar(Strings.index(Include), OS);
308 }
309 
writeSymbol(const Symbol & Sym,const StringTableOut & Strings,llvm::raw_ostream & OS)310 void writeSymbol(const Symbol &Sym, const StringTableOut &Strings,
311                  llvm::raw_ostream &OS) {
312   OS << Sym.ID.raw(); // TODO: once we start writing xrefs and posting lists,
313                       // symbol IDs should probably be in a string table.
314   OS.write(static_cast<uint8_t>(Sym.SymInfo.Kind));
315   OS.write(static_cast<uint8_t>(Sym.SymInfo.Lang));
316   writeVar(Strings.index(Sym.Name), OS);
317   writeVar(Strings.index(Sym.Scope), OS);
318   writeVar(Strings.index(Sym.TemplateSpecializationArgs), OS);
319   writeLocation(Sym.Definition, Strings, OS);
320   writeLocation(Sym.CanonicalDeclaration, Strings, OS);
321   writeVar(Sym.References, OS);
322   OS.write(static_cast<uint8_t>(Sym.Flags));
323   OS.write(static_cast<uint8_t>(Sym.Origin));
324   writeVar(Strings.index(Sym.Signature), OS);
325   writeVar(Strings.index(Sym.CompletionSnippetSuffix), OS);
326   writeVar(Strings.index(Sym.Documentation), OS);
327   writeVar(Strings.index(Sym.ReturnType), OS);
328   writeVar(Strings.index(Sym.Type), OS);
329 
330   auto WriteInclude = [&](const Symbol::IncludeHeaderWithReferences &Include) {
331     writeVar(Strings.index(Include.IncludeHeader), OS);
332     writeVar(Include.References, OS);
333   };
334   writeVar(Sym.IncludeHeaders.size(), OS);
335   for (const auto &Include : Sym.IncludeHeaders)
336     WriteInclude(Include);
337 }
338 
readSymbol(Reader & Data,llvm::ArrayRef<llvm::StringRef> Strings)339 Symbol readSymbol(Reader &Data, llvm::ArrayRef<llvm::StringRef> Strings) {
340   Symbol Sym;
341   Sym.ID = Data.consumeID();
342   Sym.SymInfo.Kind = static_cast<index::SymbolKind>(Data.consume8());
343   Sym.SymInfo.Lang = static_cast<index::SymbolLanguage>(Data.consume8());
344   Sym.Name = Data.consumeString(Strings);
345   Sym.Scope = Data.consumeString(Strings);
346   Sym.TemplateSpecializationArgs = Data.consumeString(Strings);
347   Sym.Definition = readLocation(Data, Strings);
348   Sym.CanonicalDeclaration = readLocation(Data, Strings);
349   Sym.References = Data.consumeVar();
350   Sym.Flags = static_cast<Symbol::SymbolFlag>(Data.consume8());
351   Sym.Origin = static_cast<SymbolOrigin>(Data.consume8());
352   Sym.Signature = Data.consumeString(Strings);
353   Sym.CompletionSnippetSuffix = Data.consumeString(Strings);
354   Sym.Documentation = Data.consumeString(Strings);
355   Sym.ReturnType = Data.consumeString(Strings);
356   Sym.Type = Data.consumeString(Strings);
357   if (!Data.consumeSize(Sym.IncludeHeaders))
358     return Sym;
359   for (auto &I : Sym.IncludeHeaders) {
360     I.IncludeHeader = Data.consumeString(Strings);
361     I.References = Data.consumeVar();
362   }
363   return Sym;
364 }
365 
366 // REFS ENCODING
367 // A refs section has data grouped by Symbol. Each symbol has:
368 //  - SymbolID: 8 bytes
369 //  - NumRefs: varint
370 //  - Ref[NumRefs]
371 // Fields of Ref are encoded in turn, see implementation.
372 
writeRefs(const SymbolID & ID,llvm::ArrayRef<Ref> Refs,const StringTableOut & Strings,llvm::raw_ostream & OS)373 void writeRefs(const SymbolID &ID, llvm::ArrayRef<Ref> Refs,
374                const StringTableOut &Strings, llvm::raw_ostream &OS) {
375   OS << ID.raw();
376   writeVar(Refs.size(), OS);
377   for (const auto &Ref : Refs) {
378     OS.write(static_cast<unsigned char>(Ref.Kind));
379     writeLocation(Ref.Location, Strings, OS);
380     OS << Ref.Container.raw();
381   }
382 }
383 
384 std::pair<SymbolID, std::vector<Ref>>
readRefs(Reader & Data,llvm::ArrayRef<llvm::StringRef> Strings)385 readRefs(Reader &Data, llvm::ArrayRef<llvm::StringRef> Strings) {
386   std::pair<SymbolID, std::vector<Ref>> Result;
387   Result.first = Data.consumeID();
388   if (!Data.consumeSize(Result.second))
389     return Result;
390   for (auto &Ref : Result.second) {
391     Ref.Kind = static_cast<RefKind>(Data.consume8());
392     Ref.Location = readLocation(Data, Strings);
393     Ref.Container = Data.consumeID();
394   }
395   return Result;
396 }
397 
398 // RELATIONS ENCODING
399 // A relations section is a flat list of relations. Each relation has:
400 //  - SymbolID (subject): 8 bytes
401 //  - relation kind (predicate): 1 byte
402 //  - SymbolID (object): 8 bytes
403 // In the future, we might prefer a packed representation if the need arises.
404 
writeRelation(const Relation & R,llvm::raw_ostream & OS)405 void writeRelation(const Relation &R, llvm::raw_ostream &OS) {
406   OS << R.Subject.raw();
407   OS.write(static_cast<uint8_t>(R.Predicate));
408   OS << R.Object.raw();
409 }
410 
readRelation(Reader & Data)411 Relation readRelation(Reader &Data) {
412   SymbolID Subject = Data.consumeID();
413   RelationKind Predicate = static_cast<RelationKind>(Data.consume8());
414   SymbolID Object = Data.consumeID();
415   return {Subject, Predicate, Object};
416 }
417 
418 struct InternedCompileCommand {
419   llvm::StringRef Directory;
420   std::vector<llvm::StringRef> CommandLine;
421 };
422 
writeCompileCommand(const InternedCompileCommand & Cmd,const StringTableOut & Strings,llvm::raw_ostream & CmdOS)423 void writeCompileCommand(const InternedCompileCommand &Cmd,
424                          const StringTableOut &Strings,
425                          llvm::raw_ostream &CmdOS) {
426   writeVar(Strings.index(Cmd.Directory), CmdOS);
427   writeVar(Cmd.CommandLine.size(), CmdOS);
428   for (llvm::StringRef C : Cmd.CommandLine)
429     writeVar(Strings.index(C), CmdOS);
430 }
431 
432 InternedCompileCommand
readCompileCommand(Reader CmdReader,llvm::ArrayRef<llvm::StringRef> Strings)433 readCompileCommand(Reader CmdReader, llvm::ArrayRef<llvm::StringRef> Strings) {
434   InternedCompileCommand Cmd;
435   Cmd.Directory = CmdReader.consumeString(Strings);
436   if (!CmdReader.consumeSize(Cmd.CommandLine))
437     return Cmd;
438   for (llvm::StringRef &C : Cmd.CommandLine)
439     C = CmdReader.consumeString(Strings);
440   return Cmd;
441 }
442 
443 // FILE ENCODING
444 // A file is a RIFF chunk with type 'CdIx'.
445 // It contains the sections:
446 //   - meta: version number
447 //   - srcs: information related to include graph
448 //   - stri: string table
449 //   - symb: symbols
450 //   - refs: references to symbols
451 
452 // The current versioning scheme is simple - non-current versions are rejected.
453 // If you make a breaking change, bump this version number to invalidate stored
454 // data. Later we may want to support some backward compatibility.
455 constexpr static uint32_t Version = 16;
456 
readRIFF(llvm::StringRef Data)457 llvm::Expected<IndexFileIn> readRIFF(llvm::StringRef Data) {
458   auto RIFF = riff::readFile(Data);
459   if (!RIFF)
460     return RIFF.takeError();
461   if (RIFF->Type != riff::fourCC("CdIx"))
462     return error("wrong RIFF filetype: {0}", riff::fourCCStr(RIFF->Type));
463   llvm::StringMap<llvm::StringRef> Chunks;
464   for (const auto &Chunk : RIFF->Chunks)
465     Chunks.try_emplace(llvm::StringRef(Chunk.ID.data(), Chunk.ID.size()),
466                        Chunk.Data);
467 
468   if (!Chunks.count("meta"))
469     return error("missing meta chunk");
470   Reader Meta(Chunks.lookup("meta"));
471   auto SeenVersion = Meta.consume32();
472   if (SeenVersion != Version)
473     return error("wrong version: want {0}, got {1}", Version, SeenVersion);
474 
475   // meta chunk is checked above, as we prefer the "version mismatch" error.
476   for (llvm::StringRef RequiredChunk : {"stri"})
477     if (!Chunks.count(RequiredChunk))
478       return error("missing required chunk {0}", RequiredChunk);
479 
480   auto Strings = readStringTable(Chunks.lookup("stri"));
481   if (!Strings)
482     return Strings.takeError();
483 
484   IndexFileIn Result;
485   if (Chunks.count("srcs")) {
486     Reader SrcsReader(Chunks.lookup("srcs"));
487     Result.Sources.emplace();
488     while (!SrcsReader.eof()) {
489       auto IGN = readIncludeGraphNode(SrcsReader, Strings->Strings);
490       auto Entry = Result.Sources->try_emplace(IGN.URI).first;
491       Entry->getValue() = std::move(IGN);
492       // We change all the strings inside the structure to point at the keys in
493       // the map, since it is the only copy of the string that's going to live.
494       Entry->getValue().URI = Entry->getKey();
495       for (auto &Include : Entry->getValue().DirectIncludes)
496         Include = Result.Sources->try_emplace(Include).first->getKey();
497     }
498     if (SrcsReader.err())
499       return error("malformed or truncated include uri");
500   }
501 
502   if (Chunks.count("symb")) {
503     Reader SymbolReader(Chunks.lookup("symb"));
504     SymbolSlab::Builder Symbols;
505     while (!SymbolReader.eof())
506       Symbols.insert(readSymbol(SymbolReader, Strings->Strings));
507     if (SymbolReader.err())
508       return error("malformed or truncated symbol");
509     Result.Symbols = std::move(Symbols).build();
510   }
511   if (Chunks.count("refs")) {
512     Reader RefsReader(Chunks.lookup("refs"));
513     RefSlab::Builder Refs;
514     while (!RefsReader.eof()) {
515       auto RefsBundle = readRefs(RefsReader, Strings->Strings);
516       for (const auto &Ref : RefsBundle.second) // FIXME: bulk insert?
517         Refs.insert(RefsBundle.first, Ref);
518     }
519     if (RefsReader.err())
520       return error("malformed or truncated refs");
521     Result.Refs = std::move(Refs).build();
522   }
523   if (Chunks.count("rela")) {
524     Reader RelationsReader(Chunks.lookup("rela"));
525     RelationSlab::Builder Relations;
526     while (!RelationsReader.eof())
527       Relations.insert(readRelation(RelationsReader));
528     if (RelationsReader.err())
529       return error("malformed or truncated relations");
530     Result.Relations = std::move(Relations).build();
531   }
532   if (Chunks.count("cmdl")) {
533     Reader CmdReader(Chunks.lookup("cmdl"));
534     InternedCompileCommand Cmd =
535         readCompileCommand(CmdReader, Strings->Strings);
536     if (CmdReader.err())
537       return error("malformed or truncated commandline section");
538     Result.Cmd.emplace();
539     Result.Cmd->Directory = std::string(Cmd.Directory);
540     Result.Cmd->CommandLine.reserve(Cmd.CommandLine.size());
541     for (llvm::StringRef C : Cmd.CommandLine)
542       Result.Cmd->CommandLine.emplace_back(C);
543   }
544   return std::move(Result);
545 }
546 
547 template <class Callback>
visitStrings(IncludeGraphNode & IGN,const Callback & CB)548 void visitStrings(IncludeGraphNode &IGN, const Callback &CB) {
549   CB(IGN.URI);
550   for (llvm::StringRef &Include : IGN.DirectIncludes)
551     CB(Include);
552 }
553 
writeRIFF(const IndexFileOut & Data,llvm::raw_ostream & OS)554 void writeRIFF(const IndexFileOut &Data, llvm::raw_ostream &OS) {
555   assert(Data.Symbols && "An index file without symbols makes no sense!");
556   riff::File RIFF;
557   RIFF.Type = riff::fourCC("CdIx");
558 
559   llvm::SmallString<4> Meta;
560   {
561     llvm::raw_svector_ostream MetaOS(Meta);
562     write32(Version, MetaOS);
563   }
564   RIFF.Chunks.push_back({riff::fourCC("meta"), Meta});
565 
566   StringTableOut Strings;
567   std::vector<Symbol> Symbols;
568   for (const auto &Sym : *Data.Symbols) {
569     Symbols.emplace_back(Sym);
570     visitStrings(Symbols.back(),
571                  [&](llvm::StringRef &S) { Strings.intern(S); });
572   }
573   std::vector<IncludeGraphNode> Sources;
574   if (Data.Sources)
575     for (const auto &Source : *Data.Sources) {
576       Sources.push_back(Source.getValue());
577       visitStrings(Sources.back(),
578                    [&](llvm::StringRef &S) { Strings.intern(S); });
579     }
580 
581   std::vector<std::pair<SymbolID, std::vector<Ref>>> Refs;
582   if (Data.Refs) {
583     for (const auto &Sym : *Data.Refs) {
584       Refs.emplace_back(Sym);
585       for (auto &Ref : Refs.back().second) {
586         llvm::StringRef File = Ref.Location.FileURI;
587         Strings.intern(File);
588         Ref.Location.FileURI = File.data();
589       }
590     }
591   }
592 
593   std::vector<Relation> Relations;
594   if (Data.Relations) {
595     for (const auto &Relation : *Data.Relations) {
596       Relations.emplace_back(Relation);
597       // No strings to be interned in relations.
598     }
599   }
600 
601   InternedCompileCommand InternedCmd;
602   if (Data.Cmd) {
603     InternedCmd.CommandLine.reserve(Data.Cmd->CommandLine.size());
604     InternedCmd.Directory = Data.Cmd->Directory;
605     Strings.intern(InternedCmd.Directory);
606     for (llvm::StringRef C : Data.Cmd->CommandLine) {
607       InternedCmd.CommandLine.emplace_back(C);
608       Strings.intern(InternedCmd.CommandLine.back());
609     }
610   }
611 
612   std::string StringSection;
613   {
614     llvm::raw_string_ostream StringOS(StringSection);
615     Strings.finalize(StringOS);
616   }
617   RIFF.Chunks.push_back({riff::fourCC("stri"), StringSection});
618 
619   std::string SymbolSection;
620   {
621     llvm::raw_string_ostream SymbolOS(SymbolSection);
622     for (const auto &Sym : Symbols)
623       writeSymbol(Sym, Strings, SymbolOS);
624   }
625   RIFF.Chunks.push_back({riff::fourCC("symb"), SymbolSection});
626 
627   std::string RefsSection;
628   if (Data.Refs) {
629     {
630       llvm::raw_string_ostream RefsOS(RefsSection);
631       for (const auto &Sym : Refs)
632         writeRefs(Sym.first, Sym.second, Strings, RefsOS);
633     }
634     RIFF.Chunks.push_back({riff::fourCC("refs"), RefsSection});
635   }
636 
637   std::string RelationSection;
638   if (Data.Relations) {
639     {
640       llvm::raw_string_ostream RelationOS{RelationSection};
641       for (const auto &Relation : Relations)
642         writeRelation(Relation, RelationOS);
643     }
644     RIFF.Chunks.push_back({riff::fourCC("rela"), RelationSection});
645   }
646 
647   std::string SrcsSection;
648   {
649     {
650       llvm::raw_string_ostream SrcsOS(SrcsSection);
651       for (const auto &SF : Sources)
652         writeIncludeGraphNode(SF, Strings, SrcsOS);
653     }
654     RIFF.Chunks.push_back({riff::fourCC("srcs"), SrcsSection});
655   }
656 
657   std::string CmdlSection;
658   if (Data.Cmd) {
659     {
660       llvm::raw_string_ostream CmdOS(CmdlSection);
661       writeCompileCommand(InternedCmd, Strings, CmdOS);
662     }
663     RIFF.Chunks.push_back({riff::fourCC("cmdl"), CmdlSection});
664   }
665 
666   OS << RIFF;
667 }
668 
669 } // namespace
670 
671 // Defined in YAMLSerialization.cpp.
672 void writeYAML(const IndexFileOut &, llvm::raw_ostream &);
673 llvm::Expected<IndexFileIn> readYAML(llvm::StringRef);
674 
operator <<(llvm::raw_ostream & OS,const IndexFileOut & O)675 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const IndexFileOut &O) {
676   switch (O.Format) {
677   case IndexFileFormat::RIFF:
678     writeRIFF(O, OS);
679     break;
680   case IndexFileFormat::YAML:
681     writeYAML(O, OS);
682     break;
683   }
684   return OS;
685 }
686 
readIndexFile(llvm::StringRef Data)687 llvm::Expected<IndexFileIn> readIndexFile(llvm::StringRef Data) {
688   if (Data.startswith("RIFF")) {
689     return readRIFF(Data);
690   } else if (auto YAMLContents = readYAML(Data)) {
691     return std::move(*YAMLContents);
692   } else {
693     return error("Not a RIFF file and failed to parse as YAML: {0}",
694                  YAMLContents.takeError());
695   }
696 }
697 
loadIndex(llvm::StringRef SymbolFilename,bool UseDex)698 std::unique_ptr<SymbolIndex> loadIndex(llvm::StringRef SymbolFilename,
699                                        bool UseDex) {
700   trace::Span OverallTracer("LoadIndex");
701   auto Buffer = llvm::MemoryBuffer::getFile(SymbolFilename);
702   if (!Buffer) {
703     elog("Can't open {0}: {1}", SymbolFilename, Buffer.getError().message());
704     return nullptr;
705   }
706 
707   SymbolSlab Symbols;
708   RefSlab Refs;
709   RelationSlab Relations;
710   {
711     trace::Span Tracer("ParseIndex");
712     if (auto I = readIndexFile(Buffer->get()->getBuffer())) {
713       if (I->Symbols)
714         Symbols = std::move(*I->Symbols);
715       if (I->Refs)
716         Refs = std::move(*I->Refs);
717       if (I->Relations)
718         Relations = std::move(*I->Relations);
719     } else {
720       elog("Bad index file: {0}", I.takeError());
721       return nullptr;
722     }
723   }
724 
725   size_t NumSym = Symbols.size();
726   size_t NumRefs = Refs.numRefs();
727   size_t NumRelations = Relations.size();
728 
729   trace::Span Tracer("BuildIndex");
730   auto Index = UseDex ? dex::Dex::build(std::move(Symbols), std::move(Refs),
731                                         std::move(Relations))
732                       : MemIndex::build(std::move(Symbols), std::move(Refs),
733                                         std::move(Relations));
734   vlog("Loaded {0} from {1} with estimated memory usage {2} bytes\n"
735        "  - number of symbols: {3}\n"
736        "  - number of refs: {4}\n"
737        "  - number of relations: {5}",
738        UseDex ? "Dex" : "MemIndex", SymbolFilename,
739        Index->estimateMemoryUsage(), NumSym, NumRefs, NumRelations);
740   return Index;
741 }
742 
743 } // namespace clangd
744 } // namespace clang
745