1 //===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Dex.h"
10 #include "FileDistance.h"
11 #include "FuzzyMatch.h"
12 #include "Quality.h"
13 #include "index/Index.h"
14 #include "index/dex/Iterator.h"
15 #include "support/Logger.h"
16 #include "support/Trace.h"
17 #include "llvm/ADT/StringSet.h"
18 #include "llvm/Support/ScopedPrinter.h"
19 #include <algorithm>
20 #include <queue>
21 
22 namespace clang {
23 namespace clangd {
24 namespace dex {
25 
build(SymbolSlab Symbols,RefSlab Refs,RelationSlab Rels)26 std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
27                                         RelationSlab Rels) {
28   auto Size = Symbols.bytes() + Refs.bytes();
29   // There is no need to include "Rels" in Data because the relations are self-
30   // contained, without references into a backing store.
31   auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
32   return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
33                                 Size);
34 }
35 
36 namespace {
37 
38 // Mark symbols which are can be used for code completion.
39 const Token RestrictedForCodeCompletion =
40     Token(Token::Kind::Sentinel, "Restricted For Code Completion");
41 
42 // Helper to efficiently assemble the inverse index (token -> matching docs).
43 // The output is a nice uniform structure keyed on Token, but constructing
44 // the Token object every time we want to insert into the map is wasteful.
45 // Instead we have various maps keyed on things that are cheap to compute,
46 // and produce the Token keys once at the end.
47 class IndexBuilder {
48   llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
49   std::vector<DocID> RestrictedCCDocs;
50   llvm::StringMap<std::vector<DocID>> TypeDocs;
51   llvm::StringMap<std::vector<DocID>> ScopeDocs;
52   llvm::StringMap<std::vector<DocID>> ProximityDocs;
53   std::vector<Trigram> TrigramScratch;
54 
55 public:
56   // Add the tokens which are given symbol's characteristics.
57   // This includes fuzzy matching trigrams, symbol's scope, etc.
58   // FIXME(kbobyrev): Support more token types:
59   // * Namespace proximity
add(const Symbol & Sym,DocID D)60   void add(const Symbol &Sym, DocID D) {
61     generateIdentifierTrigrams(Sym.Name, TrigramScratch);
62     for (Trigram T : TrigramScratch)
63       TrigramDocs[T].push_back(D);
64     ScopeDocs[Sym.Scope].push_back(D);
65     if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
66       for (const auto &ProximityURI :
67            generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
68         ProximityDocs[ProximityURI].push_back(D);
69     if (Sym.Flags & Symbol::IndexedForCodeCompletion)
70       RestrictedCCDocs.push_back(D);
71     if (!Sym.Type.empty())
72       TypeDocs[Sym.Type].push_back(D);
73   }
74 
75   // Assemble the final compressed posting lists for the added symbols.
build()76   llvm::DenseMap<Token, PostingList> build() {
77     llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
78                                               TrigramDocs.size() +
79                                               RestrictedCCDocs.size() +
80                                               TypeDocs.size() +
81                                               ScopeDocs.size() +
82                                               ProximityDocs.size());
83     for (const auto &E : TrigramDocs)
84       Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
85     for (const auto &E : TypeDocs)
86       Result.try_emplace(Token(Token::Kind::Type, E.first()), E.second);
87     for (const auto &E : ScopeDocs)
88       Result.try_emplace(Token(Token::Kind::Scope, E.first()), E.second);
89     for (const auto &E : ProximityDocs)
90       Result.try_emplace(Token(Token::Kind::ProximityURI, E.first()), E.second);
91     if (!RestrictedCCDocs.empty())
92       Result.try_emplace(RestrictedForCodeCompletion, RestrictedCCDocs);
93     return Result;
94   }
95 };
96 
97 } // namespace
98 
buildIndex()99 void Dex::buildIndex() {
100   this->Corpus = dex::Corpus(Symbols.size());
101   std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
102 
103   for (size_t I = 0; I < Symbols.size(); ++I) {
104     const Symbol *Sym = Symbols[I];
105     LookupTable[Sym->ID] = Sym;
106     ScoredSymbols[I] = {quality(*Sym), Sym};
107   }
108 
109   // Symbols are sorted by symbol qualities so that items in the posting lists
110   // are stored in the descending order of symbol quality.
111   llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
112 
113   // SymbolQuality was empty up until now.
114   SymbolQuality.resize(Symbols.size());
115   // Populate internal storage using Symbol + Score pairs.
116   for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
117     SymbolQuality[I] = ScoredSymbols[I].first;
118     Symbols[I] = ScoredSymbols[I].second;
119   }
120 
121   // Build posting lists for symbols.
122   IndexBuilder Builder;
123   for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
124     Builder.add(*Symbols[SymbolRank], SymbolRank);
125   InvertedIndex = Builder.build();
126 }
127 
iterator(const Token & Tok) const128 std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
129   auto It = InvertedIndex.find(Tok);
130   return It == InvertedIndex.end() ? Corpus.none()
131                                    : It->second.iterator(&It->first);
132 }
133 
134 // Constructs BOOST iterators for Path Proximities.
createFileProximityIterator(llvm::ArrayRef<std::string> ProximityPaths) const135 std::unique_ptr<Iterator> Dex::createFileProximityIterator(
136     llvm::ArrayRef<std::string> ProximityPaths) const {
137   std::vector<std::unique_ptr<Iterator>> BoostingIterators;
138   // Deduplicate parent URIs extracted from the ProximityPaths.
139   llvm::StringSet<> ParentURIs;
140   llvm::StringMap<SourceParams> Sources;
141   for (const auto &Path : ProximityPaths) {
142     Sources[Path] = SourceParams();
143     auto PathURI = URI::create(Path);
144     const auto PathProximityURIs = generateProximityURIs(PathURI.toString());
145     for (const auto &ProximityURI : PathProximityURIs)
146       ParentURIs.insert(ProximityURI);
147   }
148   // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
149   // for all parameters except for Proximity Path distance signal.
150   SymbolRelevanceSignals PathProximitySignals;
151   // DistanceCalculator will find the shortest distance from ProximityPaths to
152   // any URI extracted from the ProximityPaths.
153   URIDistance DistanceCalculator(Sources);
154   PathProximitySignals.FileProximityMatch = &DistanceCalculator;
155   // Try to build BOOST iterator for each Proximity Path provided by
156   // ProximityPaths. Boosting factor should depend on the distance to the
157   // Proximity Path: the closer processed path is, the higher boosting factor.
158   for (const auto &ParentURI : ParentURIs.keys()) {
159     // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
160     auto It = iterator(Token(Token::Kind::ProximityURI, ParentURI));
161     if (It->kind() != Iterator::Kind::False) {
162       PathProximitySignals.SymbolURI = ParentURI;
163       BoostingIterators.push_back(Corpus.boost(
164           std::move(It), PathProximitySignals.evaluateHeuristics()));
165     }
166   }
167   BoostingIterators.push_back(Corpus.all());
168   return Corpus.unionOf(std::move(BoostingIterators));
169 }
170 
171 // Constructs BOOST iterators for preferred types.
172 std::unique_ptr<Iterator>
createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const173 Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
174   std::vector<std::unique_ptr<Iterator>> BoostingIterators;
175   SymbolRelevanceSignals PreferredTypeSignals;
176   PreferredTypeSignals.TypeMatchesPreferred = true;
177   auto Boost = PreferredTypeSignals.evaluateHeuristics();
178   for (const auto &T : Types)
179     BoostingIterators.push_back(
180         Corpus.boost(iterator(Token(Token::Kind::Type, T)), Boost));
181   BoostingIterators.push_back(Corpus.all());
182   return Corpus.unionOf(std::move(BoostingIterators));
183 }
184 
185 /// Constructs iterators over tokens extracted from the query and exhausts it
186 /// while applying Callback to each symbol in the order of decreasing quality
187 /// of the matched symbols.
fuzzyFind(const FuzzyFindRequest & Req,llvm::function_ref<void (const Symbol &)> Callback) const188 bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
189                     llvm::function_ref<void(const Symbol &)> Callback) const {
190   assert(!StringRef(Req.Query).contains("::") &&
191          "There must be no :: in query.");
192   trace::Span Tracer("Dex fuzzyFind");
193   FuzzyMatcher Filter(Req.Query);
194   // For short queries we use specialized trigrams that don't yield all results.
195   // Prevent clients from postfiltering them for longer queries.
196   bool More = !Req.Query.empty() && Req.Query.size() < 3;
197 
198   std::vector<std::unique_ptr<Iterator>> Criteria;
199   const auto TrigramTokens = generateQueryTrigrams(Req.Query);
200 
201   // Generate query trigrams and construct AND iterator over all query
202   // trigrams.
203   std::vector<std::unique_ptr<Iterator>> TrigramIterators;
204   for (const auto &Trigram : TrigramTokens)
205     TrigramIterators.push_back(iterator(Trigram));
206   Criteria.push_back(Corpus.intersect(move(TrigramIterators)));
207 
208   // Generate scope tokens for search query.
209   std::vector<std::unique_ptr<Iterator>> ScopeIterators;
210   for (const auto &Scope : Req.Scopes)
211     ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
212   if (Req.AnyScope)
213     ScopeIterators.push_back(
214         Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
215   Criteria.push_back(Corpus.unionOf(move(ScopeIterators)));
216 
217   // Add proximity paths boosting (all symbols, some boosted).
218   Criteria.push_back(createFileProximityIterator(Req.ProximityPaths));
219   // Add boosting for preferred types.
220   Criteria.push_back(createTypeBoostingIterator(Req.PreferredTypes));
221 
222   if (Req.RestrictForCodeCompletion)
223     Criteria.push_back(iterator(RestrictedForCodeCompletion));
224 
225   // Use TRUE iterator if both trigrams and scopes from the query are not
226   // present in the symbol index.
227   auto Root = Corpus.intersect(move(Criteria));
228   // Retrieve more items than it was requested: some of  the items with high
229   // final score might not be retrieved otherwise.
230   // FIXME(kbobyrev): Tune this ratio.
231   if (Req.Limit)
232     Root = Corpus.limit(move(Root), *Req.Limit * 100);
233   SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
234   vlog("Dex query tree: {0}", *Root);
235 
236   using IDAndScore = std::pair<DocID, float>;
237   std::vector<IDAndScore> IDAndScores = consume(*Root);
238 
239   auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
240     return LHS.second > RHS.second;
241   };
242   TopN<IDAndScore, decltype(Compare)> Top(
243       Req.Limit ? *Req.Limit : std::numeric_limits<size_t>::max(), Compare);
244   for (const auto &IDAndScore : IDAndScores) {
245     const DocID SymbolDocID = IDAndScore.first;
246     const auto *Sym = Symbols[SymbolDocID];
247     const llvm::Optional<float> Score = Filter.match(Sym->Name);
248     if (!Score)
249       continue;
250     // Combine Fuzzy Matching score, precomputed symbol quality and boosting
251     // score for a cumulative final symbol score.
252     const float FinalScore =
253         (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
254     // If Top.push(...) returns true, it means that it had to pop an item. In
255     // this case, it is possible to retrieve more symbols.
256     if (Top.push({SymbolDocID, FinalScore}))
257       More = true;
258   }
259 
260   // Apply callback to the top Req.Limit items in the descending
261   // order of cumulative score.
262   for (const auto &Item : std::move(Top).items())
263     Callback(*Symbols[Item.first]);
264   return More;
265 }
266 
lookup(const LookupRequest & Req,llvm::function_ref<void (const Symbol &)> Callback) const267 void Dex::lookup(const LookupRequest &Req,
268                  llvm::function_ref<void(const Symbol &)> Callback) const {
269   trace::Span Tracer("Dex lookup");
270   for (const auto &ID : Req.IDs) {
271     auto I = LookupTable.find(ID);
272     if (I != LookupTable.end())
273       Callback(*I->second);
274   }
275 }
276 
refs(const RefsRequest & Req,llvm::function_ref<void (const Ref &)> Callback) const277 bool Dex::refs(const RefsRequest &Req,
278                llvm::function_ref<void(const Ref &)> Callback) const {
279   trace::Span Tracer("Dex refs");
280   uint32_t Remaining =
281       Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
282   for (const auto &ID : Req.IDs)
283     for (const auto &Ref : Refs.lookup(ID)) {
284       if (!static_cast<int>(Req.Filter & Ref.Kind))
285         continue;
286       if (Remaining == 0)
287         return true; // More refs were available.
288       --Remaining;
289       Callback(Ref);
290     }
291   return false; // We reported all refs.
292 }
293 
relations(const RelationsRequest & Req,llvm::function_ref<void (const SymbolID &,const Symbol &)> Callback) const294 void Dex::relations(
295     const RelationsRequest &Req,
296     llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
297   trace::Span Tracer("Dex relations");
298   uint32_t Remaining =
299       Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
300   for (const SymbolID &Subject : Req.Subjects) {
301     LookupRequest LookupReq;
302     auto It = Relations.find(
303         std::make_pair(Subject, static_cast<uint8_t>(Req.Predicate)));
304     if (It != Relations.end()) {
305       for (const auto &Object : It->second) {
306         if (Remaining > 0) {
307           --Remaining;
308           LookupReq.IDs.insert(Object);
309         }
310       }
311     }
312     lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); });
313   }
314 }
315 
estimateMemoryUsage() const316 size_t Dex::estimateMemoryUsage() const {
317   size_t Bytes = Symbols.size() * sizeof(const Symbol *);
318   Bytes += SymbolQuality.size() * sizeof(float);
319   Bytes += LookupTable.getMemorySize();
320   Bytes += InvertedIndex.getMemorySize();
321   for (const auto &TokenToPostingList : InvertedIndex)
322     Bytes += TokenToPostingList.second.bytes();
323   Bytes += Refs.getMemorySize();
324   Bytes += Relations.getMemorySize();
325   return Bytes + BackingDataSize;
326 }
327 
generateProximityURIs(llvm::StringRef URIPath)328 std::vector<std::string> generateProximityURIs(llvm::StringRef URIPath) {
329   std::vector<std::string> Result;
330   auto ParsedURI = URI::parse(URIPath);
331   assert(ParsedURI &&
332          "Non-empty argument of generateProximityURIs() should be a valid "
333          "URI.");
334   llvm::StringRef Body = ParsedURI->body();
335   // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
336   // size of resulting vector. Some projects might want to have higher limit if
337   // the file hierarchy is deeper. For the generic case, it would be useful to
338   // calculate Limit in the index build stage by calculating the maximum depth
339   // of the project source tree at runtime.
340   size_t Limit = 5;
341   // Insert original URI before the loop: this would save a redundant iteration
342   // with a URI parse.
343   Result.emplace_back(ParsedURI->toString());
344   while (!Body.empty() && --Limit > 0) {
345     // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
346     // could be optimized.
347     Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix);
348     if (!Body.empty())
349       Result.emplace_back(
350           URI(ParsedURI->scheme(), ParsedURI->authority(), Body).toString());
351   }
352   return Result;
353 }
354 
355 } // namespace dex
356 } // namespace clangd
357 } // namespace clang
358