1 //===--- Quality.cpp ---------------------------------------------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Quality.h"
10 #include "AST.h"
11 #include "FileDistance.h"
12 #include "SourceCode.h"
13 #include "URI.h"
14 #include "index/Symbol.h"
15 #include "clang/AST/ASTContext.h"
16 #include "clang/AST/Decl.h"
17 #include "clang/AST/DeclCXX.h"
18 #include "clang/AST/DeclTemplate.h"
19 #include "clang/AST/DeclVisitor.h"
20 #include "clang/Basic/CharInfo.h"
21 #include "clang/Basic/SourceManager.h"
22 #include "clang/Sema/CodeCompleteConsumer.h"
23 #include "llvm/ADT/ArrayRef.h"
24 #include "llvm/ADT/SmallString.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/StringExtras.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/Support/Casting.h"
29 #include "llvm/Support/FormatVariadic.h"
30 #include "llvm/Support/MathExtras.h"
31 #include "llvm/Support/raw_ostream.h"
32 #include <algorithm>
33 #include <cmath>
34 
35 namespace clang {
36 namespace clangd {
isReserved(llvm::StringRef Name)37 static bool isReserved(llvm::StringRef Name) {
38   // FIXME: Should we exclude _Bool and others recognized by the standard?
39   return Name.size() >= 2 && Name[0] == '_' &&
40          (isUppercase(Name[1]) || Name[1] == '_');
41 }
42 
hasDeclInMainFile(const Decl & D)43 static bool hasDeclInMainFile(const Decl &D) {
44   auto &SourceMgr = D.getASTContext().getSourceManager();
45   for (auto *Redecl : D.redecls()) {
46     if (isInsideMainFile(Redecl->getLocation(), SourceMgr))
47       return true;
48   }
49   return false;
50 }
51 
hasUsingDeclInMainFile(const CodeCompletionResult & R)52 static bool hasUsingDeclInMainFile(const CodeCompletionResult &R) {
53   const auto &Context = R.Declaration->getASTContext();
54   const auto &SourceMgr = Context.getSourceManager();
55   if (R.ShadowDecl) {
56     if (isInsideMainFile(R.ShadowDecl->getLocation(), SourceMgr))
57       return true;
58   }
59   return false;
60 }
61 
categorize(const NamedDecl & ND)62 static SymbolQualitySignals::SymbolCategory categorize(const NamedDecl &ND) {
63   if (const auto *FD = dyn_cast<FunctionDecl>(&ND)) {
64     if (FD->isOverloadedOperator())
65       return SymbolQualitySignals::Operator;
66   }
67   class Switch
68       : public ConstDeclVisitor<Switch, SymbolQualitySignals::SymbolCategory> {
69   public:
70 #define MAP(DeclType, Category)                                                \
71   SymbolQualitySignals::SymbolCategory Visit##DeclType(const DeclType *) {     \
72     return SymbolQualitySignals::Category;                                     \
73   }
74     MAP(NamespaceDecl, Namespace);
75     MAP(NamespaceAliasDecl, Namespace);
76     MAP(TypeDecl, Type);
77     MAP(TypeAliasTemplateDecl, Type);
78     MAP(ClassTemplateDecl, Type);
79     MAP(CXXConstructorDecl, Constructor);
80     MAP(CXXDestructorDecl, Destructor);
81     MAP(ValueDecl, Variable);
82     MAP(VarTemplateDecl, Variable);
83     MAP(FunctionDecl, Function);
84     MAP(FunctionTemplateDecl, Function);
85     MAP(Decl, Unknown);
86 #undef MAP
87   };
88   return Switch().Visit(&ND);
89 }
90 
91 static SymbolQualitySignals::SymbolCategory
categorize(const CodeCompletionResult & R)92 categorize(const CodeCompletionResult &R) {
93   if (R.Declaration)
94     return categorize(*R.Declaration);
95   if (R.Kind == CodeCompletionResult::RK_Macro)
96     return SymbolQualitySignals::Macro;
97   // Everything else is a keyword or a pattern. Patterns are mostly keywords
98   // too, except a few which we recognize by cursor kind.
99   switch (R.CursorKind) {
100   case CXCursor_CXXMethod:
101     return SymbolQualitySignals::Function;
102   case CXCursor_ModuleImportDecl:
103     return SymbolQualitySignals::Namespace;
104   case CXCursor_MacroDefinition:
105     return SymbolQualitySignals::Macro;
106   case CXCursor_TypeRef:
107     return SymbolQualitySignals::Type;
108   case CXCursor_MemberRef:
109     return SymbolQualitySignals::Variable;
110   case CXCursor_Constructor:
111     return SymbolQualitySignals::Constructor;
112   default:
113     return SymbolQualitySignals::Keyword;
114   }
115 }
116 
117 static SymbolQualitySignals::SymbolCategory
categorize(const index::SymbolInfo & D)118 categorize(const index::SymbolInfo &D) {
119   switch (D.Kind) {
120   case index::SymbolKind::Namespace:
121   case index::SymbolKind::NamespaceAlias:
122     return SymbolQualitySignals::Namespace;
123   case index::SymbolKind::Macro:
124     return SymbolQualitySignals::Macro;
125   case index::SymbolKind::Enum:
126   case index::SymbolKind::Struct:
127   case index::SymbolKind::Class:
128   case index::SymbolKind::Protocol:
129   case index::SymbolKind::Extension:
130   case index::SymbolKind::Union:
131   case index::SymbolKind::TypeAlias:
132     return SymbolQualitySignals::Type;
133   case index::SymbolKind::Function:
134   case index::SymbolKind::ClassMethod:
135   case index::SymbolKind::InstanceMethod:
136   case index::SymbolKind::StaticMethod:
137   case index::SymbolKind::InstanceProperty:
138   case index::SymbolKind::ClassProperty:
139   case index::SymbolKind::StaticProperty:
140   case index::SymbolKind::ConversionFunction:
141     return SymbolQualitySignals::Function;
142   case index::SymbolKind::Destructor:
143     return SymbolQualitySignals::Destructor;
144   case index::SymbolKind::Constructor:
145     return SymbolQualitySignals::Constructor;
146   case index::SymbolKind::Variable:
147   case index::SymbolKind::Field:
148   case index::SymbolKind::EnumConstant:
149   case index::SymbolKind::Parameter:
150     return SymbolQualitySignals::Variable;
151   case index::SymbolKind::Using:
152   case index::SymbolKind::Module:
153   case index::SymbolKind::Unknown:
154     return SymbolQualitySignals::Unknown;
155   }
156   llvm_unreachable("Unknown index::SymbolKind");
157 }
158 
isInstanceMember(const NamedDecl * ND)159 static bool isInstanceMember(const NamedDecl *ND) {
160   if (!ND)
161     return false;
162   if (const auto *TP = dyn_cast<FunctionTemplateDecl>(ND))
163     ND = TP->TemplateDecl::getTemplatedDecl();
164   if (const auto *CM = dyn_cast<CXXMethodDecl>(ND))
165     return !CM->isStatic();
166   return isa<FieldDecl>(ND); // Note that static fields are VarDecl.
167 }
168 
isInstanceMember(const index::SymbolInfo & D)169 static bool isInstanceMember(const index::SymbolInfo &D) {
170   switch (D.Kind) {
171   case index::SymbolKind::InstanceMethod:
172   case index::SymbolKind::InstanceProperty:
173   case index::SymbolKind::Field:
174     return true;
175   default:
176     return false;
177   }
178 }
179 
merge(const CodeCompletionResult & SemaCCResult)180 void SymbolQualitySignals::merge(const CodeCompletionResult &SemaCCResult) {
181   Deprecated |= (SemaCCResult.Availability == CXAvailability_Deprecated);
182   Category = categorize(SemaCCResult);
183 
184   if (SemaCCResult.Declaration) {
185     ImplementationDetail |= isImplementationDetail(SemaCCResult.Declaration);
186     if (auto *ID = SemaCCResult.Declaration->getIdentifier())
187       ReservedName = ReservedName || isReserved(ID->getName());
188   } else if (SemaCCResult.Kind == CodeCompletionResult::RK_Macro)
189     ReservedName = ReservedName || isReserved(SemaCCResult.Macro->getName());
190 }
191 
merge(const Symbol & IndexResult)192 void SymbolQualitySignals::merge(const Symbol &IndexResult) {
193   Deprecated |= (IndexResult.Flags & Symbol::Deprecated);
194   ImplementationDetail |= (IndexResult.Flags & Symbol::ImplementationDetail);
195   References = std::max(IndexResult.References, References);
196   Category = categorize(IndexResult.SymInfo);
197   ReservedName = ReservedName || isReserved(IndexResult.Name);
198 }
199 
evaluate() const200 float SymbolQualitySignals::evaluate() const {
201   float Score = 1;
202 
203   // This avoids a sharp gradient for tail symbols, and also neatly avoids the
204   // question of whether 0 references means a bad symbol or missing data.
205   if (References >= 10) {
206     // Use a sigmoid style boosting function, which flats out nicely for large
207     // numbers (e.g. 2.58 for 1M refererences).
208     // The following boosting function is equivalent to:
209     //   m = 0.06
210     //   f = 12.0
211     //   boost = f * sigmoid(m * std::log(References)) - 0.5 * f + 0.59
212     // Sample data points: (10, 1.00), (100, 1.41), (1000, 1.82),
213     //                     (10K, 2.21), (100K, 2.58), (1M, 2.94)
214     float S = std::pow(References, -0.06);
215     Score *= 6.0 * (1 - S) / (1 + S) + 0.59;
216   }
217 
218   if (Deprecated)
219     Score *= 0.1f;
220   if (ReservedName)
221     Score *= 0.1f;
222   if (ImplementationDetail)
223     Score *= 0.2f;
224 
225   switch (Category) {
226   case Keyword: // Often relevant, but misses most signals.
227     Score *= 4; // FIXME: important keywords should have specific boosts.
228     break;
229   case Type:
230   case Function:
231   case Variable:
232     Score *= 1.1f;
233     break;
234   case Namespace:
235     Score *= 0.8f;
236     break;
237   case Macro:
238   case Destructor:
239   case Operator:
240     Score *= 0.5f;
241     break;
242   case Constructor: // No boost constructors so they are after class types.
243   case Unknown:
244     break;
245   }
246 
247   return Score;
248 }
249 
operator <<(llvm::raw_ostream & OS,const SymbolQualitySignals & S)250 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
251                               const SymbolQualitySignals &S) {
252   OS << llvm::formatv("=== Symbol quality: {0}\n", S.evaluate());
253   OS << llvm::formatv("\tReferences: {0}\n", S.References);
254   OS << llvm::formatv("\tDeprecated: {0}\n", S.Deprecated);
255   OS << llvm::formatv("\tReserved name: {0}\n", S.ReservedName);
256   OS << llvm::formatv("\tCategory: {0}\n", static_cast<int>(S.Category));
257   return OS;
258 }
259 
260 static SymbolRelevanceSignals::AccessibleScope
computeScope(const NamedDecl * D)261 computeScope(const NamedDecl *D) {
262   // Injected "Foo" within the class "Foo" has file scope, not class scope.
263   const DeclContext *DC = D->getDeclContext();
264   if (auto *R = dyn_cast_or_null<RecordDecl>(D))
265     if (R->isInjectedClassName())
266       DC = DC->getParent();
267   // Class constructor should have the same scope as the class.
268   if (isa<CXXConstructorDecl>(D))
269     DC = DC->getParent();
270   bool InClass = false;
271   for (; !DC->isFileContext(); DC = DC->getParent()) {
272     if (DC->isFunctionOrMethod())
273       return SymbolRelevanceSignals::FunctionScope;
274     InClass = InClass || DC->isRecord();
275   }
276   if (InClass)
277     return SymbolRelevanceSignals::ClassScope;
278   // ExternalLinkage threshold could be tweaked, e.g. module-visible as global.
279   // Avoid caching linkage if it may change after enclosing code completion.
280   if (hasUnstableLinkage(D) || D->getLinkageInternal() < ExternalLinkage)
281     return SymbolRelevanceSignals::FileScope;
282   return SymbolRelevanceSignals::GlobalScope;
283 }
284 
merge(const Symbol & IndexResult)285 void SymbolRelevanceSignals::merge(const Symbol &IndexResult) {
286   SymbolURI = IndexResult.CanonicalDeclaration.FileURI;
287   SymbolScope = IndexResult.Scope;
288   IsInstanceMember |= isInstanceMember(IndexResult.SymInfo);
289   if (!(IndexResult.Flags & Symbol::VisibleOutsideFile)) {
290     Scope = AccessibleScope::FileScope;
291   }
292 }
293 
merge(const CodeCompletionResult & SemaCCResult)294 void SymbolRelevanceSignals::merge(const CodeCompletionResult &SemaCCResult) {
295   if (SemaCCResult.Availability == CXAvailability_NotAvailable ||
296       SemaCCResult.Availability == CXAvailability_NotAccessible)
297     Forbidden = true;
298 
299   if (SemaCCResult.Declaration) {
300     SemaSaysInScope = true;
301     // We boost things that have decls in the main file. We give a fixed score
302     // for all other declarations in sema as they are already included in the
303     // translation unit.
304     float DeclProximity = (hasDeclInMainFile(*SemaCCResult.Declaration) ||
305                            hasUsingDeclInMainFile(SemaCCResult))
306                               ? 1.0
307                               : 0.6;
308     SemaFileProximityScore = std::max(DeclProximity, SemaFileProximityScore);
309     IsInstanceMember |= isInstanceMember(SemaCCResult.Declaration);
310     InBaseClass |= SemaCCResult.InBaseClass;
311   }
312 
313   // Declarations are scoped, others (like macros) are assumed global.
314   if (SemaCCResult.Declaration)
315     Scope = std::min(Scope, computeScope(SemaCCResult.Declaration));
316 
317   NeedsFixIts = !SemaCCResult.FixIts.empty();
318 }
319 
uriProximity(llvm::StringRef SymbolURI,URIDistance * D)320 static std::pair<float, unsigned> uriProximity(llvm::StringRef SymbolURI,
321                                                URIDistance *D) {
322   if (!D || SymbolURI.empty())
323     return {0.f, 0u};
324   unsigned Distance = D->distance(SymbolURI);
325   // Assume approximately default options are used for sensible scoring.
326   return {std::exp(Distance * -0.4f / FileDistanceOptions().UpCost), Distance};
327 }
328 
scopeBoost(ScopeDistance & Distance,llvm::Optional<llvm::StringRef> SymbolScope)329 static float scopeBoost(ScopeDistance &Distance,
330                         llvm::Optional<llvm::StringRef> SymbolScope) {
331   if (!SymbolScope)
332     return 1;
333   auto D = Distance.distance(*SymbolScope);
334   if (D == FileDistance::Unreachable)
335     return 0.6f;
336   return std::max(0.65, 2.0 * std::pow(0.6, D / 2.0));
337 }
338 
339 static llvm::Optional<llvm::StringRef>
wordMatching(llvm::StringRef Name,const llvm::StringSet<> * ContextWords)340 wordMatching(llvm::StringRef Name, const llvm::StringSet<> *ContextWords) {
341   if (ContextWords)
342     for (const auto& Word : ContextWords->keys())
343       if (Name.contains_lower(Word))
344         return Word;
345   return llvm::None;
346 }
347 
evaluate() const348 float SymbolRelevanceSignals::evaluate() const {
349   float Score = 1;
350 
351   if (Forbidden)
352     return 0;
353 
354   Score *= NameMatch;
355 
356   // File proximity scores are [0,1] and we translate them into a multiplier in
357   // the range from 1 to 3.
358   Score *= 1 + 2 * std::max(uriProximity(SymbolURI, FileProximityMatch).first,
359                             SemaFileProximityScore);
360 
361   if (ScopeProximityMatch)
362     // Use a constant scope boost for sema results, as scopes of sema results
363     // can be tricky (e.g. class/function scope). Set to the max boost as we
364     // don't load top-level symbols from the preamble and sema results are
365     // always in the accessible scope.
366     Score *=
367         SemaSaysInScope ? 2.0 : scopeBoost(*ScopeProximityMatch, SymbolScope);
368 
369   if (wordMatching(Name, ContextWords))
370     Score *= 1.5;
371 
372   // Symbols like local variables may only be referenced within their scope.
373   // Conversely if we're in that scope, it's likely we'll reference them.
374   if (Query == CodeComplete) {
375     // The narrower the scope where a symbol is visible, the more likely it is
376     // to be relevant when it is available.
377     switch (Scope) {
378     case GlobalScope:
379       break;
380     case FileScope:
381       Score *= 1.5f;
382       break;
383     case ClassScope:
384       Score *= 2;
385       break;
386     case FunctionScope:
387       Score *= 4;
388       break;
389     }
390   } else {
391     // For non-completion queries, the wider the scope where a symbol is
392     // visible, the more likely it is to be relevant.
393     switch (Scope) {
394     case GlobalScope:
395       break;
396     case FileScope:
397       Score *= 0.5f;
398       break;
399     default:
400       // TODO: Handle other scopes as we start to use them for index results.
401       break;
402     }
403   }
404 
405   if (TypeMatchesPreferred)
406     Score *= 5.0;
407 
408   // Penalize non-instance members when they are accessed via a class instance.
409   if (!IsInstanceMember &&
410       (Context == CodeCompletionContext::CCC_DotMemberAccess ||
411        Context == CodeCompletionContext::CCC_ArrowMemberAccess)) {
412     Score *= 0.2f;
413   }
414 
415   if (InBaseClass)
416     Score *= 0.5f;
417 
418   // Penalize for FixIts.
419   if (NeedsFixIts)
420     Score *= 0.5f;
421 
422   return Score;
423 }
424 
operator <<(llvm::raw_ostream & OS,const SymbolRelevanceSignals & S)425 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
426                               const SymbolRelevanceSignals &S) {
427   OS << llvm::formatv("=== Symbol relevance: {0}\n", S.evaluate());
428   OS << llvm::formatv("\tName: {0}\n", S.Name);
429   OS << llvm::formatv("\tName match: {0}\n", S.NameMatch);
430   if (S.ContextWords)
431     OS << llvm::formatv(
432         "\tMatching context word: {0}\n",
433         wordMatching(S.Name, S.ContextWords).getValueOr("<none>"));
434   OS << llvm::formatv("\tForbidden: {0}\n", S.Forbidden);
435   OS << llvm::formatv("\tNeedsFixIts: {0}\n", S.NeedsFixIts);
436   OS << llvm::formatv("\tIsInstanceMember: {0}\n", S.IsInstanceMember);
437   OS << llvm::formatv("\tContext: {0}\n", getCompletionKindString(S.Context));
438   OS << llvm::formatv("\tQuery type: {0}\n", static_cast<int>(S.Query));
439   OS << llvm::formatv("\tScope: {0}\n", static_cast<int>(S.Scope));
440 
441   OS << llvm::formatv("\tSymbol URI: {0}\n", S.SymbolURI);
442   OS << llvm::formatv("\tSymbol scope: {0}\n",
443                       S.SymbolScope ? *S.SymbolScope : "<None>");
444 
445   if (S.FileProximityMatch) {
446     auto Score = uriProximity(S.SymbolURI, S.FileProximityMatch);
447     OS << llvm::formatv("\tIndex URI proximity: {0} (distance={1})\n",
448                         Score.first, Score.second);
449   }
450   OS << llvm::formatv("\tSema file proximity: {0}\n", S.SemaFileProximityScore);
451 
452   OS << llvm::formatv("\tSema says in scope: {0}\n", S.SemaSaysInScope);
453   if (S.ScopeProximityMatch)
454     OS << llvm::formatv("\tIndex scope boost: {0}\n",
455                         scopeBoost(*S.ScopeProximityMatch, S.SymbolScope));
456 
457   OS << llvm::formatv(
458       "\tType matched preferred: {0} (Context type: {1}, Symbol type: {2}\n",
459       S.TypeMatchesPreferred, S.HadContextType, S.HadSymbolType);
460 
461   return OS;
462 }
463 
// Combines a symbol quality score and a relevance score into one final score
// by multiplying them.
float evaluateSymbolAndRelevance(float SymbolQuality, float SymbolRelevance) {
  const float Combined = SymbolQuality * SymbolRelevance;
  return Combined;
}
467 
468 // Produces an integer that sorts in the same order as F.
469 // That is: a < b <==> encodeFloat(a) < encodeFloat(b).
encodeFloat(float F)470 static uint32_t encodeFloat(float F) {
471   static_assert(std::numeric_limits<float>::is_iec559, "");
472   constexpr uint32_t TopBit = ~(~uint32_t{0} >> 1);
473 
474   // Get the bits of the float. Endianness is the same as for integers.
475   uint32_t U = llvm::FloatToBits(F);
476   // IEEE 754 floats compare like sign-magnitude integers.
477   if (U & TopBit)    // Negative float.
478     return 0 - U;    // Map onto the low half of integers, order reversed.
479   return U + TopBit; // Positive floats map onto the high half of integers.
480 }
481 
sortText(float Score,llvm::StringRef Name)482 std::string sortText(float Score, llvm::StringRef Name) {
483   // We convert -Score to an integer, and hex-encode for readability.
484   // Example: [0.5, "foo"] -> "41000000foo"
485   std::string S;
486   llvm::raw_string_ostream OS(S);
487   llvm::write_hex(OS, encodeFloat(-Score), llvm::HexPrintStyle::Lower,
488                   /*Width=*/2 * sizeof(Score));
489   OS << Name;
490   OS.flush();
491   return S;
492 }
493 
operator <<(llvm::raw_ostream & OS,const SignatureQualitySignals & S)494 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
495                               const SignatureQualitySignals &S) {
496   OS << llvm::formatv("=== Signature Quality:\n");
497   OS << llvm::formatv("\tNumber of parameters: {0}\n", S.NumberOfParameters);
498   OS << llvm::formatv("\tNumber of optional parameters: {0}\n",
499                       S.NumberOfOptionalParameters);
500   OS << llvm::formatv("\tKind: {0}\n", S.Kind);
501   return OS;
502 }
503 
504 } // namespace clangd
505 } // namespace clang
506