1 //===--- Quality.cpp ---------------------------------------------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Quality.h"
10 #include "AST.h"
11 #include "FileDistance.h"
12 #include "SourceCode.h"
13 #include "URI.h"
14 #include "index/Symbol.h"
15 #include "clang/AST/ASTContext.h"
16 #include "clang/AST/Decl.h"
17 #include "clang/AST/DeclCXX.h"
18 #include "clang/AST/DeclTemplate.h"
19 #include "clang/AST/DeclVisitor.h"
20 #include "clang/Basic/CharInfo.h"
21 #include "clang/Basic/SourceManager.h"
22 #include "clang/Sema/CodeCompleteConsumer.h"
23 #include "llvm/ADT/ArrayRef.h"
24 #include "llvm/ADT/SmallString.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/StringExtras.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/Support/Casting.h"
29 #include "llvm/Support/FormatVariadic.h"
30 #include "llvm/Support/MathExtras.h"
31 #include "llvm/Support/raw_ostream.h"
32 #include <algorithm>
33 #include <cmath>
34 
35 namespace clang {
36 namespace clangd {
isReserved(llvm::StringRef Name)37 static bool isReserved(llvm::StringRef Name) {
38   // FIXME: Should we exclude _Bool and others recognized by the standard?
39   return Name.size() >= 2 && Name[0] == '_' &&
40          (isUppercase(Name[1]) || Name[1] == '_');
41 }
42 
hasDeclInMainFile(const Decl & D)43 static bool hasDeclInMainFile(const Decl &D) {
44   auto &SourceMgr = D.getASTContext().getSourceManager();
45   for (auto *Redecl : D.redecls()) {
46     if (isInsideMainFile(Redecl->getLocation(), SourceMgr))
47       return true;
48   }
49   return false;
50 }
51 
hasUsingDeclInMainFile(const CodeCompletionResult & R)52 static bool hasUsingDeclInMainFile(const CodeCompletionResult &R) {
53   const auto &Context = R.Declaration->getASTContext();
54   const auto &SourceMgr = Context.getSourceManager();
55   if (R.ShadowDecl) {
56     if (isInsideMainFile(R.ShadowDecl->getLocation(), SourceMgr))
57       return true;
58   }
59   return false;
60 }
61 
categorize(const NamedDecl & ND)62 static SymbolQualitySignals::SymbolCategory categorize(const NamedDecl &ND) {
63   if (const auto *FD = dyn_cast<FunctionDecl>(&ND)) {
64     if (FD->isOverloadedOperator())
65       return SymbolQualitySignals::Operator;
66   }
67   class Switch
68       : public ConstDeclVisitor<Switch, SymbolQualitySignals::SymbolCategory> {
69   public:
70 #define MAP(DeclType, Category)                                                \
71   SymbolQualitySignals::SymbolCategory Visit##DeclType(const DeclType *) {     \
72     return SymbolQualitySignals::Category;                                     \
73   }
74     MAP(NamespaceDecl, Namespace);
75     MAP(NamespaceAliasDecl, Namespace);
76     MAP(TypeDecl, Type);
77     MAP(TypeAliasTemplateDecl, Type);
78     MAP(ClassTemplateDecl, Type);
79     MAP(CXXConstructorDecl, Constructor);
80     MAP(CXXDestructorDecl, Destructor);
81     MAP(ValueDecl, Variable);
82     MAP(VarTemplateDecl, Variable);
83     MAP(FunctionDecl, Function);
84     MAP(FunctionTemplateDecl, Function);
85     MAP(Decl, Unknown);
86 #undef MAP
87   };
88   return Switch().Visit(&ND);
89 }
90 
91 static SymbolQualitySignals::SymbolCategory
categorize(const CodeCompletionResult & R)92 categorize(const CodeCompletionResult &R) {
93   if (R.Declaration)
94     return categorize(*R.Declaration);
95   if (R.Kind == CodeCompletionResult::RK_Macro)
96     return SymbolQualitySignals::Macro;
97   // Everything else is a keyword or a pattern. Patterns are mostly keywords
98   // too, except a few which we recognize by cursor kind.
99   switch (R.CursorKind) {
100   case CXCursor_CXXMethod:
101     return SymbolQualitySignals::Function;
102   case CXCursor_ModuleImportDecl:
103     return SymbolQualitySignals::Namespace;
104   case CXCursor_MacroDefinition:
105     return SymbolQualitySignals::Macro;
106   case CXCursor_TypeRef:
107     return SymbolQualitySignals::Type;
108   case CXCursor_MemberRef:
109     return SymbolQualitySignals::Variable;
110   case CXCursor_Constructor:
111     return SymbolQualitySignals::Constructor;
112   default:
113     return SymbolQualitySignals::Keyword;
114   }
115 }
116 
117 static SymbolQualitySignals::SymbolCategory
categorize(const index::SymbolInfo & D)118 categorize(const index::SymbolInfo &D) {
119   switch (D.Kind) {
120   case index::SymbolKind::Namespace:
121   case index::SymbolKind::NamespaceAlias:
122     return SymbolQualitySignals::Namespace;
123   case index::SymbolKind::Macro:
124     return SymbolQualitySignals::Macro;
125   case index::SymbolKind::Enum:
126   case index::SymbolKind::Struct:
127   case index::SymbolKind::Class:
128   case index::SymbolKind::Protocol:
129   case index::SymbolKind::Extension:
130   case index::SymbolKind::Union:
131   case index::SymbolKind::TypeAlias:
132   case index::SymbolKind::TemplateTypeParm:
133   case index::SymbolKind::TemplateTemplateParm:
134     return SymbolQualitySignals::Type;
135   case index::SymbolKind::Function:
136   case index::SymbolKind::ClassMethod:
137   case index::SymbolKind::InstanceMethod:
138   case index::SymbolKind::StaticMethod:
139   case index::SymbolKind::InstanceProperty:
140   case index::SymbolKind::ClassProperty:
141   case index::SymbolKind::StaticProperty:
142   case index::SymbolKind::ConversionFunction:
143     return SymbolQualitySignals::Function;
144   case index::SymbolKind::Destructor:
145     return SymbolQualitySignals::Destructor;
146   case index::SymbolKind::Constructor:
147     return SymbolQualitySignals::Constructor;
148   case index::SymbolKind::Variable:
149   case index::SymbolKind::Field:
150   case index::SymbolKind::EnumConstant:
151   case index::SymbolKind::Parameter:
152   case index::SymbolKind::NonTypeTemplateParm:
153     return SymbolQualitySignals::Variable;
154   case index::SymbolKind::Using:
155   case index::SymbolKind::Module:
156   case index::SymbolKind::Unknown:
157     return SymbolQualitySignals::Unknown;
158   }
159   llvm_unreachable("Unknown index::SymbolKind");
160 }
161 
isInstanceMember(const NamedDecl * ND)162 static bool isInstanceMember(const NamedDecl *ND) {
163   if (!ND)
164     return false;
165   if (const auto *TP = dyn_cast<FunctionTemplateDecl>(ND))
166     ND = TP->TemplateDecl::getTemplatedDecl();
167   if (const auto *CM = dyn_cast<CXXMethodDecl>(ND))
168     return !CM->isStatic();
169   return isa<FieldDecl>(ND); // Note that static fields are VarDecl.
170 }
171 
isInstanceMember(const index::SymbolInfo & D)172 static bool isInstanceMember(const index::SymbolInfo &D) {
173   switch (D.Kind) {
174   case index::SymbolKind::InstanceMethod:
175   case index::SymbolKind::InstanceProperty:
176   case index::SymbolKind::Field:
177     return true;
178   default:
179     return false;
180   }
181 }
182 
merge(const CodeCompletionResult & SemaCCResult)183 void SymbolQualitySignals::merge(const CodeCompletionResult &SemaCCResult) {
184   Deprecated |= (SemaCCResult.Availability == CXAvailability_Deprecated);
185   Category = categorize(SemaCCResult);
186 
187   if (SemaCCResult.Declaration) {
188     ImplementationDetail |= isImplementationDetail(SemaCCResult.Declaration);
189     if (auto *ID = SemaCCResult.Declaration->getIdentifier())
190       ReservedName = ReservedName || isReserved(ID->getName());
191   } else if (SemaCCResult.Kind == CodeCompletionResult::RK_Macro)
192     ReservedName = ReservedName || isReserved(SemaCCResult.Macro->getName());
193 }
194 
merge(const Symbol & IndexResult)195 void SymbolQualitySignals::merge(const Symbol &IndexResult) {
196   Deprecated |= (IndexResult.Flags & Symbol::Deprecated);
197   ImplementationDetail |= (IndexResult.Flags & Symbol::ImplementationDetail);
198   References = std::max(IndexResult.References, References);
199   Category = categorize(IndexResult.SymInfo);
200   ReservedName = ReservedName || isReserved(IndexResult.Name);
201 }
202 
evaluate() const203 float SymbolQualitySignals::evaluate() const {
204   float Score = 1;
205 
206   // This avoids a sharp gradient for tail symbols, and also neatly avoids the
207   // question of whether 0 references means a bad symbol or missing data.
208   if (References >= 10) {
209     // Use a sigmoid style boosting function, which flats out nicely for large
210     // numbers (e.g. 2.58 for 1M references).
211     // The following boosting function is equivalent to:
212     //   m = 0.06
213     //   f = 12.0
214     //   boost = f * sigmoid(m * std::log(References)) - 0.5 * f + 0.59
215     // Sample data points: (10, 1.00), (100, 1.41), (1000, 1.82),
216     //                     (10K, 2.21), (100K, 2.58), (1M, 2.94)
217     float S = std::pow(References, -0.06);
218     Score *= 6.0 * (1 - S) / (1 + S) + 0.59;
219   }
220 
221   if (Deprecated)
222     Score *= 0.1f;
223   if (ReservedName)
224     Score *= 0.1f;
225   if (ImplementationDetail)
226     Score *= 0.2f;
227 
228   switch (Category) {
229   case Keyword: // Often relevant, but misses most signals.
230     Score *= 4; // FIXME: important keywords should have specific boosts.
231     break;
232   case Type:
233   case Function:
234   case Variable:
235     Score *= 1.1f;
236     break;
237   case Namespace:
238     Score *= 0.8f;
239     break;
240   case Macro:
241   case Destructor:
242   case Operator:
243     Score *= 0.5f;
244     break;
245   case Constructor: // No boost constructors so they are after class types.
246   case Unknown:
247     break;
248   }
249 
250   return Score;
251 }
252 
operator <<(llvm::raw_ostream & OS,const SymbolQualitySignals & S)253 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
254                               const SymbolQualitySignals &S) {
255   OS << llvm::formatv("=== Symbol quality: {0}\n", S.evaluate());
256   OS << llvm::formatv("\tReferences: {0}\n", S.References);
257   OS << llvm::formatv("\tDeprecated: {0}\n", S.Deprecated);
258   OS << llvm::formatv("\tReserved name: {0}\n", S.ReservedName);
259   OS << llvm::formatv("\tCategory: {0}\n", static_cast<int>(S.Category));
260   return OS;
261 }
262 
263 static SymbolRelevanceSignals::AccessibleScope
computeScope(const NamedDecl * D)264 computeScope(const NamedDecl *D) {
265   // Injected "Foo" within the class "Foo" has file scope, not class scope.
266   const DeclContext *DC = D->getDeclContext();
267   if (auto *R = dyn_cast_or_null<RecordDecl>(D))
268     if (R->isInjectedClassName())
269       DC = DC->getParent();
270   // Class constructor should have the same scope as the class.
271   if (isa<CXXConstructorDecl>(D))
272     DC = DC->getParent();
273   bool InClass = false;
274   for (; !DC->isFileContext(); DC = DC->getParent()) {
275     if (DC->isFunctionOrMethod())
276       return SymbolRelevanceSignals::FunctionScope;
277     InClass = InClass || DC->isRecord();
278   }
279   if (InClass)
280     return SymbolRelevanceSignals::ClassScope;
281   // ExternalLinkage threshold could be tweaked, e.g. module-visible as global.
282   // Avoid caching linkage if it may change after enclosing code completion.
283   if (hasUnstableLinkage(D) || D->getLinkageInternal() < ExternalLinkage)
284     return SymbolRelevanceSignals::FileScope;
285   return SymbolRelevanceSignals::GlobalScope;
286 }
287 
merge(const Symbol & IndexResult)288 void SymbolRelevanceSignals::merge(const Symbol &IndexResult) {
289   SymbolURI = IndexResult.CanonicalDeclaration.FileURI;
290   SymbolScope = IndexResult.Scope;
291   IsInstanceMember |= isInstanceMember(IndexResult.SymInfo);
292   if (!(IndexResult.Flags & Symbol::VisibleOutsideFile)) {
293     Scope = AccessibleScope::FileScope;
294   }
295 }
296 
merge(const CodeCompletionResult & SemaCCResult)297 void SymbolRelevanceSignals::merge(const CodeCompletionResult &SemaCCResult) {
298   if (SemaCCResult.Availability == CXAvailability_NotAvailable ||
299       SemaCCResult.Availability == CXAvailability_NotAccessible)
300     Forbidden = true;
301 
302   if (SemaCCResult.Declaration) {
303     SemaSaysInScope = true;
304     // We boost things that have decls in the main file. We give a fixed score
305     // for all other declarations in sema as they are already included in the
306     // translation unit.
307     float DeclProximity = (hasDeclInMainFile(*SemaCCResult.Declaration) ||
308                            hasUsingDeclInMainFile(SemaCCResult))
309                               ? 1.0
310                               : 0.6;
311     SemaFileProximityScore = std::max(DeclProximity, SemaFileProximityScore);
312     IsInstanceMember |= isInstanceMember(SemaCCResult.Declaration);
313     InBaseClass |= SemaCCResult.InBaseClass;
314   }
315 
316   // Declarations are scoped, others (like macros) are assumed global.
317   if (SemaCCResult.Declaration)
318     Scope = std::min(Scope, computeScope(SemaCCResult.Declaration));
319 
320   NeedsFixIts = !SemaCCResult.FixIts.empty();
321 }
322 
uriProximity(llvm::StringRef SymbolURI,URIDistance * D)323 static std::pair<float, unsigned> uriProximity(llvm::StringRef SymbolURI,
324                                                URIDistance *D) {
325   if (!D || SymbolURI.empty())
326     return {0.f, 0u};
327   unsigned Distance = D->distance(SymbolURI);
328   // Assume approximately default options are used for sensible scoring.
329   return {std::exp(Distance * -0.4f / FileDistanceOptions().UpCost), Distance};
330 }
331 
scopeBoost(ScopeDistance & Distance,llvm::Optional<llvm::StringRef> SymbolScope)332 static float scopeBoost(ScopeDistance &Distance,
333                         llvm::Optional<llvm::StringRef> SymbolScope) {
334   if (!SymbolScope)
335     return 1;
336   auto D = Distance.distance(*SymbolScope);
337   if (D == FileDistance::Unreachable)
338     return 0.6f;
339   return std::max(0.65, 2.0 * std::pow(0.6, D / 2.0));
340 }
341 
342 static llvm::Optional<llvm::StringRef>
wordMatching(llvm::StringRef Name,const llvm::StringSet<> * ContextWords)343 wordMatching(llvm::StringRef Name, const llvm::StringSet<> *ContextWords) {
344   if (ContextWords)
345     for (const auto& Word : ContextWords->keys())
346       if (Name.contains_lower(Word))
347         return Word;
348   return llvm::None;
349 }
350 
evaluate() const351 float SymbolRelevanceSignals::evaluate() const {
352   float Score = 1;
353 
354   if (Forbidden)
355     return 0;
356 
357   Score *= NameMatch;
358 
359   // File proximity scores are [0,1] and we translate them into a multiplier in
360   // the range from 1 to 3.
361   Score *= 1 + 2 * std::max(uriProximity(SymbolURI, FileProximityMatch).first,
362                             SemaFileProximityScore);
363 
364   if (ScopeProximityMatch)
365     // Use a constant scope boost for sema results, as scopes of sema results
366     // can be tricky (e.g. class/function scope). Set to the max boost as we
367     // don't load top-level symbols from the preamble and sema results are
368     // always in the accessible scope.
369     Score *=
370         SemaSaysInScope ? 2.0 : scopeBoost(*ScopeProximityMatch, SymbolScope);
371 
372   if (wordMatching(Name, ContextWords))
373     Score *= 1.5;
374 
375   // Symbols like local variables may only be referenced within their scope.
376   // Conversely if we're in that scope, it's likely we'll reference them.
377   if (Query == CodeComplete) {
378     // The narrower the scope where a symbol is visible, the more likely it is
379     // to be relevant when it is available.
380     switch (Scope) {
381     case GlobalScope:
382       break;
383     case FileScope:
384       Score *= 1.5f;
385       break;
386     case ClassScope:
387       Score *= 2;
388       break;
389     case FunctionScope:
390       Score *= 4;
391       break;
392     }
393   } else {
394     // For non-completion queries, the wider the scope where a symbol is
395     // visible, the more likely it is to be relevant.
396     switch (Scope) {
397     case GlobalScope:
398       break;
399     case FileScope:
400       Score *= 0.5f;
401       break;
402     default:
403       // TODO: Handle other scopes as we start to use them for index results.
404       break;
405     }
406   }
407 
408   if (TypeMatchesPreferred)
409     Score *= 5.0;
410 
411   // Penalize non-instance members when they are accessed via a class instance.
412   if (!IsInstanceMember &&
413       (Context == CodeCompletionContext::CCC_DotMemberAccess ||
414        Context == CodeCompletionContext::CCC_ArrowMemberAccess)) {
415     Score *= 0.2f;
416   }
417 
418   if (InBaseClass)
419     Score *= 0.5f;
420 
421   // Penalize for FixIts.
422   if (NeedsFixIts)
423     Score *= 0.5f;
424 
425   return Score;
426 }
427 
operator <<(llvm::raw_ostream & OS,const SymbolRelevanceSignals & S)428 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
429                               const SymbolRelevanceSignals &S) {
430   OS << llvm::formatv("=== Symbol relevance: {0}\n", S.evaluate());
431   OS << llvm::formatv("\tName: {0}\n", S.Name);
432   OS << llvm::formatv("\tName match: {0}\n", S.NameMatch);
433   if (S.ContextWords)
434     OS << llvm::formatv(
435         "\tMatching context word: {0}\n",
436         wordMatching(S.Name, S.ContextWords).getValueOr("<none>"));
437   OS << llvm::formatv("\tForbidden: {0}\n", S.Forbidden);
438   OS << llvm::formatv("\tNeedsFixIts: {0}\n", S.NeedsFixIts);
439   OS << llvm::formatv("\tIsInstanceMember: {0}\n", S.IsInstanceMember);
440   OS << llvm::formatv("\tContext: {0}\n", getCompletionKindString(S.Context));
441   OS << llvm::formatv("\tQuery type: {0}\n", static_cast<int>(S.Query));
442   OS << llvm::formatv("\tScope: {0}\n", static_cast<int>(S.Scope));
443 
444   OS << llvm::formatv("\tSymbol URI: {0}\n", S.SymbolURI);
445   OS << llvm::formatv("\tSymbol scope: {0}\n",
446                       S.SymbolScope ? *S.SymbolScope : "<None>");
447 
448   if (S.FileProximityMatch) {
449     auto Score = uriProximity(S.SymbolURI, S.FileProximityMatch);
450     OS << llvm::formatv("\tIndex URI proximity: {0} (distance={1})\n",
451                         Score.first, Score.second);
452   }
453   OS << llvm::formatv("\tSema file proximity: {0}\n", S.SemaFileProximityScore);
454 
455   OS << llvm::formatv("\tSema says in scope: {0}\n", S.SemaSaysInScope);
456   if (S.ScopeProximityMatch)
457     OS << llvm::formatv("\tIndex scope boost: {0}\n",
458                         scopeBoost(*S.ScopeProximityMatch, S.SymbolScope));
459 
460   OS << llvm::formatv(
461       "\tType matched preferred: {0} (Context type: {1}, Symbol type: {2}\n",
462       S.TypeMatchesPreferred, S.HadContextType, S.HadSymbolType);
463 
464   return OS;
465 }
466 
/// Combines a symbol quality score and a symbol relevance score into a single
/// score by multiplying them.
float evaluateSymbolAndRelevance(float SymbolQuality, float SymbolRelevance) {
  const float Combined = SymbolQuality * SymbolRelevance;
  return Combined;
}
470 
471 // Produces an integer that sorts in the same order as F.
472 // That is: a < b <==> encodeFloat(a) < encodeFloat(b).
encodeFloat(float F)473 static uint32_t encodeFloat(float F) {
474   static_assert(std::numeric_limits<float>::is_iec559, "");
475   constexpr uint32_t TopBit = ~(~uint32_t{0} >> 1);
476 
477   // Get the bits of the float. Endianness is the same as for integers.
478   uint32_t U = llvm::FloatToBits(F);
479   // IEEE 754 floats compare like sign-magnitude integers.
480   if (U & TopBit)    // Negative float.
481     return 0 - U;    // Map onto the low half of integers, order reversed.
482   return U + TopBit; // Positive floats map onto the high half of integers.
483 }
484 
sortText(float Score,llvm::StringRef Name)485 std::string sortText(float Score, llvm::StringRef Name) {
486   // We convert -Score to an integer, and hex-encode for readability.
487   // Example: [0.5, "foo"] -> "41000000foo"
488   std::string S;
489   llvm::raw_string_ostream OS(S);
490   llvm::write_hex(OS, encodeFloat(-Score), llvm::HexPrintStyle::Lower,
491                   /*Width=*/2 * sizeof(Score));
492   OS << Name;
493   OS.flush();
494   return S;
495 }
496 
operator <<(llvm::raw_ostream & OS,const SignatureQualitySignals & S)497 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
498                               const SignatureQualitySignals &S) {
499   OS << llvm::formatv("=== Signature Quality:\n");
500   OS << llvm::formatv("\tNumber of parameters: {0}\n", S.NumberOfParameters);
501   OS << llvm::formatv("\tNumber of optional parameters: {0}\n",
502                       S.NumberOfOptionalParameters);
503   OS << llvm::formatv("\tKind: {0}\n", S.Kind);
504   return OS;
505 }
506 
507 } // namespace clangd
508 } // namespace clang
509