1 //===-- BlockCoverageInference.cpp - Minimal Execution Coverage -*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Our algorithm works by first identifying a subset of nodes that must always
10 // be instrumented. We call these nodes ambiguous because knowing the coverage
11 // of all remaining nodes is not enough to infer their coverage status.
12 //
13 // In general a node v is ambiguous if there exists two entry-to-terminal paths
14 // P_1 and P_2 such that:
15 //   1. v not in P_1 but P_1 visits a predecessor of v, and
16 //   2. v not in P_2 but P_2 visits a successor of v.
17 //
18 // If a node v is not ambiguous, then if condition 1 fails, we can infer v’s
19 // coverage from the coverage of its predecessors, or if condition 2 fails, we
20 // can infer v’s coverage from the coverage of its successors.
21 //
22 // Sadly, there are example CFGs where it is not possible to infer all nodes
23 // from the ambiguous nodes alone. Our algorithm selects a minimum number of
24 // extra nodes to add to the ambiguous nodes to form a valid instrumentation S.
25 //
26 // Details on this algorithm can be found in https://arxiv.org/abs/2208.13907
27 //
28 //===----------------------------------------------------------------------===//
29 
30 #include "llvm/Transforms/Instrumentation/BlockCoverageInference.h"
31 #include "llvm/ADT/DepthFirstIterator.h"
32 #include "llvm/ADT/Statistic.h"
33 #include "llvm/Support/CRC.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/GraphWriter.h"
36 #include "llvm/Support/raw_ostream.h"
37 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
38 
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "pgo-block-coverage"
42 
43 STATISTIC(NumFunctions, "Number of total functions that BCI has processed");
44 STATISTIC(NumIneligibleFunctions,
45           "Number of functions for which BCI cannot run on");
46 STATISTIC(NumBlocks, "Number of total basic blocks that BCI has processed");
47 STATISTIC(NumInstrumentedBlocks,
48           "Number of basic blocks instrumented for coverage");
49 
BlockCoverageInference(const Function & F,bool ForceInstrumentEntry)50 BlockCoverageInference::BlockCoverageInference(const Function &F,
51                                                bool ForceInstrumentEntry)
52     : F(F), ForceInstrumentEntry(ForceInstrumentEntry) {
53   findDependencies();
54   assert(!ForceInstrumentEntry || shouldInstrumentBlock(F.getEntryBlock()));
55 
56   ++NumFunctions;
57   for (auto &BB : F) {
58     ++NumBlocks;
59     if (shouldInstrumentBlock(BB))
60       ++NumInstrumentedBlocks;
61   }
62 }
63 
64 BlockCoverageInference::BlockSet
getDependencies(const BasicBlock & BB) const65 BlockCoverageInference::getDependencies(const BasicBlock &BB) const {
66   assert(BB.getParent() == &F);
67   BlockSet Dependencies;
68   auto It = PredecessorDependencies.find(&BB);
69   if (It != PredecessorDependencies.end())
70     Dependencies.set_union(It->second);
71   It = SuccessorDependencies.find(&BB);
72   if (It != SuccessorDependencies.end())
73     Dependencies.set_union(It->second);
74   return Dependencies;
75 }
76 
getInstrumentedBlocksHash() const77 uint64_t BlockCoverageInference::getInstrumentedBlocksHash() const {
78   JamCRC JC;
79   uint64_t Index = 0;
80   for (auto &BB : F) {
81     if (shouldInstrumentBlock(BB)) {
82       uint8_t Data[8];
83       support::endian::write64le(Data, Index);
84       JC.update(Data);
85     }
86     Index++;
87   }
88   return JC.getCRC();
89 }
90 
shouldInstrumentBlock(const BasicBlock & BB) const91 bool BlockCoverageInference::shouldInstrumentBlock(const BasicBlock &BB) const {
92   assert(BB.getParent() == &F);
93   auto It = PredecessorDependencies.find(&BB);
94   if (It != PredecessorDependencies.end() && It->second.size())
95     return false;
96   It = SuccessorDependencies.find(&BB);
97   if (It != SuccessorDependencies.end() && It->second.size())
98     return false;
99   return true;
100 }
101 
findDependencies()102 void BlockCoverageInference::findDependencies() {
103   assert(PredecessorDependencies.empty() && SuccessorDependencies.empty());
104   // Empirical analysis shows that this algorithm finishes within 5 seconds for
105   // functions with fewer than 1.5K blocks.
106   if (F.hasFnAttribute(Attribute::NoReturn) || F.size() > 1500) {
107     ++NumIneligibleFunctions;
108     return;
109   }
110 
111   SmallVector<const BasicBlock *, 4> TerminalBlocks;
112   for (auto &BB : F)
113     if (succ_empty(&BB))
114       TerminalBlocks.push_back(&BB);
115 
116   // Traverse the CFG backwards from the terminal blocks to make sure every
117   // block can reach some terminal block. Otherwise this algorithm will not work
118   // and we must fall back to instrumenting every block.
119   df_iterator_default_set<const BasicBlock *> Visited;
120   for (auto *BB : TerminalBlocks)
121     for (auto *N : inverse_depth_first_ext(BB, Visited))
122       (void)N;
123   if (F.size() != Visited.size()) {
124     ++NumIneligibleFunctions;
125     return;
126   }
127 
128   // The current implementation for computing `PredecessorDependencies` and
129   // `SuccessorDependencies` runs in quadratic time with respect to the number
130   // of basic blocks. While we do have a more complicated linear time algorithm
131   // in https://arxiv.org/abs/2208.13907 we do not know if it will give a
132   // significant speedup in practice given that most functions tend to be
133   // relatively small in size for intended use cases.
134   auto &EntryBlock = F.getEntryBlock();
135   for (auto &BB : F) {
136     // The set of blocks that are reachable while avoiding BB.
137     BlockSet ReachableFromEntry, ReachableFromTerminal;
138     getReachableAvoiding(EntryBlock, BB, /*IsForward=*/true,
139                          ReachableFromEntry);
140     for (auto *TerminalBlock : TerminalBlocks)
141       getReachableAvoiding(*TerminalBlock, BB, /*IsForward=*/false,
142                            ReachableFromTerminal);
143 
144     auto Preds = predecessors(&BB);
145     bool HasSuperReachablePred = llvm::any_of(Preds, [&](auto *Pred) {
146       return ReachableFromEntry.count(Pred) &&
147              ReachableFromTerminal.count(Pred);
148     });
149     if (!HasSuperReachablePred)
150       for (auto *Pred : Preds)
151         if (ReachableFromEntry.count(Pred))
152           PredecessorDependencies[&BB].insert(Pred);
153 
154     auto Succs = successors(&BB);
155     bool HasSuperReachableSucc = llvm::any_of(Succs, [&](auto *Succ) {
156       return ReachableFromEntry.count(Succ) &&
157              ReachableFromTerminal.count(Succ);
158     });
159     if (!HasSuperReachableSucc)
160       for (auto *Succ : Succs)
161         if (ReachableFromTerminal.count(Succ))
162           SuccessorDependencies[&BB].insert(Succ);
163   }
164 
165   if (ForceInstrumentEntry) {
166     // Force the entry block to be instrumented by clearing the blocks it can
167     // infer coverage from.
168     PredecessorDependencies[&EntryBlock].clear();
169     SuccessorDependencies[&EntryBlock].clear();
170   }
171 
172   // Construct a graph where blocks are connected if there is a mutual
173   // dependency between them. This graph has a special property that it contains
174   // only paths.
175   DenseMap<const BasicBlock *, BlockSet> AdjacencyList;
176   for (auto &BB : F) {
177     for (auto *Succ : successors(&BB)) {
178       if (SuccessorDependencies[&BB].count(Succ) &&
179           PredecessorDependencies[Succ].count(&BB)) {
180         AdjacencyList[&BB].insert(Succ);
181         AdjacencyList[Succ].insert(&BB);
182       }
183     }
184   }
185 
186   // Given a path with at least one node, return the next node on the path.
187   auto getNextOnPath = [&](BlockSet &Path) -> const BasicBlock * {
188     assert(Path.size());
189     auto &Neighbors = AdjacencyList[Path.back()];
190     if (Path.size() == 1) {
191       // This is the first node on the path, return its neighbor.
192       assert(Neighbors.size() == 1);
193       return Neighbors.front();
194     } else if (Neighbors.size() == 2) {
195       // This is the middle of the path, find the neighbor that is not on the
196       // path already.
197       assert(Path.size() >= 2);
198       return Path.count(Neighbors[0]) ? Neighbors[1] : Neighbors[0];
199     }
200     // This is the end of the path.
201     assert(Neighbors.size() == 1);
202     return nullptr;
203   };
204 
205   // Remove all cycles in the inferencing graph.
206   for (auto &BB : F) {
207     if (AdjacencyList[&BB].size() == 1) {
208       // We found the head of some path.
209       BlockSet Path;
210       Path.insert(&BB);
211       while (const BasicBlock *Next = getNextOnPath(Path))
212         Path.insert(Next);
213       LLVM_DEBUG(dbgs() << "Found path: " << getBlockNames(Path) << "\n");
214 
215       // Remove these nodes from the graph so we don't discover this path again.
216       for (auto *BB : Path)
217         AdjacencyList[BB].clear();
218 
219       // Finally, remove the cycles.
220       if (PredecessorDependencies[Path.front()].size()) {
221         for (auto *BB : Path)
222           if (BB != Path.back())
223             SuccessorDependencies[BB].clear();
224       } else {
225         for (auto *BB : Path)
226           if (BB != Path.front())
227             PredecessorDependencies[BB].clear();
228       }
229     }
230   }
231   LLVM_DEBUG(dump(dbgs()));
232 }
233 
getReachableAvoiding(const BasicBlock & Start,const BasicBlock & Avoid,bool IsForward,BlockSet & Reachable) const234 void BlockCoverageInference::getReachableAvoiding(const BasicBlock &Start,
235                                                   const BasicBlock &Avoid,
236                                                   bool IsForward,
237                                                   BlockSet &Reachable) const {
238   df_iterator_default_set<const BasicBlock *> Visited;
239   Visited.insert(&Avoid);
240   if (IsForward) {
241     auto Range = depth_first_ext(&Start, Visited);
242     Reachable.insert(Range.begin(), Range.end());
243   } else {
244     auto Range = inverse_depth_first_ext(&Start, Visited);
245     Reachable.insert(Range.begin(), Range.end());
246   }
247 }
248 
249 namespace llvm {
250 class DotFuncBCIInfo {
251 private:
252   const BlockCoverageInference *BCI;
253   const DenseMap<const BasicBlock *, bool> *Coverage;
254 
255 public:
DotFuncBCIInfo(const BlockCoverageInference * BCI,const DenseMap<const BasicBlock *,bool> * Coverage)256   DotFuncBCIInfo(const BlockCoverageInference *BCI,
257                  const DenseMap<const BasicBlock *, bool> *Coverage)
258       : BCI(BCI), Coverage(Coverage) {}
259 
getFunction()260   const Function &getFunction() { return BCI->F; }
261 
isInstrumented(const BasicBlock * BB) const262   bool isInstrumented(const BasicBlock *BB) const {
263     return BCI->shouldInstrumentBlock(*BB);
264   }
265 
isCovered(const BasicBlock * BB) const266   bool isCovered(const BasicBlock *BB) const {
267     return Coverage && Coverage->lookup(BB);
268   }
269 
isDependent(const BasicBlock * Src,const BasicBlock * Dest) const270   bool isDependent(const BasicBlock *Src, const BasicBlock *Dest) const {
271     return BCI->getDependencies(*Src).count(Dest);
272   }
273 };
274 
275 template <>
276 struct GraphTraits<DotFuncBCIInfo *> : public GraphTraits<const BasicBlock *> {
getEntryNodellvm::GraphTraits277   static NodeRef getEntryNode(DotFuncBCIInfo *Info) {
278     return &(Info->getFunction().getEntryBlock());
279   }
280 
281   // nodes_iterator/begin/end - Allow iteration over all nodes in the graph
282   using nodes_iterator = pointer_iterator<Function::const_iterator>;
283 
nodes_beginllvm::GraphTraits284   static nodes_iterator nodes_begin(DotFuncBCIInfo *Info) {
285     return nodes_iterator(Info->getFunction().begin());
286   }
287 
nodes_endllvm::GraphTraits288   static nodes_iterator nodes_end(DotFuncBCIInfo *Info) {
289     return nodes_iterator(Info->getFunction().end());
290   }
291 
sizellvm::GraphTraits292   static size_t size(DotFuncBCIInfo *Info) {
293     return Info->getFunction().size();
294   }
295 };
296 
297 template <>
298 struct DOTGraphTraits<DotFuncBCIInfo *> : public DefaultDOTGraphTraits {
299 
DOTGraphTraitsllvm::DOTGraphTraits300   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
301 
getGraphNamellvm::DOTGraphTraits302   static std::string getGraphName(DotFuncBCIInfo *Info) {
303     return "BCI CFG for " + Info->getFunction().getName().str();
304   }
305 
getNodeLabelllvm::DOTGraphTraits306   std::string getNodeLabel(const BasicBlock *Node, DotFuncBCIInfo *Info) {
307     return Node->getName().str();
308   }
309 
getEdgeAttributesllvm::DOTGraphTraits310   std::string getEdgeAttributes(const BasicBlock *Src, const_succ_iterator I,
311                                 DotFuncBCIInfo *Info) {
312     const BasicBlock *Dest = *I;
313     if (Info->isDependent(Src, Dest))
314       return "color=red";
315     if (Info->isDependent(Dest, Src))
316       return "color=blue";
317     return "";
318   }
319 
getNodeAttributesllvm::DOTGraphTraits320   std::string getNodeAttributes(const BasicBlock *Node, DotFuncBCIInfo *Info) {
321     std::string Result;
322     if (Info->isInstrumented(Node))
323       Result += "style=filled,fillcolor=gray";
324     if (Info->isCovered(Node))
325       Result += std::string(Result.empty() ? "" : ",") + "color=red";
326     return Result;
327   }
328 };
329 
330 } // namespace llvm
331 
viewBlockCoverageGraph(const DenseMap<const BasicBlock *,bool> * Coverage) const332 void BlockCoverageInference::viewBlockCoverageGraph(
333     const DenseMap<const BasicBlock *, bool> *Coverage) const {
334   DotFuncBCIInfo Info(this, Coverage);
335   WriteGraph(&Info, "BCI", false,
336              "Block Coverage Inference for " + F.getName());
337 }
338 
dump(raw_ostream & OS) const339 void BlockCoverageInference::dump(raw_ostream &OS) const {
340   OS << "Minimal block coverage for function \'" << F.getName()
341      << "\' (Instrumented=*)\n";
342   for (auto &BB : F) {
343     OS << (shouldInstrumentBlock(BB) ? "* " : "  ") << BB.getName() << "\n";
344     auto It = PredecessorDependencies.find(&BB);
345     if (It != PredecessorDependencies.end() && It->second.size())
346       OS << "    PredDeps = " << getBlockNames(It->second) << "\n";
347     It = SuccessorDependencies.find(&BB);
348     if (It != SuccessorDependencies.end() && It->second.size())
349       OS << "    SuccDeps = " << getBlockNames(It->second) << "\n";
350   }
351   OS << "  Instrumented Blocks Hash = 0x"
352      << Twine::utohexstr(getInstrumentedBlocksHash()) << "\n";
353 }
354 
355 std::string
getBlockNames(ArrayRef<const BasicBlock * > BBs)356 BlockCoverageInference::getBlockNames(ArrayRef<const BasicBlock *> BBs) {
357   std::string Result;
358   raw_string_ostream OS(Result);
359   OS << "[";
360   if (!BBs.empty()) {
361     OS << BBs.front()->getName();
362     BBs = BBs.drop_front();
363   }
364   for (auto *BB : BBs)
365     OS << ", " << BB->getName();
366   OS << "]";
367   return OS.str();
368 }
369