1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/CallSite.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstVisitor.h"
32 #include "llvm/IR/InstrTypes.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/LLVMContext.h"
36 #include "llvm/IR/PassManager.h"
37 #include "llvm/IR/Type.h"
38 #include "llvm/InitializePasses.h"
39 #include "llvm/Pass.h"
40 #include "llvm/PassRegistry.h"
41 #include "llvm/PassSupport.h"
42 #include "llvm/ProfileData/InstrProf.h"
43 #include "llvm/Support/Casting.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/MathExtras.h"
48 #include "llvm/Transforms/Instrumentation.h"
49 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
50 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
51 #include <cassert>
52 #include <cstdint>
53 #include <vector>
54 
55 using namespace llvm;
56 
57 #define DEBUG_TYPE "pgo-memop-opt"
58 
59 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
60 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
61 
62 // The minimum call count to optimize memory intrinsic calls.
63 static cl::opt<unsigned>
64     MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
65                         cl::init(1000),
66                         cl::desc("The minimum count to optimize memory "
67                                  "intrinsic calls"));
68 
69 // Command line option to disable memory intrinsic optimization. The default is
70 // false. This is for debug purpose.
71 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
72                                      cl::Hidden, cl::desc("Disable optimize"));
73 
74 // The percent threshold to optimize memory intrinsic calls.
75 static cl::opt<unsigned>
76     MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
77                           cl::Hidden, cl::ZeroOrMore,
78                           cl::desc("The percentage threshold for the "
79                                    "memory intrinsic calls optimization"));
80 
81 // Maximum number of versions for optimizing memory intrinsic call.
82 static cl::opt<unsigned>
83     MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
84                     cl::ZeroOrMore,
85                     cl::desc("The max version for the optimized memory "
86                              " intrinsic calls"));
87 
88 // Scale the counts from the annotation using the BB count value.
89 static cl::opt<bool>
90     MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
91                     cl::desc("Scale the memop size counts using the basic "
92                              " block count value"));
93 
94 // This option sets the rangge of precise profile memop sizes.
95 extern cl::opt<std::string> MemOPSizeRange;
96 
97 // This option sets the value that groups large memop sizes
98 extern cl::opt<unsigned> MemOPSizeLarge;
99 
100 namespace {
101 class PGOMemOPSizeOptLegacyPass : public FunctionPass {
102 public:
103   static char ID;
104 
105   PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
106     initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
107   }
108 
109   StringRef getPassName() const override { return "PGOMemOPSize"; }
110 
111 private:
112   bool runOnFunction(Function &F) override;
113   void getAnalysisUsage(AnalysisUsage &AU) const override {
114     AU.addRequired<BlockFrequencyInfoWrapperPass>();
115     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
116     AU.addPreserved<GlobalsAAWrapperPass>();
117     AU.addPreserved<DominatorTreeWrapperPass>();
118   }
119 };
120 } // end anonymous namespace
121 
122 char PGOMemOPSizeOptLegacyPass::ID = 0;
123 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
124                       "Optimize memory intrinsic using its size value profile",
125                       false, false)
126 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
127 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
128                     "Optimize memory intrinsic using its size value profile",
129                     false, false)
130 
131 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
132   return new PGOMemOPSizeOptLegacyPass();
133 }
134 
135 namespace {
136 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
137 public:
138   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
139                OptimizationRemarkEmitter &ORE, DominatorTree *DT)
140       : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
141     ValueDataArray =
142         std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
143     // Get the MemOPSize range information from option MemOPSizeRange,
144     getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
145                                 PreciseRangeLast);
146   }
147   bool isChanged() const { return Changed; }
148   void perform() {
149     WorkList.clear();
150     visit(Func);
151 
152     for (auto &MI : WorkList) {
153       ++NumOfPGOMemOPAnnotate;
154       if (perform(MI)) {
155         Changed = true;
156         ++NumOfPGOMemOPOpt;
157         LLVM_DEBUG(dbgs() << "MemOP call: "
158                           << MI->getCalledFunction()->getName()
159                           << "is Transformed.\n");
160       }
161     }
162   }
163 
164   void visitMemIntrinsic(MemIntrinsic &MI) {
165     Value *Length = MI.getLength();
166     // Not perform on constant length calls.
167     if (dyn_cast<ConstantInt>(Length))
168       return;
169     WorkList.push_back(&MI);
170   }
171 
172 private:
173   Function &Func;
174   BlockFrequencyInfo &BFI;
175   OptimizationRemarkEmitter &ORE;
176   DominatorTree *DT;
177   bool Changed;
178   std::vector<MemIntrinsic *> WorkList;
179   // Start of the previse range.
180   int64_t PreciseRangeStart;
181   // Last value of the previse range.
182   int64_t PreciseRangeLast;
183   // The space to read the profile annotation.
184   std::unique_ptr<InstrProfValueData[]> ValueDataArray;
185   bool perform(MemIntrinsic *MI);
186 
187   // This kind shows which group the value falls in. For PreciseValue, we have
188   // the profile count for that value. LargeGroup groups the values that are in
189   // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
190   enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
191 
192   MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
193     if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
194       return LargeGroup;
195     if (Value == PreciseRangeLast + 1)
196       return NonLargeGroup;
197     return PreciseValue;
198   }
199 };
200 
201 static const char *getMIName(const MemIntrinsic *MI) {
202   switch (MI->getIntrinsicID()) {
203   case Intrinsic::memcpy:
204     return "memcpy";
205   case Intrinsic::memmove:
206     return "memmove";
207   case Intrinsic::memset:
208     return "memset";
209   default:
210     return "unknown";
211   }
212 }
213 
214 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
215   assert(Count <= TotalCount);
216   if (Count < MemOPCountThreshold)
217     return false;
218   if (Count < TotalCount * MemOPPercentThreshold / 100)
219     return false;
220   return true;
221 }
222 
223 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
224                                       uint64_t Denom) {
225   if (!MemOPScaleCount)
226     return Count;
227   bool Overflowed;
228   uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
229   return ScaleCount / Denom;
230 }
231 
232 bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
233   assert(MI);
234   if (MI->getIntrinsicID() == Intrinsic::memmove)
235     return false;
236 
237   uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
238   uint64_t TotalCount;
239   if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
240                                 ValueDataArray.get(), NumVals, TotalCount))
241     return false;
242 
243   uint64_t ActualCount = TotalCount;
244   uint64_t SavedTotalCount = TotalCount;
245   if (MemOPScaleCount) {
246     auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
247     if (!BBEdgeCount)
248       return false;
249     ActualCount = *BBEdgeCount;
250   }
251 
252   ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
253   LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
254                     << ActualCount << "\n");
255   LLVM_DEBUG(
256       for (auto &VD
257            : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
258 
259   if (ActualCount < MemOPCountThreshold)
260     return false;
261   // Skip if the total value profiled count is 0, in which case we can't
262   // scale up the counts properly (and there is no profitable transformation).
263   if (TotalCount == 0)
264     return false;
265 
266   TotalCount = ActualCount;
267   if (MemOPScaleCount)
268     LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
269                       << " denominator = " << SavedTotalCount << "\n");
270 
271   // Keeping track of the count of the default case:
272   uint64_t RemainCount = TotalCount;
273   uint64_t SavedRemainCount = SavedTotalCount;
274   SmallVector<uint64_t, 16> SizeIds;
275   SmallVector<uint64_t, 16> CaseCounts;
276   uint64_t MaxCount = 0;
277   unsigned Version = 0;
278   // Default case is in the front -- save the slot here.
279   CaseCounts.push_back(0);
280   for (auto &VD : VDs) {
281     int64_t V = VD.Value;
282     uint64_t C = VD.Count;
283     if (MemOPScaleCount)
284       C = getScaledCount(C, ActualCount, SavedTotalCount);
285 
286     // Only care precise value here.
287     if (getMemOPSizeKind(V) != PreciseValue)
288       continue;
289 
290     // ValueCounts are sorted on the count. Break at the first un-profitable
291     // value.
292     if (!isProfitable(C, RemainCount))
293       break;
294 
295     SizeIds.push_back(V);
296     CaseCounts.push_back(C);
297     if (C > MaxCount)
298       MaxCount = C;
299 
300     assert(RemainCount >= C);
301     RemainCount -= C;
302     assert(SavedRemainCount >= VD.Count);
303     SavedRemainCount -= VD.Count;
304 
305     if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
306       break;
307   }
308 
309   if (Version == 0)
310     return false;
311 
312   CaseCounts[0] = RemainCount;
313   if (RemainCount > MaxCount)
314     MaxCount = RemainCount;
315 
316   uint64_t SumForOpt = TotalCount - RemainCount;
317 
318   LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
319                     << " Versions (covering " << SumForOpt << " out of "
320                     << TotalCount << ")\n");
321 
322   // mem_op(..., size)
323   // ==>
324   // switch (size) {
325   //   case s1:
326   //      mem_op(..., s1);
327   //      goto merge_bb;
328   //   case s2:
329   //      mem_op(..., s2);
330   //      goto merge_bb;
331   //   ...
332   //   default:
333   //      mem_op(..., size);
334   //      goto merge_bb;
335   // }
336   // merge_bb:
337 
338   BasicBlock *BB = MI->getParent();
339   LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
340   LLVM_DEBUG(dbgs() << *BB << "\n");
341   auto OrigBBFreq = BFI.getBlockFreq(BB);
342 
343   BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
344   BasicBlock::iterator It(*MI);
345   ++It;
346   assert(It != DefaultBB->end());
347   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
348   MergeBB->setName("MemOP.Merge");
349   BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
350   DefaultBB->setName("MemOP.Default");
351 
352   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
353   auto &Ctx = Func.getContext();
354   IRBuilder<> IRB(BB);
355   BB->getTerminator()->eraseFromParent();
356   Value *SizeVar = MI->getLength();
357   SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
358 
359   // Clear the value profile data.
360   MI->setMetadata(LLVMContext::MD_prof, nullptr);
361   // If all promoted, we don't need the MD.prof metadata.
362   if (SavedRemainCount > 0 || Version != NumVals)
363     // Otherwise we need update with the un-promoted records back.
364     annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
365                       SavedRemainCount, IPVK_MemOPSize, NumVals);
366 
367   LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
368 
369   std::vector<DominatorTree::UpdateType> Updates;
370   if (DT)
371     Updates.reserve(2 * SizeIds.size());
372 
373   for (uint64_t SizeId : SizeIds) {
374     BasicBlock *CaseBB = BasicBlock::Create(
375         Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
376     Instruction *NewInst = MI->clone();
377     // Fix the argument.
378     auto *MemI = cast<MemIntrinsic>(NewInst);
379     auto *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType());
380     assert(SizeType && "Expected integer type size argument.");
381     ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
382     MemI->setLength(CaseSizeId);
383     CaseBB->getInstList().push_back(NewInst);
384     IRBuilder<> IRBCase(CaseBB);
385     IRBCase.CreateBr(MergeBB);
386     SI->addCase(CaseSizeId, CaseBB);
387     if (DT) {
388       Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
389       Updates.push_back({DominatorTree::Insert, BB, CaseBB});
390     }
391     LLVM_DEBUG(dbgs() << *CaseBB << "\n");
392   }
393   DTU.applyUpdates(Updates);
394   Updates.clear();
395 
396   setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
397 
398   LLVM_DEBUG(dbgs() << *BB << "\n");
399   LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
400   LLVM_DEBUG(dbgs() << *MergeBB << "\n");
401 
402   ORE.emit([&]() {
403     using namespace ore;
404     return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI)
405              << "optimized " << NV("Intrinsic", StringRef(getMIName(MI)))
406              << " with count " << NV("Count", SumForOpt) << " out of "
407              << NV("Total", TotalCount) << " for " << NV("Versions", Version)
408              << " versions";
409   });
410 
411   return true;
412 }
413 } // namespace
414 
415 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
416                                 OptimizationRemarkEmitter &ORE,
417                                 DominatorTree *DT) {
418   if (DisableMemOPOPT)
419     return false;
420 
421   if (F.hasFnAttribute(Attribute::OptimizeForSize))
422     return false;
423   MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
424   MemOPSizeOpt.perform();
425   return MemOPSizeOpt.isChanged();
426 }
427 
428 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
429   BlockFrequencyInfo &BFI =
430       getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
431   auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
432   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
433   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
434   return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
435 }
436 
437 namespace llvm {
438 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
439 
440 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
441                                        FunctionAnalysisManager &FAM) {
442   auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
443   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
444   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
445   bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
446   if (!Changed)
447     return PreservedAnalyses::all();
448   auto PA = PreservedAnalyses();
449   PA.preserve<GlobalsAA>();
450   PA.preserve<DominatorTreeAnalysis>();
451   return PA;
452 }
453 } // namespace llvm
454