1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/Analysis/TargetLibraryInfo.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Dominators.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/IR/IRBuilder.h"
30 #include "llvm/IR/InstVisitor.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/LLVMContext.h"
34 #include "llvm/IR/PassManager.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/ProfileData/InstrProf.h"
37 #define INSTR_PROF_VALUE_PROF_MEMOP_API
38 #include "llvm/ProfileData/InstrProfData.inc"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/ErrorHandling.h"
43 #include "llvm/Support/MathExtras.h"
44 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
45 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
46 #include <cassert>
47 #include <cstdint>
48 #include <vector>
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "pgo-memop-opt"
53 
54 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
55 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
56 
57 // The minimum call count to optimize memory intrinsic calls.
58 static cl::opt<unsigned>
59     MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000),
60                         cl::desc("The minimum count to optimize memory "
61                                  "intrinsic calls"));
62 
63 // Command line option to disable memory intrinsic optimization. The default is
64 // false. This is for debug purpose.
65 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
66                                      cl::Hidden, cl::desc("Disable optimize"));
67 
68 // The percent threshold to optimize memory intrinsic calls.
69 static cl::opt<unsigned>
70     MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
71                           cl::Hidden,
72                           cl::desc("The percentage threshold for the "
73                                    "memory intrinsic calls optimization"));
74 
75 // Maximum number of versions for optimizing memory intrinsic call.
76 static cl::opt<unsigned>
77     MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
78                     cl::desc("The max version for the optimized memory "
79                              " intrinsic calls"));
80 
81 // Scale the counts from the annotation using the BB count value.
82 static cl::opt<bool>
83     MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
84                     cl::desc("Scale the memop size counts using the basic "
85                              " block count value"));
86 
87 cl::opt<bool>
88     MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
89                        cl::Hidden,
90                        cl::desc("Size-specialize memcmp and bcmp calls"));
91 
92 static cl::opt<unsigned>
93     MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
94                     cl::desc("Optimize the memop size <= this value"));
95 
96 namespace {
97 
98 static const char *getMIName(const MemIntrinsic *MI) {
99   switch (MI->getIntrinsicID()) {
100   case Intrinsic::memcpy:
101     return "memcpy";
102   case Intrinsic::memmove:
103     return "memmove";
104   case Intrinsic::memset:
105     return "memset";
106   default:
107     return "unknown";
108   }
109 }
110 
111 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
112 struct MemOp {
113   Instruction *I;
114   MemOp(MemIntrinsic *MI) : I(MI) {}
115   MemOp(CallInst *CI) : I(CI) {}
116   MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
117   CallInst *asCI() { return cast<CallInst>(I); }
118   MemOp clone() {
119     if (auto MI = asMI())
120       return MemOp(cast<MemIntrinsic>(MI->clone()));
121     return MemOp(cast<CallInst>(asCI()->clone()));
122   }
123   Value *getLength() {
124     if (auto MI = asMI())
125       return MI->getLength();
126     return asCI()->getArgOperand(2);
127   }
128   void setLength(Value *Length) {
129     if (auto MI = asMI())
130       return MI->setLength(Length);
131     asCI()->setArgOperand(2, Length);
132   }
133   StringRef getFuncName() {
134     if (auto MI = asMI())
135       return MI->getCalledFunction()->getName();
136     return asCI()->getCalledFunction()->getName();
137   }
138   bool isMemmove() {
139     if (auto MI = asMI())
140       if (MI->getIntrinsicID() == Intrinsic::memmove)
141         return true;
142     return false;
143   }
144   bool isMemcmp(TargetLibraryInfo &TLI) {
145     LibFunc Func;
146     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
147         Func == LibFunc_memcmp) {
148       return true;
149     }
150     return false;
151   }
152   bool isBcmp(TargetLibraryInfo &TLI) {
153     LibFunc Func;
154     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
155         Func == LibFunc_bcmp) {
156       return true;
157     }
158     return false;
159   }
160   const char *getName(TargetLibraryInfo &TLI) {
161     if (auto MI = asMI())
162       return getMIName(MI);
163     LibFunc Func;
164     if (TLI.getLibFunc(*asCI(), Func)) {
165       if (Func == LibFunc_memcmp)
166         return "memcmp";
167       if (Func == LibFunc_bcmp)
168         return "bcmp";
169     }
170     llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
171     return nullptr;
172   }
173 };
174 
175 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
176 public:
177   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
178                OptimizationRemarkEmitter &ORE, DominatorTree *DT,
179                TargetLibraryInfo &TLI)
180       : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
181     ValueDataArray =
182         std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS);
183   }
184   bool isChanged() const { return Changed; }
185   void perform() {
186     WorkList.clear();
187     visit(Func);
188 
189     for (auto &MO : WorkList) {
190       ++NumOfPGOMemOPAnnotate;
191       if (perform(MO)) {
192         Changed = true;
193         ++NumOfPGOMemOPOpt;
194         LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
195                           << "is Transformed.\n");
196       }
197     }
198   }
199 
200   void visitMemIntrinsic(MemIntrinsic &MI) {
201     Value *Length = MI.getLength();
202     // Not perform on constant length calls.
203     if (isa<ConstantInt>(Length))
204       return;
205     WorkList.push_back(MemOp(&MI));
206   }
207 
208   void visitCallInst(CallInst &CI) {
209     LibFunc Func;
210     if (TLI.getLibFunc(CI, Func) &&
211         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
212         !isa<ConstantInt>(CI.getArgOperand(2))) {
213       WorkList.push_back(MemOp(&CI));
214     }
215   }
216 
217 private:
218   Function &Func;
219   BlockFrequencyInfo &BFI;
220   OptimizationRemarkEmitter &ORE;
221   DominatorTree *DT;
222   TargetLibraryInfo &TLI;
223   bool Changed;
224   std::vector<MemOp> WorkList;
225   // The space to read the profile annotation.
226   std::unique_ptr<InstrProfValueData[]> ValueDataArray;
227   bool perform(MemOp MO);
228 };
229 
230 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
231   assert(Count <= TotalCount);
232   if (Count < MemOPCountThreshold)
233     return false;
234   if (Count < TotalCount * MemOPPercentThreshold / 100)
235     return false;
236   return true;
237 }
238 
239 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
240                                       uint64_t Denom) {
241   if (!MemOPScaleCount)
242     return Count;
243   bool Overflowed;
244   uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
245   return ScaleCount / Denom;
246 }
247 
248 bool MemOPSizeOpt::perform(MemOp MO) {
249   assert(MO.I);
250   if (MO.isMemmove())
251     return false;
252   if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
253     return false;
254 
255   uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS;
256   uint64_t TotalCount;
257   if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals,
258                                 ValueDataArray.get(), NumVals, TotalCount))
259     return false;
260 
261   uint64_t ActualCount = TotalCount;
262   uint64_t SavedTotalCount = TotalCount;
263   if (MemOPScaleCount) {
264     auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
265     if (!BBEdgeCount)
266       return false;
267     ActualCount = *BBEdgeCount;
268   }
269 
270   ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
271   LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
272                     << ActualCount << "\n");
273   LLVM_DEBUG(
274       for (auto &VD
275            : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
276 
277   if (ActualCount < MemOPCountThreshold)
278     return false;
279   // Skip if the total value profiled count is 0, in which case we can't
280   // scale up the counts properly (and there is no profitable transformation).
281   if (TotalCount == 0)
282     return false;
283 
284   TotalCount = ActualCount;
285   if (MemOPScaleCount)
286     LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
287                       << " denominator = " << SavedTotalCount << "\n");
288 
289   // Keeping track of the count of the default case:
290   uint64_t RemainCount = TotalCount;
291   uint64_t SavedRemainCount = SavedTotalCount;
292   SmallVector<uint64_t, 16> SizeIds;
293   SmallVector<uint64_t, 16> CaseCounts;
294   SmallDenseSet<uint64_t, 16> SeenSizeId;
295   uint64_t MaxCount = 0;
296   unsigned Version = 0;
297   // Default case is in the front -- save the slot here.
298   CaseCounts.push_back(0);
299   SmallVector<InstrProfValueData, 24> RemainingVDs;
300   for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
301     auto &VD = *I;
302     int64_t V = VD.Value;
303     uint64_t C = VD.Count;
304     if (MemOPScaleCount)
305       C = getScaledCount(C, ActualCount, SavedTotalCount);
306 
307     if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) {
308       RemainingVDs.push_back(VD);
309       continue;
310     }
311 
312     // ValueCounts are sorted on the count. Break at the first un-profitable
313     // value.
314     if (!isProfitable(C, RemainCount)) {
315       RemainingVDs.insert(RemainingVDs.end(), I, E);
316       break;
317     }
318 
319     if (!SeenSizeId.insert(V).second) {
320       errs() << "warning: Invalid Profile Data in Function " << Func.getName()
321              << ": Two identical values in MemOp value counts.\n";
322       return false;
323     }
324 
325     SizeIds.push_back(V);
326     CaseCounts.push_back(C);
327     if (C > MaxCount)
328       MaxCount = C;
329 
330     assert(RemainCount >= C);
331     RemainCount -= C;
332     assert(SavedRemainCount >= VD.Count);
333     SavedRemainCount -= VD.Count;
334 
335     if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
336       RemainingVDs.insert(RemainingVDs.end(), I + 1, E);
337       break;
338     }
339   }
340 
341   if (Version == 0)
342     return false;
343 
344   CaseCounts[0] = RemainCount;
345   if (RemainCount > MaxCount)
346     MaxCount = RemainCount;
347 
348   uint64_t SumForOpt = TotalCount - RemainCount;
349 
350   LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
351                     << " Versions (covering " << SumForOpt << " out of "
352                     << TotalCount << ")\n");
353 
354   // mem_op(..., size)
355   // ==>
356   // switch (size) {
357   //   case s1:
358   //      mem_op(..., s1);
359   //      goto merge_bb;
360   //   case s2:
361   //      mem_op(..., s2);
362   //      goto merge_bb;
363   //   ...
364   //   default:
365   //      mem_op(..., size);
366   //      goto merge_bb;
367   // }
368   // merge_bb:
369 
370   BasicBlock *BB = MO.I->getParent();
371   LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
372   LLVM_DEBUG(dbgs() << *BB << "\n");
373   auto OrigBBFreq = BFI.getBlockFreq(BB);
374 
375   BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
376   BasicBlock::iterator It(*MO.I);
377   ++It;
378   assert(It != DefaultBB->end());
379   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
380   MergeBB->setName("MemOP.Merge");
381   BFI.setBlockFreq(MergeBB, OrigBBFreq);
382   DefaultBB->setName("MemOP.Default");
383 
384   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
385   auto &Ctx = Func.getContext();
386   IRBuilder<> IRB(BB);
387   BB->getTerminator()->eraseFromParent();
388   Value *SizeVar = MO.getLength();
389   SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
390   Type *MemOpTy = MO.I->getType();
391   PHINode *PHI = nullptr;
392   if (!MemOpTy->isVoidTy()) {
393     // Insert a phi for the return values at the merge block.
394     IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
395     PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
396     MO.I->replaceAllUsesWith(PHI);
397     PHI->addIncoming(MO.I, DefaultBB);
398   }
399 
400   // Clear the value profile data.
401   MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
402   // If all promoted, we don't need the MD.prof metadata.
403   if (SavedRemainCount > 0 || Version != NumVals) {
404     // Otherwise we need update with the un-promoted records back.
405     ArrayRef<InstrProfValueData> RemVDs(RemainingVDs);
406     annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount,
407                       IPVK_MemOPSize, NumVals);
408   }
409 
410   LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
411 
412   std::vector<DominatorTree::UpdateType> Updates;
413   if (DT)
414     Updates.reserve(2 * SizeIds.size());
415 
416   for (uint64_t SizeId : SizeIds) {
417     BasicBlock *CaseBB = BasicBlock::Create(
418         Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
419     MemOp NewMO = MO.clone();
420     // Fix the argument.
421     auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
422     assert(SizeType && "Expected integer type size argument.");
423     ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
424     NewMO.setLength(CaseSizeId);
425     NewMO.I->insertInto(CaseBB, CaseBB->end());
426     IRBuilder<> IRBCase(CaseBB);
427     IRBCase.CreateBr(MergeBB);
428     SI->addCase(CaseSizeId, CaseBB);
429     if (!MemOpTy->isVoidTy())
430       PHI->addIncoming(NewMO.I, CaseBB);
431     if (DT) {
432       Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
433       Updates.push_back({DominatorTree::Insert, BB, CaseBB});
434     }
435     LLVM_DEBUG(dbgs() << *CaseBB << "\n");
436   }
437   DTU.applyUpdates(Updates);
438   Updates.clear();
439 
440   if (MaxCount)
441     setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
442 
443   LLVM_DEBUG(dbgs() << *BB << "\n");
444   LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
445   LLVM_DEBUG(dbgs() << *MergeBB << "\n");
446 
447   ORE.emit([&]() {
448     using namespace ore;
449     return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
450            << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
451            << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
452            << " for " << NV("Versions", Version) << " versions";
453   });
454 
455   return true;
456 }
457 } // namespace
458 
459 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
460                                 OptimizationRemarkEmitter &ORE,
461                                 DominatorTree *DT, TargetLibraryInfo &TLI) {
462   if (DisableMemOPOPT)
463     return false;
464 
465   if (F.hasFnAttribute(Attribute::OptimizeForSize))
466     return false;
467   MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
468   MemOPSizeOpt.perform();
469   return MemOPSizeOpt.isChanged();
470 }
471 
472 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
473                                        FunctionAnalysisManager &FAM) {
474   auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
475   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
476   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
477   auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
478   bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
479   if (!Changed)
480     return PreservedAnalyses::all();
481   auto PA = PreservedAnalyses();
482   PA.preserve<DominatorTreeAnalysis>();
483   return PA;
484 }
485