1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/GlobalsModRef.h" 24 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/CallSite.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/InstVisitor.h" 32 #include "llvm/IR/InstrTypes.h" 33 #include "llvm/IR/Instruction.h" 34 #include "llvm/IR/Instructions.h" 35 #include "llvm/IR/LLVMContext.h" 36 #include "llvm/IR/PassManager.h" 37 #include "llvm/IR/Type.h" 38 #include "llvm/InitializePasses.h" 39 #include "llvm/Pass.h" 40 #include "llvm/PassRegistry.h" 41 #include "llvm/PassSupport.h" 42 #include "llvm/ProfileData/InstrProf.h" 43 #include "llvm/Support/Casting.h" 44 #include "llvm/Support/CommandLine.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/MathExtras.h" 48 #include "llvm/Transforms/Instrumentation.h" 49 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 50 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 51 #include <cassert> 52 #include <cstdint> 53 #include <vector> 54 55 using namespace llvm; 56 57 #define DEBUG_TYPE "pgo-memop-opt" 58 59 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 60 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 61 62 // The minimum call count to optimize memory intrinsic calls. 63 static cl::opt<unsigned> 64 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 65 cl::init(1000), 66 cl::desc("The minimum count to optimize memory " 67 "intrinsic calls")); 68 69 // Command line option to disable memory intrinsic optimization. The default is 70 // false. This is for debug purpose. 71 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 72 cl::Hidden, cl::desc("Disable optimize")); 73 74 // The percent threshold to optimize memory intrinsic calls. 75 static cl::opt<unsigned> 76 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 77 cl::Hidden, cl::ZeroOrMore, 78 cl::desc("The percentage threshold for the " 79 "memory intrinsic calls optimization")); 80 81 // Maximum number of versions for optimizing memory intrinsic call. 82 static cl::opt<unsigned> 83 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 84 cl::ZeroOrMore, 85 cl::desc("The max version for the optimized memory " 86 " intrinsic calls")); 87 88 // Scale the counts from the annotation using the BB count value. 89 static cl::opt<bool> 90 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 91 cl::desc("Scale the memop size counts using the basic " 92 " block count value")); 93 94 // This option sets the rangge of precise profile memop sizes. 95 extern cl::opt<std::string> MemOPSizeRange; 96 97 // This option sets the value that groups large memop sizes 98 extern cl::opt<unsigned> MemOPSizeLarge; 99 100 namespace { 101 class PGOMemOPSizeOptLegacyPass : public FunctionPass { 102 public: 103 static char ID; 104 105 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { 106 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); 107 } 108 109 StringRef getPassName() const override { return "PGOMemOPSize"; } 110 111 private: 112 bool runOnFunction(Function &F) override; 113 void getAnalysisUsage(AnalysisUsage &AU) const override { 114 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 115 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 116 AU.addPreserved<GlobalsAAWrapperPass>(); 117 AU.addPreserved<DominatorTreeWrapperPass>(); 118 } 119 }; 120 } // end anonymous namespace 121 122 char PGOMemOPSizeOptLegacyPass::ID = 0; 123 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 124 "Optimize memory intrinsic using its size value profile", 125 false, false) 126 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 127 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 128 "Optimize memory intrinsic using its size value profile", 129 false, false) 130 131 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { 132 return new PGOMemOPSizeOptLegacyPass(); 133 } 134 135 namespace { 136 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 137 public: 138 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 139 OptimizationRemarkEmitter &ORE, DominatorTree *DT) 140 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) { 141 ValueDataArray = 142 std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); 143 // Get the MemOPSize range information from option MemOPSizeRange, 144 getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, 145 PreciseRangeLast); 146 } 147 bool isChanged() const { return Changed; } 148 void perform() { 149 WorkList.clear(); 150 visit(Func); 151 152 for (auto &MI : WorkList) { 153 ++NumOfPGOMemOPAnnotate; 154 if (perform(MI)) { 155 Changed = true; 156 ++NumOfPGOMemOPOpt; 157 LLVM_DEBUG(dbgs() << "MemOP call: " 158 << MI->getCalledFunction()->getName() 159 << "is Transformed.\n"); 160 } 161 } 162 } 163 164 void visitMemIntrinsic(MemIntrinsic &MI) { 165 Value *Length = MI.getLength(); 166 // Not perform on constant length calls. 167 if (dyn_cast<ConstantInt>(Length)) 168 return; 169 WorkList.push_back(&MI); 170 } 171 172 private: 173 Function &Func; 174 BlockFrequencyInfo &BFI; 175 OptimizationRemarkEmitter &ORE; 176 DominatorTree *DT; 177 bool Changed; 178 std::vector<MemIntrinsic *> WorkList; 179 // Start of the previse range. 180 int64_t PreciseRangeStart; 181 // Last value of the previse range. 182 int64_t PreciseRangeLast; 183 // The space to read the profile annotation. 184 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 185 bool perform(MemIntrinsic *MI); 186 187 // This kind shows which group the value falls in. For PreciseValue, we have 188 // the profile count for that value. LargeGroup groups the values that are in 189 // range [LargeValue, +inf). NonLargeGroup groups the rest of values. 190 enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup }; 191 192 MemOPSizeKind getMemOPSizeKind(int64_t Value) const { 193 if (Value == MemOPSizeLarge && MemOPSizeLarge != 0) 194 return LargeGroup; 195 if (Value == PreciseRangeLast + 1) 196 return NonLargeGroup; 197 return PreciseValue; 198 } 199 }; 200 201 static const char *getMIName(const MemIntrinsic *MI) { 202 switch (MI->getIntrinsicID()) { 203 case Intrinsic::memcpy: 204 return "memcpy"; 205 case Intrinsic::memmove: 206 return "memmove"; 207 case Intrinsic::memset: 208 return "memset"; 209 default: 210 return "unknown"; 211 } 212 } 213 214 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 215 assert(Count <= TotalCount); 216 if (Count < MemOPCountThreshold) 217 return false; 218 if (Count < TotalCount * MemOPPercentThreshold / 100) 219 return false; 220 return true; 221 } 222 223 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 224 uint64_t Denom) { 225 if (!MemOPScaleCount) 226 return Count; 227 bool Overflowed; 228 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 229 return ScaleCount / Denom; 230 } 231 232 bool MemOPSizeOpt::perform(MemIntrinsic *MI) { 233 assert(MI); 234 if (MI->getIntrinsicID() == Intrinsic::memmove) 235 return false; 236 237 uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; 238 uint64_t TotalCount; 239 if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, 240 ValueDataArray.get(), NumVals, TotalCount)) 241 return false; 242 243 uint64_t ActualCount = TotalCount; 244 uint64_t SavedTotalCount = TotalCount; 245 if (MemOPScaleCount) { 246 auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); 247 if (!BBEdgeCount) 248 return false; 249 ActualCount = *BBEdgeCount; 250 } 251 252 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 253 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 254 << ActualCount << "\n"); 255 LLVM_DEBUG( 256 for (auto &VD 257 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 258 259 if (ActualCount < MemOPCountThreshold) 260 return false; 261 // Skip if the total value profiled count is 0, in which case we can't 262 // scale up the counts properly (and there is no profitable transformation). 263 if (TotalCount == 0) 264 return false; 265 266 TotalCount = ActualCount; 267 if (MemOPScaleCount) 268 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 269 << " denominator = " << SavedTotalCount << "\n"); 270 271 // Keeping track of the count of the default case: 272 uint64_t RemainCount = TotalCount; 273 uint64_t SavedRemainCount = SavedTotalCount; 274 SmallVector<uint64_t, 16> SizeIds; 275 SmallVector<uint64_t, 16> CaseCounts; 276 uint64_t MaxCount = 0; 277 unsigned Version = 0; 278 // Default case is in the front -- save the slot here. 279 CaseCounts.push_back(0); 280 for (auto &VD : VDs) { 281 int64_t V = VD.Value; 282 uint64_t C = VD.Count; 283 if (MemOPScaleCount) 284 C = getScaledCount(C, ActualCount, SavedTotalCount); 285 286 // Only care precise value here. 287 if (getMemOPSizeKind(V) != PreciseValue) 288 continue; 289 290 // ValueCounts are sorted on the count. Break at the first un-profitable 291 // value. 292 if (!isProfitable(C, RemainCount)) 293 break; 294 295 SizeIds.push_back(V); 296 CaseCounts.push_back(C); 297 if (C > MaxCount) 298 MaxCount = C; 299 300 assert(RemainCount >= C); 301 RemainCount -= C; 302 assert(SavedRemainCount >= VD.Count); 303 SavedRemainCount -= VD.Count; 304 305 if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) 306 break; 307 } 308 309 if (Version == 0) 310 return false; 311 312 CaseCounts[0] = RemainCount; 313 if (RemainCount > MaxCount) 314 MaxCount = RemainCount; 315 316 uint64_t SumForOpt = TotalCount - RemainCount; 317 318 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 319 << " Versions (covering " << SumForOpt << " out of " 320 << TotalCount << ")\n"); 321 322 // mem_op(..., size) 323 // ==> 324 // switch (size) { 325 // case s1: 326 // mem_op(..., s1); 327 // goto merge_bb; 328 // case s2: 329 // mem_op(..., s2); 330 // goto merge_bb; 331 // ... 332 // default: 333 // mem_op(..., size); 334 // goto merge_bb; 335 // } 336 // merge_bb: 337 338 BasicBlock *BB = MI->getParent(); 339 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 340 LLVM_DEBUG(dbgs() << *BB << "\n"); 341 auto OrigBBFreq = BFI.getBlockFreq(BB); 342 343 BasicBlock *DefaultBB = SplitBlock(BB, MI, DT); 344 BasicBlock::iterator It(*MI); 345 ++It; 346 assert(It != DefaultBB->end()); 347 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 348 MergeBB->setName("MemOP.Merge"); 349 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 350 DefaultBB->setName("MemOP.Default"); 351 352 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 353 auto &Ctx = Func.getContext(); 354 IRBuilder<> IRB(BB); 355 BB->getTerminator()->eraseFromParent(); 356 Value *SizeVar = MI->getLength(); 357 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 358 359 // Clear the value profile data. 360 MI->setMetadata(LLVMContext::MD_prof, nullptr); 361 // If all promoted, we don't need the MD.prof metadata. 362 if (SavedRemainCount > 0 || Version != NumVals) 363 // Otherwise we need update with the un-promoted records back. 364 annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version), 365 SavedRemainCount, IPVK_MemOPSize, NumVals); 366 367 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 368 369 std::vector<DominatorTree::UpdateType> Updates; 370 if (DT) 371 Updates.reserve(2 * SizeIds.size()); 372 373 for (uint64_t SizeId : SizeIds) { 374 BasicBlock *CaseBB = BasicBlock::Create( 375 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 376 Instruction *NewInst = MI->clone(); 377 // Fix the argument. 378 auto *MemI = cast<MemIntrinsic>(NewInst); 379 auto *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType()); 380 assert(SizeType && "Expected integer type size argument."); 381 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 382 MemI->setLength(CaseSizeId); 383 CaseBB->getInstList().push_back(NewInst); 384 IRBuilder<> IRBCase(CaseBB); 385 IRBCase.CreateBr(MergeBB); 386 SI->addCase(CaseSizeId, CaseBB); 387 if (DT) { 388 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 389 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 390 } 391 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 392 } 393 DTU.applyUpdates(Updates); 394 Updates.clear(); 395 396 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 397 398 LLVM_DEBUG(dbgs() << *BB << "\n"); 399 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 400 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 401 402 ORE.emit([&]() { 403 using namespace ore; 404 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI) 405 << "optimized " << NV("Intrinsic", StringRef(getMIName(MI))) 406 << " with count " << NV("Count", SumForOpt) << " out of " 407 << NV("Total", TotalCount) << " for " << NV("Versions", Version) 408 << " versions"; 409 }); 410 411 return true; 412 } 413 } // namespace 414 415 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 416 OptimizationRemarkEmitter &ORE, 417 DominatorTree *DT) { 418 if (DisableMemOPOPT) 419 return false; 420 421 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 422 return false; 423 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT); 424 MemOPSizeOpt.perform(); 425 return MemOPSizeOpt.isChanged(); 426 } 427 428 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { 429 BlockFrequencyInfo &BFI = 430 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 431 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 432 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 433 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; 434 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT); 435 } 436 437 namespace llvm { 438 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; 439 440 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 441 FunctionAnalysisManager &FAM) { 442 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 443 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 444 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 445 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT); 446 if (!Changed) 447 return PreservedAnalyses::all(); 448 auto PA = PreservedAnalyses(); 449 PA.preserve<GlobalsAA>(); 450 PA.preserve<DominatorTreeAnalysis>(); 451 return PA; 452 } 453 } // namespace llvm 454