1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 24 #include "llvm/Analysis/TargetLibraryInfo.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/DerivedTypes.h" 27 #include "llvm/IR/Dominators.h" 28 #include "llvm/IR/Function.h" 29 #include "llvm/IR/IRBuilder.h" 30 #include "llvm/IR/InstVisitor.h" 31 #include "llvm/IR/Instruction.h" 32 #include "llvm/IR/Instructions.h" 33 #include "llvm/IR/LLVMContext.h" 34 #include "llvm/IR/PassManager.h" 35 #include "llvm/IR/Type.h" 36 #include "llvm/ProfileData/InstrProf.h" 37 #define INSTR_PROF_VALUE_PROF_MEMOP_API 38 #include "llvm/ProfileData/InstrProfData.inc" 39 #include "llvm/Support/Casting.h" 40 #include "llvm/Support/CommandLine.h" 41 #include "llvm/Support/Debug.h" 42 #include "llvm/Support/ErrorHandling.h" 43 #include "llvm/Support/MathExtras.h" 44 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 45 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 46 #include <cassert> 47 #include <cstdint> 48 #include <vector> 49 50 using namespace llvm; 51 52 #define DEBUG_TYPE "pgo-memop-opt" 53 54 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 55 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 56 57 // The minimum call count to optimize memory intrinsic calls. 58 static cl::opt<unsigned> 59 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000), 60 cl::desc("The minimum count to optimize memory " 61 "intrinsic calls")); 62 63 // Command line option to disable memory intrinsic optimization. The default is 64 // false. This is for debug purpose. 65 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 66 cl::Hidden, cl::desc("Disable optimize")); 67 68 // The percent threshold to optimize memory intrinsic calls. 69 static cl::opt<unsigned> 70 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 71 cl::Hidden, 72 cl::desc("The percentage threshold for the " 73 "memory intrinsic calls optimization")); 74 75 // Maximum number of versions for optimizing memory intrinsic call. 76 static cl::opt<unsigned> 77 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 78 cl::desc("The max version for the optimized memory " 79 " intrinsic calls")); 80 81 // Scale the counts from the annotation using the BB count value. 82 static cl::opt<bool> 83 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 84 cl::desc("Scale the memop size counts using the basic " 85 " block count value")); 86 87 cl::opt<bool> 88 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), 89 cl::Hidden, 90 cl::desc("Size-specialize memcmp and bcmp calls")); 91 92 static cl::opt<unsigned> 93 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), 94 cl::desc("Optimize the memop size <= this value")); 95 96 namespace { 97 98 static const char *getMIName(const MemIntrinsic *MI) { 99 switch (MI->getIntrinsicID()) { 100 case Intrinsic::memcpy: 101 return "memcpy"; 102 case Intrinsic::memmove: 103 return "memmove"; 104 case Intrinsic::memset: 105 return "memset"; 106 default: 107 return "unknown"; 108 } 109 } 110 111 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). 112 struct MemOp { 113 Instruction *I; 114 MemOp(MemIntrinsic *MI) : I(MI) {} 115 MemOp(CallInst *CI) : I(CI) {} 116 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); } 117 CallInst *asCI() { return cast<CallInst>(I); } 118 MemOp clone() { 119 if (auto MI = asMI()) 120 return MemOp(cast<MemIntrinsic>(MI->clone())); 121 return MemOp(cast<CallInst>(asCI()->clone())); 122 } 123 Value *getLength() { 124 if (auto MI = asMI()) 125 return MI->getLength(); 126 return asCI()->getArgOperand(2); 127 } 128 void setLength(Value *Length) { 129 if (auto MI = asMI()) 130 return MI->setLength(Length); 131 asCI()->setArgOperand(2, Length); 132 } 133 StringRef getFuncName() { 134 if (auto MI = asMI()) 135 return MI->getCalledFunction()->getName(); 136 return asCI()->getCalledFunction()->getName(); 137 } 138 bool isMemmove() { 139 if (auto MI = asMI()) 140 if (MI->getIntrinsicID() == Intrinsic::memmove) 141 return true; 142 return false; 143 } 144 bool isMemcmp(TargetLibraryInfo &TLI) { 145 LibFunc Func; 146 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 147 Func == LibFunc_memcmp) { 148 return true; 149 } 150 return false; 151 } 152 bool isBcmp(TargetLibraryInfo &TLI) { 153 LibFunc Func; 154 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 155 Func == LibFunc_bcmp) { 156 return true; 157 } 158 return false; 159 } 160 const char *getName(TargetLibraryInfo &TLI) { 161 if (auto MI = asMI()) 162 return getMIName(MI); 163 LibFunc Func; 164 if (TLI.getLibFunc(*asCI(), Func)) { 165 if (Func == LibFunc_memcmp) 166 return "memcmp"; 167 if (Func == LibFunc_bcmp) 168 return "bcmp"; 169 } 170 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); 171 return nullptr; 172 } 173 }; 174 175 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 176 public: 177 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 178 OptimizationRemarkEmitter &ORE, DominatorTree *DT, 179 TargetLibraryInfo &TLI) 180 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { 181 ValueDataArray = 182 std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS); 183 } 184 bool isChanged() const { return Changed; } 185 void perform() { 186 WorkList.clear(); 187 visit(Func); 188 189 for (auto &MO : WorkList) { 190 ++NumOfPGOMemOPAnnotate; 191 if (perform(MO)) { 192 Changed = true; 193 ++NumOfPGOMemOPOpt; 194 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() 195 << "is Transformed.\n"); 196 } 197 } 198 } 199 200 void visitMemIntrinsic(MemIntrinsic &MI) { 201 Value *Length = MI.getLength(); 202 // Not perform on constant length calls. 203 if (isa<ConstantInt>(Length)) 204 return; 205 WorkList.push_back(MemOp(&MI)); 206 } 207 208 void visitCallInst(CallInst &CI) { 209 LibFunc Func; 210 if (TLI.getLibFunc(CI, Func) && 211 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && 212 !isa<ConstantInt>(CI.getArgOperand(2))) { 213 WorkList.push_back(MemOp(&CI)); 214 } 215 } 216 217 private: 218 Function &Func; 219 BlockFrequencyInfo &BFI; 220 OptimizationRemarkEmitter &ORE; 221 DominatorTree *DT; 222 TargetLibraryInfo &TLI; 223 bool Changed; 224 std::vector<MemOp> WorkList; 225 // The space to read the profile annotation. 226 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 227 bool perform(MemOp MO); 228 }; 229 230 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 231 assert(Count <= TotalCount); 232 if (Count < MemOPCountThreshold) 233 return false; 234 if (Count < TotalCount * MemOPPercentThreshold / 100) 235 return false; 236 return true; 237 } 238 239 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 240 uint64_t Denom) { 241 if (!MemOPScaleCount) 242 return Count; 243 bool Overflowed; 244 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 245 return ScaleCount / Denom; 246 } 247 248 bool MemOPSizeOpt::perform(MemOp MO) { 249 assert(MO.I); 250 if (MO.isMemmove()) 251 return false; 252 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) 253 return false; 254 255 uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS; 256 uint64_t TotalCount; 257 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, 258 ValueDataArray.get(), NumVals, TotalCount)) 259 return false; 260 261 uint64_t ActualCount = TotalCount; 262 uint64_t SavedTotalCount = TotalCount; 263 if (MemOPScaleCount) { 264 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); 265 if (!BBEdgeCount) 266 return false; 267 ActualCount = *BBEdgeCount; 268 } 269 270 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 271 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 272 << ActualCount << "\n"); 273 LLVM_DEBUG( 274 for (auto &VD 275 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 276 277 if (ActualCount < MemOPCountThreshold) 278 return false; 279 // Skip if the total value profiled count is 0, in which case we can't 280 // scale up the counts properly (and there is no profitable transformation). 281 if (TotalCount == 0) 282 return false; 283 284 TotalCount = ActualCount; 285 if (MemOPScaleCount) 286 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 287 << " denominator = " << SavedTotalCount << "\n"); 288 289 // Keeping track of the count of the default case: 290 uint64_t RemainCount = TotalCount; 291 uint64_t SavedRemainCount = SavedTotalCount; 292 SmallVector<uint64_t, 16> SizeIds; 293 SmallVector<uint64_t, 16> CaseCounts; 294 uint64_t MaxCount = 0; 295 unsigned Version = 0; 296 int64_t LastV = -1; 297 // Default case is in the front -- save the slot here. 298 CaseCounts.push_back(0); 299 SmallVector<InstrProfValueData, 24> RemainingVDs; 300 for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { 301 auto &VD = *I; 302 int64_t V = VD.Value; 303 uint64_t C = VD.Count; 304 if (MemOPScaleCount) 305 C = getScaledCount(C, ActualCount, SavedTotalCount); 306 307 if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) { 308 RemainingVDs.push_back(VD); 309 continue; 310 } 311 312 // ValueCounts are sorted on the count. Break at the first un-profitable 313 // value. 314 if (!isProfitable(C, RemainCount)) { 315 RemainingVDs.insert(RemainingVDs.end(), I, E); 316 break; 317 } 318 319 if (V == LastV) { 320 LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName() 321 << ": Two consecutive, identical values in MemOp value" 322 "counts.\n"); 323 return false; 324 } 325 326 LastV = V; 327 328 SizeIds.push_back(V); 329 CaseCounts.push_back(C); 330 if (C > MaxCount) 331 MaxCount = C; 332 333 assert(RemainCount >= C); 334 RemainCount -= C; 335 assert(SavedRemainCount >= VD.Count); 336 SavedRemainCount -= VD.Count; 337 338 if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { 339 RemainingVDs.insert(RemainingVDs.end(), I + 1, E); 340 break; 341 } 342 } 343 344 if (Version == 0) 345 return false; 346 347 CaseCounts[0] = RemainCount; 348 if (RemainCount > MaxCount) 349 MaxCount = RemainCount; 350 351 uint64_t SumForOpt = TotalCount - RemainCount; 352 353 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 354 << " Versions (covering " << SumForOpt << " out of " 355 << TotalCount << ")\n"); 356 357 // mem_op(..., size) 358 // ==> 359 // switch (size) { 360 // case s1: 361 // mem_op(..., s1); 362 // goto merge_bb; 363 // case s2: 364 // mem_op(..., s2); 365 // goto merge_bb; 366 // ... 367 // default: 368 // mem_op(..., size); 369 // goto merge_bb; 370 // } 371 // merge_bb: 372 373 BasicBlock *BB = MO.I->getParent(); 374 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 375 LLVM_DEBUG(dbgs() << *BB << "\n"); 376 auto OrigBBFreq = BFI.getBlockFreq(BB); 377 378 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); 379 BasicBlock::iterator It(*MO.I); 380 ++It; 381 assert(It != DefaultBB->end()); 382 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 383 MergeBB->setName("MemOP.Merge"); 384 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 385 DefaultBB->setName("MemOP.Default"); 386 387 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 388 auto &Ctx = Func.getContext(); 389 IRBuilder<> IRB(BB); 390 BB->getTerminator()->eraseFromParent(); 391 Value *SizeVar = MO.getLength(); 392 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 393 Type *MemOpTy = MO.I->getType(); 394 PHINode *PHI = nullptr; 395 if (!MemOpTy->isVoidTy()) { 396 // Insert a phi for the return values at the merge block. 397 IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); 398 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); 399 MO.I->replaceAllUsesWith(PHI); 400 PHI->addIncoming(MO.I, DefaultBB); 401 } 402 403 // Clear the value profile data. 404 MO.I->setMetadata(LLVMContext::MD_prof, nullptr); 405 // If all promoted, we don't need the MD.prof metadata. 406 if (SavedRemainCount > 0 || Version != NumVals) { 407 // Otherwise we need update with the un-promoted records back. 408 ArrayRef<InstrProfValueData> RemVDs(RemainingVDs); 409 annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount, 410 IPVK_MemOPSize, NumVals); 411 } 412 413 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 414 415 std::vector<DominatorTree::UpdateType> Updates; 416 if (DT) 417 Updates.reserve(2 * SizeIds.size()); 418 419 for (uint64_t SizeId : SizeIds) { 420 BasicBlock *CaseBB = BasicBlock::Create( 421 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 422 MemOp NewMO = MO.clone(); 423 // Fix the argument. 424 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType()); 425 assert(SizeType && "Expected integer type size argument."); 426 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 427 NewMO.setLength(CaseSizeId); 428 CaseBB->getInstList().push_back(NewMO.I); 429 IRBuilder<> IRBCase(CaseBB); 430 IRBCase.CreateBr(MergeBB); 431 SI->addCase(CaseSizeId, CaseBB); 432 if (!MemOpTy->isVoidTy()) 433 PHI->addIncoming(NewMO.I, CaseBB); 434 if (DT) { 435 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 436 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 437 } 438 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 439 } 440 DTU.applyUpdates(Updates); 441 Updates.clear(); 442 443 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 444 445 LLVM_DEBUG(dbgs() << *BB << "\n"); 446 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 447 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 448 449 ORE.emit([&]() { 450 using namespace ore; 451 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) 452 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " 453 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) 454 << " for " << NV("Versions", Version) << " versions"; 455 }); 456 457 return true; 458 } 459 } // namespace 460 461 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 462 OptimizationRemarkEmitter &ORE, 463 DominatorTree *DT, TargetLibraryInfo &TLI) { 464 if (DisableMemOPOPT) 465 return false; 466 467 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 468 return false; 469 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); 470 MemOPSizeOpt.perform(); 471 return MemOPSizeOpt.isChanged(); 472 } 473 474 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 475 FunctionAnalysisManager &FAM) { 476 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 477 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 478 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 479 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); 480 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 481 if (!Changed) 482 return PreservedAnalyses::all(); 483 auto PA = PreservedAnalyses(); 484 PA.preserve<DominatorTreeAnalysis>(); 485 return PA; 486 } 487