//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the transformation that optimizes memory intrinsics
// such as memcpy using the size value profile. When memory intrinsic size
// value profile metadata is available, a single memory intrinsic is expanded
// to a sequence of guarded specialized versions that are called with the
// hottest size(s), for later expansion into more optimal inline sequences.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/ProfileData/InstrProf.h"
// NOTE: the define below must stay immediately before the InstrProfData.inc
// include -- it selects the memop value-profiling API subset of that
// X-macro header (e.g. INSTR_PROF_NUM_BUCKETS used later in this file).
#define INSTR_PROF_VALUE_PROF_MEMOP_API
#include "llvm/ProfileData/InstrProfData.inc"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <cassert>
#include <cstdint>
#include <vector>

using namespace llvm;

// Prefix used by LLVM_DEBUG output and by the optimization remarks emitted
// from this pass ("-debug-only=pgo-memop-opt").
#define DEBUG_TYPE "pgo-memop-opt"

STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");

// The minimum call count to optimize memory intrinsic calls.
static cl::opt<unsigned>
    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000),
                        cl::desc("The minimum count to optimize memory "
                                 "intrinsic calls"));

// Command line option to disable memory intrinsic optimization. The default is
// false. This is for debug purpose.
static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
                                     cl::Hidden, cl::desc("Disable optimize"));

// The percent threshold to optimize memory intrinsic calls.
static cl::opt<unsigned>
    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
                          cl::Hidden,
                          cl::desc("The percentage threshold for the "
                                   "memory intrinsic calls optimization"));

// Maximum number of versions for optimizing memory intrinsic call.
static cl::opt<unsigned>
    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
                    cl::desc("The max version for the optimized memory "
                             " intrinsic calls"));

// Scale the counts from the annotation using the BB count value.
82 static cl::opt<bool> 83 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 84 cl::desc("Scale the memop size counts using the basic " 85 " block count value")); 86 87 cl::opt<bool> 88 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), 89 cl::Hidden, 90 cl::desc("Size-specialize memcmp and bcmp calls")); 91 92 static cl::opt<unsigned> 93 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), 94 cl::desc("Optimize the memop size <= this value")); 95 96 namespace { 97 98 static const char *getMIName(const MemIntrinsic *MI) { 99 switch (MI->getIntrinsicID()) { 100 case Intrinsic::memcpy: 101 return "memcpy"; 102 case Intrinsic::memmove: 103 return "memmove"; 104 case Intrinsic::memset: 105 return "memset"; 106 default: 107 return "unknown"; 108 } 109 } 110 111 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). 112 struct MemOp { 113 Instruction *I; 114 MemOp(MemIntrinsic *MI) : I(MI) {} 115 MemOp(CallInst *CI) : I(CI) {} 116 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); } 117 CallInst *asCI() { return cast<CallInst>(I); } 118 MemOp clone() { 119 if (auto MI = asMI()) 120 return MemOp(cast<MemIntrinsic>(MI->clone())); 121 return MemOp(cast<CallInst>(asCI()->clone())); 122 } 123 Value *getLength() { 124 if (auto MI = asMI()) 125 return MI->getLength(); 126 return asCI()->getArgOperand(2); 127 } 128 void setLength(Value *Length) { 129 if (auto MI = asMI()) 130 return MI->setLength(Length); 131 asCI()->setArgOperand(2, Length); 132 } 133 StringRef getFuncName() { 134 if (auto MI = asMI()) 135 return MI->getCalledFunction()->getName(); 136 return asCI()->getCalledFunction()->getName(); 137 } 138 bool isMemmove() { 139 if (auto MI = asMI()) 140 if (MI->getIntrinsicID() == Intrinsic::memmove) 141 return true; 142 return false; 143 } 144 bool isMemcmp(TargetLibraryInfo &TLI) { 145 LibFunc Func; 146 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 147 Func == 
LibFunc_memcmp) { 148 return true; 149 } 150 return false; 151 } 152 bool isBcmp(TargetLibraryInfo &TLI) { 153 LibFunc Func; 154 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 155 Func == LibFunc_bcmp) { 156 return true; 157 } 158 return false; 159 } 160 const char *getName(TargetLibraryInfo &TLI) { 161 if (auto MI = asMI()) 162 return getMIName(MI); 163 LibFunc Func; 164 if (TLI.getLibFunc(*asCI(), Func)) { 165 if (Func == LibFunc_memcmp) 166 return "memcmp"; 167 if (Func == LibFunc_bcmp) 168 return "bcmp"; 169 } 170 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); 171 return nullptr; 172 } 173 }; 174 175 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 176 public: 177 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 178 OptimizationRemarkEmitter &ORE, DominatorTree *DT, 179 TargetLibraryInfo &TLI) 180 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { 181 ValueDataArray = 182 std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS); 183 } 184 bool isChanged() const { return Changed; } 185 void perform() { 186 WorkList.clear(); 187 visit(Func); 188 189 for (auto &MO : WorkList) { 190 ++NumOfPGOMemOPAnnotate; 191 if (perform(MO)) { 192 Changed = true; 193 ++NumOfPGOMemOPOpt; 194 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() 195 << "is Transformed.\n"); 196 } 197 } 198 } 199 200 void visitMemIntrinsic(MemIntrinsic &MI) { 201 Value *Length = MI.getLength(); 202 // Not perform on constant length calls. 
203 if (isa<ConstantInt>(Length)) 204 return; 205 WorkList.push_back(MemOp(&MI)); 206 } 207 208 void visitCallInst(CallInst &CI) { 209 LibFunc Func; 210 if (TLI.getLibFunc(CI, Func) && 211 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && 212 !isa<ConstantInt>(CI.getArgOperand(2))) { 213 WorkList.push_back(MemOp(&CI)); 214 } 215 } 216 217 private: 218 Function &Func; 219 BlockFrequencyInfo &BFI; 220 OptimizationRemarkEmitter &ORE; 221 DominatorTree *DT; 222 TargetLibraryInfo &TLI; 223 bool Changed; 224 std::vector<MemOp> WorkList; 225 // The space to read the profile annotation. 226 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 227 bool perform(MemOp MO); 228 }; 229 230 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 231 assert(Count <= TotalCount); 232 if (Count < MemOPCountThreshold) 233 return false; 234 if (Count < TotalCount * MemOPPercentThreshold / 100) 235 return false; 236 return true; 237 } 238 239 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 240 uint64_t Denom) { 241 if (!MemOPScaleCount) 242 return Count; 243 bool Overflowed; 244 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 245 return ScaleCount / Denom; 246 } 247 248 bool MemOPSizeOpt::perform(MemOp MO) { 249 assert(MO.I); 250 if (MO.isMemmove()) 251 return false; 252 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) 253 return false; 254 255 uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS; 256 uint64_t TotalCount; 257 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, 258 ValueDataArray.get(), NumVals, TotalCount)) 259 return false; 260 261 uint64_t ActualCount = TotalCount; 262 uint64_t SavedTotalCount = TotalCount; 263 if (MemOPScaleCount) { 264 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); 265 if (!BBEdgeCount) 266 return false; 267 ActualCount = *BBEdgeCount; 268 } 269 270 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 271 LLVM_DEBUG(dbgs() << "Read one 
memory intrinsic profile with count " 272 << ActualCount << "\n"); 273 LLVM_DEBUG( 274 for (auto &VD 275 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 276 277 if (ActualCount < MemOPCountThreshold) 278 return false; 279 // Skip if the total value profiled count is 0, in which case we can't 280 // scale up the counts properly (and there is no profitable transformation). 281 if (TotalCount == 0) 282 return false; 283 284 TotalCount = ActualCount; 285 if (MemOPScaleCount) 286 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 287 << " denominator = " << SavedTotalCount << "\n"); 288 289 // Keeping track of the count of the default case: 290 uint64_t RemainCount = TotalCount; 291 uint64_t SavedRemainCount = SavedTotalCount; 292 SmallVector<uint64_t, 16> SizeIds; 293 SmallVector<uint64_t, 16> CaseCounts; 294 SmallDenseSet<uint64_t, 16> SeenSizeId; 295 uint64_t MaxCount = 0; 296 unsigned Version = 0; 297 // Default case is in the front -- save the slot here. 298 CaseCounts.push_back(0); 299 SmallVector<InstrProfValueData, 24> RemainingVDs; 300 for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { 301 auto &VD = *I; 302 int64_t V = VD.Value; 303 uint64_t C = VD.Count; 304 if (MemOPScaleCount) 305 C = getScaledCount(C, ActualCount, SavedTotalCount); 306 307 if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) { 308 RemainingVDs.push_back(VD); 309 continue; 310 } 311 312 // ValueCounts are sorted on the count. Break at the first un-profitable 313 // value. 
314 if (!isProfitable(C, RemainCount)) { 315 RemainingVDs.insert(RemainingVDs.end(), I, E); 316 break; 317 } 318 319 if (!SeenSizeId.insert(V).second) { 320 errs() << "warning: Invalid Profile Data in Function " << Func.getName() 321 << ": Two identical values in MemOp value counts.\n"; 322 return false; 323 } 324 325 SizeIds.push_back(V); 326 CaseCounts.push_back(C); 327 if (C > MaxCount) 328 MaxCount = C; 329 330 assert(RemainCount >= C); 331 RemainCount -= C; 332 assert(SavedRemainCount >= VD.Count); 333 SavedRemainCount -= VD.Count; 334 335 if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { 336 RemainingVDs.insert(RemainingVDs.end(), I + 1, E); 337 break; 338 } 339 } 340 341 if (Version == 0) 342 return false; 343 344 CaseCounts[0] = RemainCount; 345 if (RemainCount > MaxCount) 346 MaxCount = RemainCount; 347 348 uint64_t SumForOpt = TotalCount - RemainCount; 349 350 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 351 << " Versions (covering " << SumForOpt << " out of " 352 << TotalCount << ")\n"); 353 354 // mem_op(..., size) 355 // ==> 356 // switch (size) { 357 // case s1: 358 // mem_op(..., s1); 359 // goto merge_bb; 360 // case s2: 361 // mem_op(..., s2); 362 // goto merge_bb; 363 // ... 
364 // default: 365 // mem_op(..., size); 366 // goto merge_bb; 367 // } 368 // merge_bb: 369 370 BasicBlock *BB = MO.I->getParent(); 371 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 372 LLVM_DEBUG(dbgs() << *BB << "\n"); 373 auto OrigBBFreq = BFI.getBlockFreq(BB); 374 375 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); 376 BasicBlock::iterator It(*MO.I); 377 ++It; 378 assert(It != DefaultBB->end()); 379 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 380 MergeBB->setName("MemOP.Merge"); 381 BFI.setBlockFreq(MergeBB, OrigBBFreq); 382 DefaultBB->setName("MemOP.Default"); 383 384 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 385 auto &Ctx = Func.getContext(); 386 IRBuilder<> IRB(BB); 387 BB->getTerminator()->eraseFromParent(); 388 Value *SizeVar = MO.getLength(); 389 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 390 Type *MemOpTy = MO.I->getType(); 391 PHINode *PHI = nullptr; 392 if (!MemOpTy->isVoidTy()) { 393 // Insert a phi for the return values at the merge block. 394 IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); 395 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); 396 MO.I->replaceAllUsesWith(PHI); 397 PHI->addIncoming(MO.I, DefaultBB); 398 } 399 400 // Clear the value profile data. 401 MO.I->setMetadata(LLVMContext::MD_prof, nullptr); 402 // If all promoted, we don't need the MD.prof metadata. 403 if (SavedRemainCount > 0 || Version != NumVals) { 404 // Otherwise we need update with the un-promoted records back. 
405 ArrayRef<InstrProfValueData> RemVDs(RemainingVDs); 406 annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount, 407 IPVK_MemOPSize, NumVals); 408 } 409 410 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 411 412 std::vector<DominatorTree::UpdateType> Updates; 413 if (DT) 414 Updates.reserve(2 * SizeIds.size()); 415 416 for (uint64_t SizeId : SizeIds) { 417 BasicBlock *CaseBB = BasicBlock::Create( 418 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 419 MemOp NewMO = MO.clone(); 420 // Fix the argument. 421 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType()); 422 assert(SizeType && "Expected integer type size argument."); 423 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 424 NewMO.setLength(CaseSizeId); 425 NewMO.I->insertInto(CaseBB, CaseBB->end()); 426 IRBuilder<> IRBCase(CaseBB); 427 IRBCase.CreateBr(MergeBB); 428 SI->addCase(CaseSizeId, CaseBB); 429 if (!MemOpTy->isVoidTy()) 430 PHI->addIncoming(NewMO.I, CaseBB); 431 if (DT) { 432 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 433 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 434 } 435 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 436 } 437 DTU.applyUpdates(Updates); 438 Updates.clear(); 439 440 if (MaxCount) 441 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 442 443 LLVM_DEBUG(dbgs() << *BB << "\n"); 444 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 445 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 446 447 ORE.emit([&]() { 448 using namespace ore; 449 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) 450 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " 451 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) 452 << " for " << NV("Versions", Version) << " versions"; 453 }); 454 455 return true; 456 } 457 } // namespace 458 459 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 460 OptimizationRemarkEmitter &ORE, 461 DominatorTree *DT, TargetLibraryInfo &TLI) { 462 if 
(DisableMemOPOPT) 463 return false; 464 465 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 466 return false; 467 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); 468 MemOPSizeOpt.perform(); 469 return MemOPSizeOpt.isChanged(); 470 } 471 472 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 473 FunctionAnalysisManager &FAM) { 474 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 475 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 476 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 477 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); 478 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 479 if (!Changed) 480 return PreservedAnalyses::all(); 481 auto PA = PreservedAnalyses(); 482 PA.preserve<DominatorTreeAnalysis>(); 483 return PA; 484 } 485