//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;
  MachineBlockFrequencyInfo *MBFI;
  ProfileSummaryInfo *PSI;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<MachineCombinerPattern, 16> &Patterns);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as the maximum latency of the defined
/// operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth,            // The data dependency chain must be improved.
  MustReduceRegisterPressure, // The register pressure must be reduced.
  Default                     // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
  case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
  case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
  case MachineCombinerPattern::SUBADD_OP1:
  case MachineCombinerPattern::SUBADD_OP2:
    return CombinerObjective::MustReduceDepth;
  case MachineCombinerPattern::REASSOC_XY_BCA:
  case MachineCombinerPattern::REASSOC_XY_BAC:
    return CombinerObjective::MustReduceRegisterPressure;
  default:
    return CombinerObjective::Default;
  }
}

/// Estimate the latency of the new and original instruction sequence by
/// summing up the latencies of the inserted and deleted instructions. This
/// assumes that the inserted and deleted instructions are dependent
/// instruction chains, which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}

bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineCombinerPattern Pattern) {
  // FIXME: for now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we can do better if we make the
  // RegPressureTracker class aware of the TIE attribute. Then we can get an
  // accurate comparison of register pressure with DelInstrs or InsInstrs.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the
/// new sequence to replace the old sequence is that it cannot lengthen the
/// critical path. The definition of "improve" may be restricted by specifying
/// that the new path improves the data dependency chain (MustReduceDepth).
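///
/// A sketch of the default-objective check performed below, with hypothetical
/// cycle counts purely for illustration: given NewRootDepth = 6,
/// NewRootLatency = 4, RootDepth = 5, RootLatency = 3 and RootSlack = 2, the
/// comparison is 6 + 4 <= 5 + 3 + 2, so the new sequence is accepted. The
/// slack term is only counted when SlackIsAccurate is set; without it the
/// same numbers would be rejected (10 > 8).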
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern,
    bool SlackIsAccurate) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t and it does it\n"
                   : dbgs() << "\t but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions by
  // summing up the latencies of the two instruction sequences.
  unsigned NewRootLatency, RootLatency;
  std::tie(NewRootLatency, RootLatency) =
      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t It IMPROVES PathLen because"
                 : dbgs() << "\n\t It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
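///
/// Note: the comparison below tolerates a target-defined increase; the new
/// sequence is still accepted as long as the resource length after the
/// combine does not exceed the length before it by more than
/// TII->getExtendResourceLenLimit().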
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.

  //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <=
      ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}

/// \returns true when the new instruction sequence should be generated
/// independent of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize,
                                   bool OptForSize) {
  if (OptForSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param MinInstr is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///        otherwise invalidate the trace
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                     SmallVector<MachineInstr *, 16> InsInstrs,
                                     SmallVector<MachineInstr *, 16> DelInstrs,
                                     MachineTraceMetrics::Ensemble *MinInstr,
                                     SparseSet<LiveRegUnit> &RegUnits,
                                     const TargetInstrInfo *TII,
                                     MachineCombinerPattern Pattern,
                                     bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because in genAlternativeCodeSequence, we have not decided
  // whether InsInstrs or DelInstrs is the better pattern, so we don't want to
  // generate any side effects in the function. For example, we need to delay
  // the constant pool entry creation here until InsInstrs is selected as the
  // better pattern. Otherwise the constant pool entry created for InsInstrs
  // will not be deleted even if InsInstrs is not the better pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    MinInstr->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between the original and new latency is decreasing
// for later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(
    MachineBasicBlock *MBB, MachineInstr &Root,
    SmallVector<MachineCombinerPattern, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!MinInstr)
    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
      });

      bool SubstituteAlways = false;
      if (ML && TII->isThroughputPattern(P))
        SubstituteAlways = true;

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed |= true;

          // Go back to previous instruction as it may have ILP reassociation
          // opportunity.
          BlockIter--;
          break;
        }
      }

      // Substitute when we optimize for code size and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (SubstituteAlways ||
          doSubstitute(NewInstCount, OldInstCount, OptForSize)) {
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                   RegUnits, TII, P, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary()) ?
         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
         nullptr;
  MinInstr = nullptr;
  OptSize = MF.getFunction().hasOptSize();
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << " Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}