//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define GET_GICOMBINER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {
#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

class AMDGPUPostLegalizerCombinerImpl : public Combiner {
protected:
  const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  const SIInstrInfo &TII;
  // TODO: Make CombinerHelper methods const.
  mutable AMDGPUCombinerHelper Helper;

public:
  AMDGPUPostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }

  bool tryCombineAllImpl(MachineInstr &I) const;
  bool tryCombineAll(MachineInstr &I) const override;

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const;
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info) const;

  bool matchUCharToFloat(MachineInstr &MI) const;
  void applyUCharToFloat(MachineInstr &MI) const;

  bool
  matchRcpSqrtToRsq(MachineInstr &MI,
                    std::function<void(MachineIRBuilder &)> &MatchInfo) const;

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
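  // Matchdata for the cvt_f32_ubyteN combine: CvtVal is the register feeding
  // the shift that feeds the conversion, and ShiftOffset is the bit offset of
  // the byte actually selected. Illustrative MIR shape (names assumed):
  //   %s:_(s32) = G_LSHR %x:_(s32), 16
  //   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s
  // reads byte 2 of %x and can be rewritten as G_AMDGPU_CVT_F32_UBYTE2 %x.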
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI,
                         CvtF32UByteMatchInfo &MatchInfo) const;
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo) const;

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;

  // Combine unsigned buffer load and sign extension instructions to generate
  // signed buffer load instructions.
  bool matchCombineSignExtendInReg(MachineInstr &MI,
                                   MachineInstr *&MatchInfo) const;
  void applyCombineSignExtendInReg(MachineInstr &MI,
                                   MachineInstr *&MatchInfo) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};

#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL

AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
      TII(*STI.getInstrInfo()),
      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) const {
  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  // TODO: Handle case where the selected value is an fneg and the compared
  // constant is the negation of the selected value.
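  // The select operands must be exactly the compared operands (in either
  // order); otherwise this is not a min/max pattern.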
  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
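  // An integer-to-float conversion whose source has everything above the low
  // byte known to be zero converts exactly the low byte, which is what
  // G_AMDGPU_CVT_F32_UBYTE0 computes.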
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
    MachineInstr &MI) const {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
    MachineInstr &MI,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;

    if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
      if (GI->is(Intrinsic::amdgcn_rcp))
        return MRI.getVRegDef(MI.getOperand(2).getReg());
    }
    return nullptr;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;
    MachineInstr *SqrtSrcMI = nullptr;
    auto Match =
        mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    (void)Match;
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }
  return false;
}

bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
  Register SrcReg = MI.getOperand(1).getReg();
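  // Fold a constant shift of the conversion source into the byte index. An
  // illustrative shl case (names assumed):
  //   %s:_(s32) = G_SHL %x:_(s32), 8
  //   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE2 %s
  // reads byte 1 of %x, so it becomes G_AMDGPU_CVT_F32_UBYTE1 %x.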
  // Look through G_ZEXT.
  bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) const {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

// The buffer_load_{i8, i16} intrinsics are initially lowered as
// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
// instructions are combined with sign extension instructions in order to
// generate buffer_load_{i8, i16} instructions.

// Identify buffer_load_{u8, u16}.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
  Register Op0Reg = MI.getOperand(1).getReg();
  SubwordBufferLoad = MRI.getVRegDef(Op0Reg);

  if (!MRI.hasOneNonDBGUse(Op0Reg))
    return false;

  // Check if the first operand of the sign extension is a subword buffer load
  // instruction.
  return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
         SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
}

// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
// buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
  // Modify the opcode and the destination of buffer_load_{u8, u16}:
  // Replace the opcode.
  unsigned Opc =
      SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
          ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
          : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
  SubwordBufferLoad->setDesc(TII.get(Opc));
  // Update the destination register of SubwordBufferLoad with the destination
  // register of the sign extension.
  Register SignExtendInsnDst = MI.getOperand(0).getReg();
  SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
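  // Illustrative effect in MIR (register names assumed):
  //   %w:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE ...
  //   %d:_(s32) = G_SEXT_INREG %w, 8
  // becomes
  //   %d:_(s32) = G_AMDGPU_BUFFER_LOAD_SBYTE ...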
  // Remove the sign extension.
  MI.eraseFromParent();
}

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();

  CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     LI, EnableOpt, F.hasOptSize(), F.hasMinSize());

  AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm