//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Assign the mapped bank (or the matching condition bank for s1 values) to
  /// any registers that don't already have a register class or bank set.
  void applyBank(MachineInstr &MI) {
    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      // FIXME: This might not be enough to detect when SCC should be used.
      if (MRI.getType(Reg) == LLT::scalar(1))
        RB = (NewBank == &AMDGPU::SGPRRegBank ?
              &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

} // anonymous namespace
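// How the observer above is typically wired up (a sketch mirroring the use in
// applyMappingWideLoad below; all names are from this file):
//
//   ApplyRegBankMapping ApplyVGPR(MRI, &AMDGPU::VGPRRegBank);
//   GISelObserverWrapper Observer(&ApplyVGPR);
//   MachineIRBuilder B(MI);
//   B.setChangeObserver(Observer);
//   LegalizerHelper Helper(B.getMF(), Observer, B);
//   // Registers the helper creates get the VGPR bank (or VCC for s1 values)
//   // when ApplyVGPR is destroyed.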
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
    : AMDGPUGenRegisterBankInfo(),
      TRI(static_cast<const SIRegisterInfo*>(&TRI)) {

  // HACK: Until this is fully tablegen'd.
  static bool AlreadyInit = false;
  if (AlreadyInit)
    return;

  AlreadyInit = true;

  const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
  (void)RBSGPR;
  assert(&RBSGPR == &AMDGPU::SGPRRegBank);

  const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
  (void)RBVGPR;
  assert(&RBVGPR == &AMDGPU::VGPRRegBank);
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      Src.getID() == AMDGPU::VGPRRegBankID) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SCCRegBankID ||
       Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  if (Dst.getID() == AMDGPU::SCCRegBankID &&
      Src.getID() == AMDGPU::VCCRegBankID)
    return std::numeric_limits<unsigned>::max();

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}
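// Shape of the breakdown the cost above applies to (a sketch): a 64-bit
// pointer operand split into two 32-bit pieces in the same bank,
//
//   BreakDown[0] = { StartIdx = 0,  Length = 32 }
//   BreakDown[1] = { StartIdx = 32, Length = 32 }
//
// which RegBankSelect repairs by splitting the value into pieces and
// reassembling it around the instruction.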
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
    const TargetRegisterClass &RC) const {
  if (TRI->isSGPRClass(&RC))
    return getRegBank(AMDGPU::SGPRRegBankID);

  return getRegBank(AMDGPU::VGPRRegBankID);
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  unsigned MappingID = 0;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}
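// Reading aid for the OpRegBankEntry tables below (not code): each row lists
// the banks for the operands named by RegSrcOpIdx plus a relative cost. For
// amdgcn_readlane, {SGPR, VGPR, SGPR} with cost 1 is the natural form, while
// {SGPR, VGPR, VGPR} costs 2 because the lane index must first be made
// uniform with a readfirstlane.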
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane, old value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load: {
    static const OpRegBankEntry<3> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

static bool isInstrUniform(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  return AMDGPUInstrInfo::isUniformMMO(MMO);
}
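// What typically counts as uniform here (a rough sketch; the authoritative
// rules live in AMDGPUInstrInfo::isUniformMMO): loads whose pointer is a
// kernel argument or constant, or an instruction tagged with !amdgpu.uniform
// metadata by AMDGPUAnnotateUniformValues, e.g. in IR:
//
//   %val = load i32, i32 addrspace(4)* %const_ptr
//
// where addrspace(4) is the constant address space.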
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
          3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &SGPRMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
          3); // Num Operands
      AltMappings.push_back(&SGPRMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
          2, 10, getOperandsMapping(
                     {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
                      AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
                      AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
          3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
                  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                   AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                   AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 2, getOperandsMapping(
                  {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                   AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                   AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
        3); // Num Operands
    AltMappings.push_back(&VVMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(
        3, 3, getOperandsMapping(
                  {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                   AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
                   AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
        3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make VS more expensive than SV.
    const InstructionMapping &VSMapping = getInstructionMapping(
        3, 4, getOperandsMapping(
                  {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                   AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                   AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
    // FIXME: Should we be hard coding the size for these mappings?
    if (isInstrUniform(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1, getOperandsMapping(
                  {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
                   AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably only for
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.
    return AltMappings;
  }
  case TargetOpcode::G_ICMP: {
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}
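// What split64BitValueForMapping emits for an s64 register %x (a sketch in
// generic MIR; the names are illustrative):
//
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %x(s64)
//
// with both halves placed in the same bank as %x, so 64-bit SALU-only
// operations can be rewritten as two 32-bit pieces.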
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in the compares: they identify every lane
/// holding the same value, so each unique value is only processed once.
void AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock::iterator I(MI);

  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Use a set to avoid extra readfirstlanes in the case where multiple
  // operands are the same register.
  SmallSet<Register, 4> SGPROperandRegs;
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  if (SGPROperandRegs.empty())
    return;

  MachineIRBuilder B(MI);
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;
  for (MachineOperand &Def : MI.defs()) {
    LLT ResTy = MRI.getType(Def.getReg());
    const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
    ResultRegs.push_back(Def.getReg());
    Register InitReg = B.buildUndef(ResTy).getReg(0);
    Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
    InitResultRegs.push_back(InitReg);
    PhiRegs.push_back(PhiReg);
    MRI.setRegBank(PhiReg, *DefBank);
    MRI.setRegBank(InitReg, *DefBank);
  }

  Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block before this
  // instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  // Move the instruction into the loop.
  LoopBB->splice(LoopBB->end(), &MBB, I);
  I = std::prev(LoopBB->end());

  B.setInstr(*I);
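  // The CFG at this point (a sketch):
  //
  //   MBB ---> LoopBB ---> RestoreExecBB ---> RemainderBB
  //             ^    |
  //             +----+  (LoopBB branches back while EXEC has lanes to cover)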
  Register CondReg;

  for (MachineOperand &Op : MI.uses()) {
    if (!Op.isReg())
      continue;

    assert(!Op.isDef());
    if (SGPROperandRegs.count(Op.getReg())) {
      LLT OpTy = MRI.getType(Op.getReg());
      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(Op.getReg());

        Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just-read value against the value in each lane.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(Op.getReg());
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

          // If there are multiple operands to consider, AND the conditions
          // together.
          B.buildInstr(AMDGPU::S_AND_B64)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.
        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = Is64 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = Is64 ? AMDGPU::V_CMP_EQ_U64_e64
                              : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.
        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the merged
              // pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg
              = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(AMDGPU::S_AND_B64)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
      }
    }
  }
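  // For each 64-bit piece of a wide SGPR operand, the loop body built above
  // looks roughly like this (illustrative MIR, not exact syntax):
  //
  //   %lo:sreg_32_xm0 = V_READFIRSTLANE_B32 %piece.sub0
  //   %hi:sreg_32_xm0 = V_READFIRSTLANE_B32 %piece.sub1
  //   %lane:sreg_64_xexec = G_MERGE_VALUES %lo, %hi
  //   %eq = V_CMP_EQ_U64_e64 %lane, %piece
  //   %cond = S_AND_B64 %eq, %cond   ; accumulated across pieces/operands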
  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, saving the original EXEC value.
  B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(AMDGPU::S_XOR_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
    .addReg(AMDGPU::EXEC);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(AMDGPU::S_MOV_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(SaveExecReg);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}
// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the targets will either delete or rewrite the original instruction that
// wrote to the repaired registers. Because of this, we end up in a situation
// where we have 2 instructions defining the same registers.
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  // Is there some way we can assert that there are exactly 2 def instructions?
  for (MachineInstr &Other : MRI.def_instructions(Reg)) {
    if (&Other != &MI)
      return &Other;
  }

  return nullptr;
}

bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty())
    return false;

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the smaller loads and writes the result to
  // DstReg. The register bank selector has also added the RepairInst which
  // writes to DstReg as well.
  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
  return true;
}
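// The overall effect (a sketch): an over-wide VGPR load such as
//
//   %dst:vgpr(<8 x s32>) = G_LOAD %ptr
//
// is split by the LegalizerHelper into 128-bit loads, and the pieces the
// mapping asked for are then extracted from the combined result with the
// G_EXTRACT_VECTOR_ELT instructions built above.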
// For cases where only a single copy is inserted for a matching register
// bank, replace the register in the instruction operand.
static void substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
  }
}

void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    MachineIRBuilder B(MI);
    if (Src0Regs.empty())
      Src0Regs.push_back(MI.getOperand(1).getReg());
    else
      assert(Src0Regs.size() == 1);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
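  // e.g. a 64-bit G_SELECT with VGPR inputs becomes (a sketch):
  //
  //   %dst_lo = G_SELECT %cond, %a_lo, %b_lo
  //   %dst_hi = G_SELECT %cond, %a_hi, %b_hi
  //
  // since there is no 64-bit VALU select; the halves come from
  // split64BitValueForMapping when the generic code hasn't already split them.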
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need
    // to extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc)
      .addDef(DefRegs[0])
      .addUse(Src0Regs[0])
      .addUse(Src1Regs[0]);

    B.buildInstr(Opc)
      .addDef(DefRegs[1])
      .addUse(Src0Regs[1])
      .addUse(Src1Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(16))
      break;

    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
        LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");
    return;
  }
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    // Turn scalar min/max into a compare and select.
    LLT Ty = MRI.getType(DstReg);
    LLT S32 = LLT::scalar(32);
    LLT S16 = LLT::scalar(16);

    if (Ty == S16) {
      // Need to widen to s32, and expand as cmp + select.
      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widenScalar should have succeeded");

      // FIXME: This is relying on widenScalar leaving MI in place.
      if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    } else {
      if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    }

    return;
  }
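  // What the two scalar cases above produce (a sketch of the helper's
  // output): an SGPR s16 op is widened to s32,
  //
  //   %a32 = G_ANYEXT %a16;  %b32 = G_ANYEXT %b16
  //   %r32 = G_ADD %a32, %b32
  //   %r16 = G_TRUNC %r32
  //
  // and scalar min/max is additionally lowered to G_ICMP + G_SELECT.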
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    bool Signed = Opc == AMDGPU::G_SEXT;

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::SCCRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      const LLT S32 = LLT::scalar(32);
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);

        // Replicate sign bit from 32-bit extended part.
        auto ShiftAmt = B.buildConstant(S32, 31);
        MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
        B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
      } else {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
        B.buildConstant(DefRegs[1], 0);
      }

      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
        &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SCCRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        B.buildCopy(DefRegs[1], DefRegs[0]);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    // Fix up the case with an s1 src that isn't a condition register. Use
    // shifts instead of introducing a compare to avoid an unnecessary
    // condition register (and since there are no scalar 16-bit compares).
    auto Ext = B.buildAnyExt(DstTy, SrcReg);
    auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
    auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);

    if (MI.getOpcode() == AMDGPU::G_SEXT)
      B.buildAShr(DstReg, Shl, ShiftAmt);
    else
      B.buildLShr(DstReg, Shl, ShiftAmt);

    MRI.setRegBank(DstReg, *SrcBank);
    MRI.setRegBank(Ext.getReg(0), *SrcBank);
    MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
    MRI.setRegBank(Shl.getReg(0), *SrcBank);
    MI.eraseFromParent();
    return;
  }
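  // e.g. G_SEXT of a VCC-bank s1 to s32 in the case above becomes (a sketch):
  //
  //   %t = G_CONSTANT i32 -1;  %f = G_CONSTANT i32 0
  //   %r = G_SELECT %src, %t, %f
  //
  // while a non-condition s1 source instead uses the shift sequence (shl by
  // 31, then ashr/lshr by 31).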
  case AMDGPU::G_EXTRACT_VECTOR_ELT:
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { 2 });
    return;
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
      executeInWaterfallLoop(MI, MRI, { 2, 3 });
      return;
    }
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(3)));

      // Make sure the index is an SGPR. It doesn't make sense to run this in
      // a waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(2)));
      assert(empty(OpdMapper.getVRegs(3)));

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    default:
      break;
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    case Intrinsic::amdgcn_buffer_load: {
      executeInWaterfallLoop(MI, MRI, { 2 });
      return;
    }
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(empty(OpdMapper.getVRegs(0)));
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    default:
      break;
    }
    break;
  }
  case AMDGPU::G_LOAD: {
    if (applyMappingWideLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg())
      continue;
    Register Reg = MI.getOperand(i).getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      if (Bank->getID() == AMDGPU::VGPRRegBankID)
        return false;

      assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
             Bank->getID() == AMDGPU::SCCRegBankID);
    }
  }
  return true;
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned OpdIdx = 0;

  unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);

  if (MI.getOperand(OpdIdx).isIntrinsicID())
    OpdsMapping[OpdIdx++] = nullptr;

  Register Reg1 = MI.getOperand(OpdIdx).getReg();
  unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);

  unsigned DefaultBankID = Size1 == 1 ?
    AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);

  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);

  for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
    const MachineOperand &MO = MI.getOperand(OpdIdx);
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  if (isInstrUniform(MI)) {
    // We have a uniform instruction so we want to use an SMRD load
    ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
  } else {
    ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
    // FIXME: What would happen if we used SGPRRegBankID here?
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}

unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     const TargetRegisterInfo &TRI,
                                     unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
  return Bank ? Bank->getID() : Default;
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  if (isInstrUniform(MI)) {
    // We have a uniform instruction so we want to use an SMRD load.
    ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
  } else {
    ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID,
                                                     LoadTy);
    // FIXME: What would happen if we used SGPRRegBankID here?
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     const TargetRegisterInfo &TRI,
                                     unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
  return Bank ? Bank->getID() : Default;
}

///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

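  // For a phi, the result bank is the join of the incoming banks: any VGPR
  // or undetermined input forces a VGPR result, scc inputs are widened to
  // sgpr because there is only one SCC register, and mixing vcc with sgpr
  // inputs falls back to vgpr.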
  // The default handling is broken and doesn't handle illegal VGPR->SGPR
  // copies properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    // TODO: Generate proper invalid bank enum.
    int ResultBank = -1;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      unsigned OpBank = Bank->getID();
      // scc, scc -> sgpr
      if (OpBank == AMDGPU::SCCRegBankID) {
        // There's only one SCC register, so a phi requires copying to SGPR.
        OpBank = AMDGPU::SGPRRegBankID;
      } else if (OpBank == AMDGPU::VCCRegBankID) {
        // vcc, vcc -> vcc
        // vcc, sgpr -> vgpr
        if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
          ResultBank = AMDGPU::VGPRRegBankID;
          break;
        }
      }

      ResultBank = OpBank;
    }

    assert(ResultBank != -1);

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();

  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = -1;
      unsigned BankLHS = -1;
      unsigned BankRHS = -1;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (DstBank == &AMDGPU::SCCRegBank) {
          TargetBankID = AMDGPU::SCCRegBankID;
          BankLHS = AMDGPU::SGPRRegBankID;
          BankRHS = AMDGPU::SGPRRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                               AMDGPU::VCCRegBankID);

        // Both inputs should be true booleans to produce a boolean result.
        if (BankLHS == AMDGPU::VGPRRegBankID ||
            BankRHS == AMDGPU::VGPRRegBankID) {
          TargetBankID = AMDGPU::VGPRRegBankID;
        } else if (BankLHS == AMDGPU::VCCRegBankID ||
                   BankRHS == AMDGPU::VCCRegBankID) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (BankLHS == AMDGPU::SGPRRegBankID &&
                   BankRHS == AMDGPU::SGPRRegBankID) {
          TargetBankID = AMDGPU::SGPRRegBankID;
        } else if (BankLHS == AMDGPU::SCCRegBankID ||
                   BankRHS == AMDGPU::SCCRegBankID) {
          // The operation must be done on a 32-bit register, but it will set
          // scc. The result type could interchangeably be SCC or SGPR, since
          // both values will be produced.
          TargetBankID = AMDGPU::SCCRegBankID;
          BankLHS = AMDGPU::SGPRRegBankID;
          BankRHS = AMDGPU::SGPRRegBankID;
        }
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }

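    // 64-bit logic ops are natively legal on the SALU (e.g. s_and_b64), but
    // the VALU only has 32-bit forms, so the SGPR64-only value mapping below
    // breaks VGPR operands into two 32-bit halves.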
    if (Size == 64) {
      if (isSALUMapping(MI)) {
        OpdsMapping[0] =
            AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] =
            AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                      *TRI/*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                      *TRI/*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    LLVM_FALLTHROUGH;
  }

  case AMDGPU::G_GEP:
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_SADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_SSUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    LLVM_FALLTHROUGH;

  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUND:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_BLOCK_ADDR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
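  // For G_INSERT/G_EXTRACT the bit offset is an immediate operand, so it
  // gets no value mapping; all register operands share a single bank.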
  case AMDGPU::G_INSERT: {
    unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                                          AMDGPU::VGPRRegBankID;
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_CTLZ:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ:
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BSWAP:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI, *TRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
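  // For extends, the source bank decides whether the result can stay scalar:
  // an s1 input in scc or sgpr extends on the SALU, while a vcc or vgpr input
  // must produce its result in VGPRs.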
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    assert(SrcBank);
    switch (SrcBank->getID()) {
    case AMDGPU::SCCRegBankID:
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // TODO: Should anyext be split into 32-bit part as well?
    if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
      OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
    } else {
      // Scalar extend can use 64-bit BFE, but VGPRs require extending to
      // 32-bits, and then to 64.
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                         SrcSize);
    }
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    // FIXME: We need to specify a different reg bank once scalar stores
    // are supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    // FIXME: Depending on the type of store, the pointer could be in
    // the SGPR Reg bank.
    // FIXME: Pointer size should be based on the address space.
    const ValueMapping *PtrMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);

    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = PtrMapping;
    break;
  }

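  // A compare can only write scc (and select to s_cmp_*) when both operands
  // are uniform; 64-bit scalar compares additionally require an eq/ne
  // predicate and subtarget support. Everything else produces a vcc lane
  // mask via v_cmp_*.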
  case AMDGPU::G_ICMP: {
    auto Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                      MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));

    unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
                            AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);

    // The index can be in either bank if the source vector is a VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
                            AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);

    // The index can be in either bank if the source vector is a VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                                        AMDGPU::VGPRRegBankID;

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
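  // Side-effect-free intrinsics: most behave like ordinary VALU instructions
  // and take the default VOP mapping; the rest are cases whose results or
  // operands must ultimately be scalar.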
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::maxnum:
    case Intrinsic::minnum:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_ubfe:
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
    case Intrinsic::amdgcn_fdiv_fast:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_wqm:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register Offset = MI.getOperand(3).getReg(); // SGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
      OpdsMapping[4] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(
          getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(
          getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
                                               Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
                                               Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
      break;
    }
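    // amdgcn_readlane deliberately falls through into the readfirstlane
    // mapping: the fallthrough fills in the SGPR result and VGPR source,
    // while the lane index handled first must ultimately be an SGPR.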
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be
      // inserted to legalize.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume:
    case Intrinsic::amdgcn_ds_fadd:
    case Intrinsic::amdgcn_ds_fmin:
    case Intrinsic::amdgcn_ds_fmax:
    case Intrinsic::amdgcn_atomic_inc:
    case Intrinsic::amdgcn_atomic_dec:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
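    // Exports: the target, enable, done, and vm operands are hardware
    // immediates; as the FIXMEs below note, mapping them as 32-bit SGPRs is
    // only a placeholder, since immediates can't be read from registers.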
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_buffer_load: {
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register VIndex = MI.getOperand(3).getReg(); // VGPR
      Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
      unsigned Size4 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
      OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
      OpdsMapping[5] = nullptr;
      OpdsMapping[6] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    }
    break;
  }
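  // For G_SELECT the condition bank picks the instruction: an scc condition
  // selects to a scalar s_cselect, while a vcc condition selects to
  // v_cndmask, with 64-bit values split into two 32-bit halves on the VALU.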
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SCCRegBankID);

    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_LOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMIC_CMPXCHG:
    return getDefaultMappingAllVGPR(MI);
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SCCRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}