//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
    "amdgpu-global-isel-risky-select",
    cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
    cl::init(false),
    cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

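// Return true if \p Reg is a virtual s1 register that belongs to the wave-wide
// VCC/boolean register bank rather than holding an SCC-style scalar condition.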
bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

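// Rewrite a copy-like intrinsic (e.g. wqm, strict_wwm) to the machine opcode
// NewOpc, dropping the intrinsic ID, adding an implicit EXEC use, and
// constraining source and destination to a common register class.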
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

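// Select a generic COPY. Copies producing a wave-wide boolean (VCC bank)
// destination need special handling: SCC sources, constant sources, and
// non-boolean sources are each expanded differently.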
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        bool IsSGPR = TRI.isSGPRClass(SrcRC);
        unsigned AndOpc =
            IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
        auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
                       .addImm(1)
                       .addReg(SrcReg);
        if (IsSGPR)
          And.setOperandDead(3); // Dead scc

        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
      MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

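// Return a MachineOperand for the 32-bit low or high half (SubIdx = sub0 or
// sub1) of a 64-bit operand: register operands get a COPY of the requested
// subregister into a fresh SubRC vreg, while immediates are split directly.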
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
        .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

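// Map a generic G_AND/G_OR/G_XOR opcode to the corresponding 32-bit or 64-bit
// SALU instruction.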
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

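// Select G_ADD/G_SUB (and G_PTR_ADD): 32-bit values map to a single SALU or
// VALU add/sub, while 64-bit adds are expanded into a lo/hi pair with carry
// and recombined with a REG_SEQUENCE.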
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
          BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
              .add(I.getOperand(1))
              .add(I.getOperand(2))
              .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
            .addDef(UnusedCarry, RegState::Dead)
            .add(I.getOperand(1))
            .add(I.getOperand(2))
            .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
        .add(Lo1)
        .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
        .add(Hi1)
        .add(Hi2)
        .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
        .addDef(CarryReg)
        .add(Lo1)
        .add(Lo2)
        .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
        .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
        .add(Hi1)
        .add(Hi2)
        .addReg(CarryReg, RegState::Kill)
        .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

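// Select the carrying add/sub generics. If the carry-out is a wave-wide
// boolean (VCC bank) this becomes a VALU carry instruction; otherwise the
// operation is selected to SALU code that routes carry-in and carry-out
// through SCC.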
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
        .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
                       .add(I.getOperand(2))
                       .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
        .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

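// Select G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32 to the corresponding
// V_MAD_U64_U32 / V_MAD_I64_I32 instruction, using the gfx11 variants on
// subtargets for which hasMADIntraFwdBug() is true.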
bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

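// Select G_MERGE_VALUES (and wide G_BUILD_VECTOR) by assembling the source
// registers into the destination with a REG_SEQUENCE over the split
// subregister indices.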
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

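// Select G_UNMERGE_VALUES by emitting one subregister COPY per destination
// out of the wide source register.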
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
        .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

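// Select G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC. Wide element sizes are
// forwarded to selectG_MERGE_VALUES; the remaining v2s16 cases try constant
// folding, then the imported patterns, and finally manual packing
// (V_LSHL_OR_B32 on the VALU, S_PACK_* on the SALU).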
bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

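// Select G_INSERT as an INSERT_SUBREG, provided the offset and size map onto
// a legal 32-bit-aligned subregister index.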
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
      .addReg(Src0Reg)
      .addReg(Src1Reg)
      .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
      .addImm(2)
      .addImm(MI.getOperand(4).getImm())  // $attr
      .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
      .addImm(0)                          // $src0_modifiers
      .addReg(Src0)                       // $src0
      .addImm(MI.getOperand(4).getImm())  // $attr
      .addImm(MI.getOperand(3).getImm())  // $attrchan
      .addImm(0)                          // $src2_modifiers
      .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
      .addImm(MI.getOperand(5).getImm())  // $high
      .addImm(0)                          // $clamp
      .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
          .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
                 .addDef(Dst1)
                 .addImm(0)     // $src0_modifiers
                 .addUse(Src0)  // $src0
                 .addImm(0)     // $src1_modifiers
                 .addUse(Denom) // $src1
                 .addImm(0)     // $src2_modifiers
                 .addUse(Numer) // $src2
                 .addImm(0)     // $clamp
                 .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
        .add(I.getOperand(0))
        .add(I.getOperand(2))
        .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_inverse_ballot:
    return selectInverseBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    return selectSMFMACIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

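// Return the VALU (VOPC) compare opcode for predicate P at the given operand
// size, or -1 if the comparison is not supported on this subtarget. The
// true16 variants are used when the subtarget has true16 instructions.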
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  if (Size != 16 && Size != 32 && Size != 64)
    return -1;

  if (Size == 16 && !ST.has16BitInsts())
    return -1;

  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
                          unsigned S64Opc) {
    if (Size == 16)
      return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
    if (Size == 32)
      return S32Opc;
    return S64Opc;
  };

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
                  AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
  case CmpInst::ICMP_EQ:
    return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
                  AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
  case CmpInst::ICMP_SGT:
    return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
                  AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
  case CmpInst::ICMP_SGE:
    return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
                  AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
  case CmpInst::ICMP_SLT:
    return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
                  AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
  case CmpInst::ICMP_SLE:
    return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
                  AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
  case CmpInst::ICMP_UGT:
    return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
                  AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
  case CmpInst::ICMP_UGE:
    return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
                  AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
  case CmpInst::ICMP_ULT:
    return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
                  AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
  case CmpInst::ICMP_ULE:
    return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
                  AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);

  case CmpInst::FCMP_OEQ:
    return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
                  AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
  case CmpInst::FCMP_OGT:
    return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
                  AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
  case CmpInst::FCMP_OGE:
    return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
                  AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
  case CmpInst::FCMP_OLT:
    return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
                  AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
  case CmpInst::FCMP_OLE:
    return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
                  AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
  case CmpInst::FCMP_ONE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_ORD:
    return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
                  AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
  case CmpInst::FCMP_UNO:
    return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
                  AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
  case CmpInst::FCMP_UEQ:
    return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
                  AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
  case CmpInst::FCMP_UGT:
    return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
                  AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
  case CmpInst::FCMP_UGE:
    return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
                  AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
  case CmpInst::FCMP_ULT:
    return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
                  AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
  case CmpInst::FCMP_ULE:
    return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
                  AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
  case CmpInst::FCMP_UNE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_TRUE:
    return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
                  AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
  case CmpInst::FCMP_FALSE:
    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
                  AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
  }
}

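// Return the scalar (S_CMP) compare opcode for predicate P at the given
// operand size, or -1 if no SALU form exists: 64-bit compares only support
// eq/ne, and 16-bit float compares require SALU float instructions.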
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}

bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
        .add(I.getOperand(2))
        .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
        .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  if (I.getOpcode() == AMDGPU::G_FCMP)
    return false;

  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
      .add(I.getOperand(2))
      .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  LLT DstTy = MRI->getType(Dst);
  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder SelectedMI;
  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
  Register Src0Reg =
      copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
  Register Src1Reg =
      copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
    SelectedMI.addImm(Src0Mods);
  SelectedMI.addReg(Src0Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
    SelectedMI.addImm(Src1Mods);
  SelectedMI.addReg(Src1Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
    SelectedMI.addImm(0); // clamp
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
    SelectedMI.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
    return false;

  I.eraseFromParent();
  return true;
}

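// Select llvm.amdgcn.ballot. Constant inputs of 0 and -1 fold to an immediate
// zero and a copy of EXEC respectively; i64 results in wave32 mode are
// widened by zero-filling the high half.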
bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;
  const bool IsWave32 = (STI.getWavefrontSize() == 32);

  // In the common case, the return type matches the wave size.
  // However, we also support emitting i64 ballots in wave32 mode.
  if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
    return false;

  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);

  const auto BuildCopy = [&](Register SrcReg) {
    if (Size == STI.getWavefrontSize()) {
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
          .addReg(SrcReg);
      return;
    }

    // If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
    Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
        .addReg(SrcReg)
        .addImm(AMDGPU::sub0)
        .addReg(HiReg)
        .addImm(AMDGPU::sub1);
  };

  if (Arg) {
    const int64_t Value = Arg->Value.getSExtValue();
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) // all ones
      BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
    else
      return false;
  } else
    BuildCopy(I.getOperand(2).getReg());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const Register DstReg = I.getOperand(0).getReg();
  const Register MaskReg = I.getOperand(2).getReg();

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
  I.eraseFromParent();
  return true;
}

selectRelocConstant(MachineInstr & I) const1473 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1474 Register DstReg = I.getOperand(0).getReg();
1475 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1476 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1477 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1478 return false;
1479
1480 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1481
1482 Module *M = MF->getFunction().getParent();
1483 const MDNode *Metadata = I.getOperand(2).getMetadata();
1484 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1485 auto RelocSymbol = cast<GlobalVariable>(
1486 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1487
1488 MachineBasicBlock *BB = I.getParent();
1489 BuildMI(*BB, &I, I.getDebugLoc(),
1490 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1491 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1492
1493 I.eraseFromParent();
1494 return true;
1495 }
1496
selectGroupStaticSize(MachineInstr & I) const1497 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1498 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1499
1500 Register DstReg = I.getOperand(0).getReg();
1501 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1502 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1503 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1504
1505 MachineBasicBlock *MBB = I.getParent();
1506 const DebugLoc &DL = I.getDebugLoc();
1507
1508 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1509
1510 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1511 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1512 MIB.addImm(MFI->getLDSSize());
1513 } else {
1514 Module *M = MF->getFunction().getParent();
1515 const GlobalValue *GV
1516 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1517 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1518 }
1519
1520 I.eraseFromParent();
1521 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1522 }
1523
1524 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1525 MachineBasicBlock *MBB = I.getParent();
1526 MachineFunction &MF = *MBB->getParent();
1527 const DebugLoc &DL = I.getDebugLoc();
1528
1529 MachineOperand &Dst = I.getOperand(0);
1530 Register DstReg = Dst.getReg();
1531 unsigned Depth = I.getOperand(2).getImm();
1532
1533 const TargetRegisterClass *RC
1534 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1535 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1536 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1537 return false;
1538
1539 // Check for kernel and shader functions
1540 if (Depth != 0 ||
1541 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1542 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1543 .addImm(0);
1544 I.eraseFromParent();
1545 return true;
1546 }
1547
1548 MachineFrameInfo &MFI = MF.getFrameInfo();
1549 // There is a call to @llvm.returnaddress in this function
1550 MFI.setReturnAddressIsTaken(true);
1551
1552 // Get the return address reg and mark it as an implicit live-in
1553 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1554 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1555 AMDGPU::SReg_64RegClass, DL);
1556 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1557 .addReg(LiveIn);
1558 I.eraseFromParent();
1559 return true;
1560 }
1561
1562 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1563 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1564 // SelectionDAG uses for wave32 vs wave64.
1565 MachineBasicBlock *BB = MI.getParent();
1566 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1567 .add(MI.getOperand(1));
1568
1569 Register Reg = MI.getOperand(1).getReg();
1570 MI.eraseFromParent();
1571
1572 if (!MRI->getRegClassOrNull(Reg))
1573 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1574 return true;
1575 }
1576
1577 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1578 MachineInstr &MI, Intrinsic::ID IntrID) const {
1579 MachineBasicBlock *MBB = MI.getParent();
1580 MachineFunction *MF = MBB->getParent();
1581 const DebugLoc &DL = MI.getDebugLoc();
1582
1583 unsigned IndexOperand = MI.getOperand(7).getImm();
1584 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1585 bool WaveDone = MI.getOperand(9).getImm() != 0;
1586
1587 if (WaveDone && !WaveRelease)
1588 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1589
1590 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1591 IndexOperand &= ~0x3f;
1592 unsigned CountDw = 0;
1593
1594 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1595 CountDw = (IndexOperand >> 24) & 0xf;
1596 IndexOperand &= ~(0xf << 24);
1597
1598 if (CountDw < 1 || CountDw > 4) {
1599 report_fatal_error(
1600 "ds_ordered_count: dword count must be between 1 and 4");
1601 }
1602 }
1603
1604 if (IndexOperand)
1605 report_fatal_error("ds_ordered_count: bad index operand");
1606
1607 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1608 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1609
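// Pack the DS_ORDERED_COUNT offset field: offset[7:2] holds the ordered count
// index, offset[8] is wave_release, offset[9] is wave_done, and offset[12]
// selects add vs. swap; the shader type and dword count land in the remaining
// bits depending on the generation, as computed below.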
1610 unsigned Offset0 = OrderedCountIndex << 2;
1611 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1612
1613 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1614 Offset1 |= (CountDw - 1) << 6;
1615
1616 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1617 Offset1 |= ShaderType << 2;
1618
1619 unsigned Offset = Offset0 | (Offset1 << 8);
1620
1621 Register M0Val = MI.getOperand(2).getReg();
1622 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1623 .addReg(M0Val);
1624
1625 Register DstReg = MI.getOperand(0).getReg();
1626 Register ValReg = MI.getOperand(3).getReg();
1627 MachineInstrBuilder DS =
1628 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1629 .addReg(ValReg)
1630 .addImm(Offset)
1631 .cloneMemRefs(MI);
1632
1633 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1634 return false;
1635
1636 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1637 MI.eraseFromParent();
1638 return Ret;
1639 }
1640
1641 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1642 switch (IntrID) {
1643 case Intrinsic::amdgcn_ds_gws_init:
1644 return AMDGPU::DS_GWS_INIT;
1645 case Intrinsic::amdgcn_ds_gws_barrier:
1646 return AMDGPU::DS_GWS_BARRIER;
1647 case Intrinsic::amdgcn_ds_gws_sema_v:
1648 return AMDGPU::DS_GWS_SEMA_V;
1649 case Intrinsic::amdgcn_ds_gws_sema_br:
1650 return AMDGPU::DS_GWS_SEMA_BR;
1651 case Intrinsic::amdgcn_ds_gws_sema_p:
1652 return AMDGPU::DS_GWS_SEMA_P;
1653 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1654 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1655 default:
1656 llvm_unreachable("not a gws intrinsic");
1657 }
1658 }
1659
1660 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1661 Intrinsic::ID IID) const {
1662 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1663 !STI.hasGWSSemaReleaseAll()))
1664 return false;
1665
1666 // intrinsic ID, vsrc, offset
1667 const bool HasVSrc = MI.getNumOperands() == 3;
1668 assert(HasVSrc || MI.getNumOperands() == 2);
1669
1670 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1671 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1672 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1673 return false;
1674
1675 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1676 unsigned ImmOffset;
1677
1678 MachineBasicBlock *MBB = MI.getParent();
1679 const DebugLoc &DL = MI.getDebugLoc();
1680
1681 MachineInstr *Readfirstlane = nullptr;
1682
1683 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1684 // incoming offset, in case there's an add of a constant. We'll have to put it
1685 // back later.
1686 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1687 Readfirstlane = OffsetDef;
1688 BaseOffset = OffsetDef->getOperand(1).getReg();
1689 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1690 }
1691
1692 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1693 // If we have a constant offset, try to use the 0 in m0 as the base.
1694 // TODO: Look into changing the default m0 initialization value. If the
1695 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1696 // the immediate offset.
1697
1698 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1699 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1700 .addImm(0);
1701 } else {
1702 std::tie(BaseOffset, ImmOffset) =
1703 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1704
1705 if (Readfirstlane) {
1706 // We have the constant offset now, so put the readfirstlane back on the
1707 // variable component.
1708 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1709 return false;
1710
1711 Readfirstlane->getOperand(1).setReg(BaseOffset);
1712 BaseOffset = Readfirstlane->getOperand(0).getReg();
1713 } else {
1714 if (!RBI.constrainGenericRegister(BaseOffset,
1715 AMDGPU::SReg_32RegClass, *MRI))
1716 return false;
1717 }
1718
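// Shift the variable base into bits [21:16] of m0, which is where the GWS
// hardware reads the resource base from (see the comment below).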
1719 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1720 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1721 .addReg(BaseOffset)
1722 .addImm(16)
1723 .setOperandDead(3); // Dead scc
1724
1725 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1726 .addReg(M0Base);
1727 }
1728
1729 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1730 // offset field) % 64. Some versions of the programming guide omit the m0
1731 // part, or claim it's from offset 0.
1732 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1733
1734 if (HasVSrc) {
1735 Register VSrc = MI.getOperand(1).getReg();
1736 MIB.addReg(VSrc);
1737
1738 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1739 return false;
1740 }
1741
1742 MIB.addImm(ImmOffset)
1743 .cloneMemRefs(MI);
1744
1745 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1746
1747 MI.eraseFromParent();
1748 return true;
1749 }
1750
1751 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1752 bool IsAppend) const {
1753 Register PtrBase = MI.getOperand(2).getReg();
1754 LLT PtrTy = MRI->getType(PtrBase);
1755 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1756
1757 unsigned Offset;
1758 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1759
1760 // TODO: Should this try to look through readfirstlane like GWS?
1761 if (!isDSOffsetLegal(PtrBase, Offset)) {
1762 PtrBase = MI.getOperand(2).getReg();
1763 Offset = 0;
1764 }
1765
1766 MachineBasicBlock *MBB = MI.getParent();
1767 const DebugLoc &DL = MI.getDebugLoc();
1768 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1769
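// DS_APPEND/DS_CONSUME take their base address in m0; any remaining constant
// goes in the immediate offset field.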
1770 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1771 .addReg(PtrBase);
1772 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1773 return false;
1774
1775 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1776 .addImm(Offset)
1777 .addImm(IsGDS ? -1 : 0)
1778 .cloneMemRefs(MI);
1779 MI.eraseFromParent();
1780 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1781 }
1782
1783 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1784 if (TM.getOptLevel() > CodeGenOptLevel::None) {
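// If the workgroup fits in a single wave, the lanes already run in lockstep,
// so the barrier can be lowered to a scheduling-only WAVE_BARRIER.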
1785 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1786 if (WGSize <= STI.getWavefrontSize()) {
1787 MachineBasicBlock *MBB = MI.getParent();
1788 const DebugLoc &DL = MI.getDebugLoc();
1789 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1790 MI.eraseFromParent();
1791 return true;
1792 }
1793 }
1794
1795 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1796 if (STI.hasSplitBarriers()) {
1797 MachineBasicBlock *MBB = MI.getParent();
1798 const DebugLoc &DL = MI.getDebugLoc();
1799 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1800 .addImm(AMDGPU::Barrier::WORKGROUP);
1801 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1802 .addImm(AMDGPU::Barrier::WORKGROUP);
1803 MI.eraseFromParent();
1804 return true;
1805 }
1806
1807 return selectImpl(MI, *CoverageInfo);
1808 }
1809
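// Decode the texfailctrl immediate: bit 0 is TFE and bit 1 is LWE. Returns
// false if any other bits are set.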
1810 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1811 bool &IsTexFail) {
1812 if (TexFailCtrl)
1813 IsTexFail = true;
1814
1815 TFE = (TexFailCtrl & 0x1) ? true : false;
1816 TexFailCtrl &= ~(uint64_t)0x1;
1817 LWE = (TexFailCtrl & 0x2) ? true : false;
1818 TexFailCtrl &= ~(uint64_t)0x2;
1819
1820 return TexFailCtrl == 0;
1821 }
1822
1823 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1824 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1825 MachineBasicBlock *MBB = MI.getParent();
1826 const DebugLoc &DL = MI.getDebugLoc();
1827
1828 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1829 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1830
1831 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1832 unsigned IntrOpcode = Intr->BaseOpcode;
1833 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1834 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1835 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1836
1837 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1838
1839 Register VDataIn, VDataOut;
1840 LLT VDataTy;
1841 int NumVDataDwords = -1;
1842 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1843 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1844
1845 bool Unorm;
1846 if (!BaseOpcode->Sampler)
1847 Unorm = true;
1848 else
1849 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1850
1851 bool TFE;
1852 bool LWE;
1853 bool IsTexFail = false;
1854 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1855 TFE, LWE, IsTexFail))
1856 return false;
1857
1858 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1859 const bool IsA16 = (Flags & 1) != 0;
1860 const bool IsG16 = (Flags & 2) != 0;
1861
1862 // A16 implies 16-bit gradients if the subtarget doesn't support G16.
1863 if (IsA16 && !STI.hasG16() && !IsG16)
1864 return false;
1865
1866 unsigned DMask = 0;
1867 unsigned DMaskLanes = 0;
1868
1869 if (BaseOpcode->Atomic) {
1870 VDataOut = MI.getOperand(0).getReg();
1871 VDataIn = MI.getOperand(2).getReg();
1872 LLT Ty = MRI->getType(VDataIn);
1873
1874 // Be careful to allow atomic swap on 16-bit element vectors.
1875 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1876 Ty.getSizeInBits() == 128 :
1877 Ty.getSizeInBits() == 64;
1878
1879 if (BaseOpcode->AtomicX2) {
1880 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1881
1882 DMask = Is64Bit ? 0xf : 0x3;
1883 NumVDataDwords = Is64Bit ? 4 : 2;
1884 } else {
1885 DMask = Is64Bit ? 0x3 : 0x1;
1886 NumVDataDwords = Is64Bit ? 2 : 1;
1887 }
1888 } else {
1889 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1890 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1891
1892 if (BaseOpcode->Store) {
1893 VDataIn = MI.getOperand(1).getReg();
1894 VDataTy = MRI->getType(VDataIn);
1895 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1896 } else {
1897 VDataOut = MI.getOperand(0).getReg();
1898 VDataTy = MRI->getType(VDataOut);
1899 NumVDataDwords = DMaskLanes;
1900
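// With packed D16, two 16-bit components share each result dword.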
1901 if (IsD16 && !STI.hasUnpackedD16VMem())
1902 NumVDataDwords = (DMaskLanes + 1) / 2;
1903 }
1904 }
1905
1906 // Set G16 opcode
1907 if (Subtarget->hasG16() && IsG16) {
1908 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1909 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1910 assert(G16MappingInfo);
1911 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1912 }
1913
1914 // TODO: Check this in verifier.
1915 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1916
1917 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1918 if (BaseOpcode->Atomic)
1919 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1920 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1921 AMDGPU::CPol::VOLATILE))
1922 return false;
1923
1924 int NumVAddrRegs = 0;
1925 int NumVAddrDwords = 0;
1926 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1927 // Skip the $noregs and 0s inserted during legalization.
1928 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1929 if (!AddrOp.isReg())
1930 continue; // XXX - Break?
1931
1932 Register Addr = AddrOp.getReg();
1933 if (!Addr)
1934 break;
1935
1936 ++NumVAddrRegs;
1937 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1938 }
1939
1940 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1941 // NSA, these should have been packed into a single value in the first
1942 // address register
1943 const bool UseNSA =
1944 NumVAddrRegs != 1 &&
1945 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1946 : NumVAddrDwords == NumVAddrRegs);
1947 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1948 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1949 return false;
1950 }
1951
1952 if (IsTexFail)
1953 ++NumVDataDwords;
1954
1955 int Opcode = -1;
1956 if (IsGFX12Plus) {
1957 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1958 NumVDataDwords, NumVAddrDwords);
1959 } else if (IsGFX11Plus) {
1960 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1961 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1962 : AMDGPU::MIMGEncGfx11Default,
1963 NumVDataDwords, NumVAddrDwords);
1964 } else if (IsGFX10Plus) {
1965 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1966 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1967 : AMDGPU::MIMGEncGfx10Default,
1968 NumVDataDwords, NumVAddrDwords);
1969 } else {
1970 if (Subtarget->hasGFX90AInsts()) {
1971 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1972 NumVDataDwords, NumVAddrDwords);
1973 if (Opcode == -1) {
1974 LLVM_DEBUG(
1975 dbgs()
1976 << "requested image instruction is not supported on this GPU\n");
1977 return false;
1978 }
1979 }
1980 if (Opcode == -1 &&
1981 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1982 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1983 NumVDataDwords, NumVAddrDwords);
1984 if (Opcode == -1)
1985 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1986 NumVDataDwords, NumVAddrDwords);
1987 }
1988 if (Opcode == -1)
1989 return false;
1990
1991 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1992 .cloneMemRefs(MI);
1993
1994 if (VDataOut) {
1995 if (BaseOpcode->AtomicX2) {
1996 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1997
1998 Register TmpReg = MRI->createVirtualRegister(
1999 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2000 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2001
2002 MIB.addDef(TmpReg);
2003 if (!MRI->use_empty(VDataOut)) {
2004 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2005 .addReg(TmpReg, RegState::Kill, SubReg);
2006 }
2007
2008 } else {
2009 MIB.addDef(VDataOut); // vdata output
2010 }
2011 }
2012
2013 if (VDataIn)
2014 MIB.addReg(VDataIn); // vdata input
2015
2016 for (int I = 0; I != NumVAddrRegs; ++I) {
2017 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2018 if (SrcOp.isReg()) {
2019 assert(SrcOp.getReg() != 0);
2020 MIB.addReg(SrcOp.getReg());
2021 }
2022 }
2023
2024 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2025 if (BaseOpcode->Sampler)
2026 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2027
2028 MIB.addImm(DMask); // dmask
2029
2030 if (IsGFX10Plus)
2031 MIB.addImm(DimInfo->Encoding);
2032 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2033 MIB.addImm(Unorm);
2034
2035 MIB.addImm(CPol);
2036 MIB.addImm(IsA16 && // a16 or r128
2037 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2038 if (IsGFX10Plus)
2039 MIB.addImm(IsA16 ? -1 : 0);
2040
2041 if (!Subtarget->hasGFX90AInsts()) {
2042 MIB.addImm(TFE); // tfe
2043 } else if (TFE) {
2044 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2045 return false;
2046 }
2047
2048 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2049 MIB.addImm(LWE); // lwe
2050 if (!IsGFX10Plus)
2051 MIB.addImm(DimInfo->DA ? -1 : 0);
2052 if (BaseOpcode->HasD16)
2053 MIB.addImm(IsD16 ? -1 : 0);
2054
2055 if (IsTexFail) {
2056 // An image load instruction with TFE/LWE only conditionally writes to its
2057 // result registers. Initialize them to zero so that we always get well
2058 // defined result values.
2059 assert(VDataOut && !VDataIn);
2060 Register Tied = MRI->cloneVirtualRegister(VDataOut);
2061 Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2062 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
2063 .addImm(0);
2064 auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
2065 if (STI.usePRTStrictNull()) {
2066 // With enable-prt-strict-null enabled, initialize all result registers to
2067 // zero.
2068 auto RegSeq =
2069 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
2070 for (auto Sub : Parts)
2071 RegSeq.addReg(Zero).addImm(Sub);
2072 } else {
2073 // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
2074 // result register.
2075 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2076 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
2077 auto RegSeq =
2078 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
2079 for (auto Sub : Parts.drop_back(1))
2080 RegSeq.addReg(Undef).addImm(Sub);
2081 RegSeq.addReg(Zero).addImm(Parts.back());
2082 }
2083 MIB.addReg(Tied, RegState::Implicit);
2084 MIB->tieOperands(0, MIB->getNumOperands() - 1);
2085 }
2086
2087 MI.eraseFromParent();
2088 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2089 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2090 return true;
2091 }
2092
2093 // We need to handle this here because tablegen doesn't support matching
2094 // instructions with multiple outputs.
2095 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2096 MachineInstr &MI) const {
2097 Register Dst0 = MI.getOperand(0).getReg();
2098 Register Dst1 = MI.getOperand(1).getReg();
2099
2100 const DebugLoc &DL = MI.getDebugLoc();
2101 MachineBasicBlock *MBB = MI.getParent();
2102
2103 Register Addr = MI.getOperand(3).getReg();
2104 Register Data0 = MI.getOperand(4).getReg();
2105 Register Data1 = MI.getOperand(5).getReg();
2106 unsigned Offset = MI.getOperand(6).getImm();
2107
2108 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2109 .addDef(Dst1)
2110 .addUse(Addr)
2111 .addUse(Data0)
2112 .addUse(Data1)
2113 .addImm(Offset)
2114 .cloneMemRefs(MI);
2115
2116 MI.eraseFromParent();
2117 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2118 }
2119
2120 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2121 MachineInstr &I) const {
2122 unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2123 switch (IntrinsicID) {
2124 case Intrinsic::amdgcn_end_cf:
2125 return selectEndCfIntrinsic(I);
2126 case Intrinsic::amdgcn_ds_ordered_add:
2127 case Intrinsic::amdgcn_ds_ordered_swap:
2128 return selectDSOrderedIntrinsic(I, IntrinsicID);
2129 case Intrinsic::amdgcn_ds_gws_init:
2130 case Intrinsic::amdgcn_ds_gws_barrier:
2131 case Intrinsic::amdgcn_ds_gws_sema_v:
2132 case Intrinsic::amdgcn_ds_gws_sema_br:
2133 case Intrinsic::amdgcn_ds_gws_sema_p:
2134 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2135 return selectDSGWSIntrinsic(I, IntrinsicID);
2136 case Intrinsic::amdgcn_ds_append:
2137 return selectDSAppendConsume(I, true);
2138 case Intrinsic::amdgcn_ds_consume:
2139 return selectDSAppendConsume(I, false);
2140 case Intrinsic::amdgcn_s_barrier:
2141 return selectSBarrier(I);
2142 case Intrinsic::amdgcn_raw_buffer_load_lds:
2143 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2144 case Intrinsic::amdgcn_struct_buffer_load_lds:
2145 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2146 return selectBufferLoadLds(I);
2147 case Intrinsic::amdgcn_global_load_lds:
2148 return selectGlobalLoadLds(I);
2149 case Intrinsic::amdgcn_exp_compr:
2150 if (!STI.hasCompressedExport()) {
2151 Function &F = I.getMF()->getFunction();
2152 DiagnosticInfoUnsupported NoFpRet(
2153 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2154 F.getContext().diagnose(NoFpRet);
2155 return false;
2156 }
2157 break;
2158 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2159 return selectDSBvhStackIntrinsic(I);
2160 case Intrinsic::amdgcn_s_barrier_init:
2161 case Intrinsic::amdgcn_s_barrier_join:
2162 case Intrinsic::amdgcn_s_wakeup_barrier:
2163 case Intrinsic::amdgcn_s_get_barrier_state:
2164 return selectNamedBarrierInst(I, IntrinsicID);
2165 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2166 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2167 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2168 case Intrinsic::amdgcn_s_barrier_leave:
2169 return selectSBarrierLeave(I);
2170 }
2171 return selectImpl(I, *CoverageInfo);
2172 }
2173
2174 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2175 if (selectImpl(I, *CoverageInfo))
2176 return true;
2177
2178 MachineBasicBlock *BB = I.getParent();
2179 const DebugLoc &DL = I.getDebugLoc();
2180
2181 Register DstReg = I.getOperand(0).getReg();
2182 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2183 assert(Size <= 32 || Size == 64);
2184 const MachineOperand &CCOp = I.getOperand(1);
2185 Register CCReg = CCOp.getReg();
2186 if (!isVCC(CCReg, *MRI)) {
2187 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2188 AMDGPU::S_CSELECT_B32;
2189 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2190 .addReg(CCReg);
2191
2192 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2193 // register bank, because it does not cover the register class we use to
2194 // represent it, so we need to set the register class manually here.
2195 if (!MRI->getRegClassOrNull(CCReg))
2196 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2197 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2198 .add(I.getOperand(2))
2199 .add(I.getOperand(3));
2200
2201 bool Ret = false;
2202 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2203 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2204 I.eraseFromParent();
2205 return Ret;
2206 }
2207
2208 // Wide VGPR select should have been split in RegBankSelect.
2209 if (Size > 32)
2210 return false;
2211
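// VCC condition: use V_CNDMASK_B32, which picks between the two source
// operands per lane based on the condition mask.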
2212 MachineInstr *Select =
2213 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2214 .addImm(0)
2215 .add(I.getOperand(3))
2216 .addImm(0)
2217 .add(I.getOperand(2))
2218 .add(I.getOperand(1));
2219
2220 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2221 I.eraseFromParent();
2222 return Ret;
2223 }
2224
2225 static int sizeToSubRegIndex(unsigned Size) {
2226 switch (Size) {
2227 case 32:
2228 return AMDGPU::sub0;
2229 case 64:
2230 return AMDGPU::sub0_sub1;
2231 case 96:
2232 return AMDGPU::sub0_sub1_sub2;
2233 case 128:
2234 return AMDGPU::sub0_sub1_sub2_sub3;
2235 case 256:
2236 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2237 default:
2238 if (Size < 32)
2239 return AMDGPU::sub0;
2240 if (Size > 256)
2241 return -1;
2242 return sizeToSubRegIndex(llvm::bit_ceil(Size));
2243 }
2244 }
2245
2246 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2247 Register DstReg = I.getOperand(0).getReg();
2248 Register SrcReg = I.getOperand(1).getReg();
2249 const LLT DstTy = MRI->getType(DstReg);
2250 const LLT SrcTy = MRI->getType(SrcReg);
2251 const LLT S1 = LLT::scalar(1);
2252
2253 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2254 const RegisterBank *DstRB;
2255 if (DstTy == S1) {
2256 // This is a special case. We don't treat s1 for legalization artifacts as
2257 // vcc booleans.
2258 DstRB = SrcRB;
2259 } else {
2260 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2261 if (SrcRB != DstRB)
2262 return false;
2263 }
2264
2265 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2266
2267 unsigned DstSize = DstTy.getSizeInBits();
2268 unsigned SrcSize = SrcTy.getSizeInBits();
2269
2270 const TargetRegisterClass *SrcRC =
2271 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2272 const TargetRegisterClass *DstRC =
2273 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2274 if (!SrcRC || !DstRC)
2275 return false;
2276
2277 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2278 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2279 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2280 return false;
2281 }
2282
2283 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2284 MachineBasicBlock *MBB = I.getParent();
2285 const DebugLoc &DL = I.getDebugLoc();
2286
2287 Register LoReg = MRI->createVirtualRegister(DstRC);
2288 Register HiReg = MRI->createVirtualRegister(DstRC);
2289 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2290 .addReg(SrcReg, 0, AMDGPU::sub0);
2291 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2292 .addReg(SrcReg, 0, AMDGPU::sub1);
2293
2294 if (IsVALU && STI.hasSDWA()) {
2295 // Write the low 16-bits of the high element into the high 16-bits of the
2296 // low element.
2297 MachineInstr *MovSDWA =
2298 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2299 .addImm(0) // $src0_modifiers
2300 .addReg(HiReg) // $src0
2301 .addImm(0) // $clamp
2302 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2303 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2304 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2305 .addReg(LoReg, RegState::Implicit);
2306 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2307 } else {
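// Without SDWA, build the packed value manually as (Hi << 16) | (Lo & 0xffff).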
2308 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2309 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2310 Register ImmReg = MRI->createVirtualRegister(DstRC);
2311 if (IsVALU) {
2312 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2313 .addImm(16)
2314 .addReg(HiReg);
2315 } else {
2316 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2317 .addReg(HiReg)
2318 .addImm(16)
2319 .setOperandDead(3); // Dead scc
2320 }
2321
2322 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2323 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2324 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2325
2326 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2327 .addImm(0xffff);
2328 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2329 .addReg(LoReg)
2330 .addReg(ImmReg);
2331 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2332 .addReg(TmpReg0)
2333 .addReg(TmpReg1);
2334
2335 if (!IsVALU) {
2336 And.setOperandDead(3); // Dead scc
2337 Or.setOperandDead(3); // Dead scc
2338 }
2339 }
2340
2341 I.eraseFromParent();
2342 return true;
2343 }
2344
2345 if (!DstTy.isScalar())
2346 return false;
2347
2348 if (SrcSize > 32) {
2349 int SubRegIdx = sizeToSubRegIndex(DstSize);
2350 if (SubRegIdx == -1)
2351 return false;
2352
2353 // Deal with weird cases where the class only partially supports the subreg
2354 // index.
2355 const TargetRegisterClass *SrcWithSubRC
2356 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2357 if (!SrcWithSubRC)
2358 return false;
2359
2360 if (SrcWithSubRC != SrcRC) {
2361 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2362 return false;
2363 }
2364
2365 I.getOperand(1).setSubReg(SubRegIdx);
2366 }
2367
2368 I.setDesc(TII.get(TargetOpcode::COPY));
2369 return true;
2370 }
2371
2372 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
2373 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2374 Mask = maskTrailingOnes<unsigned>(Size);
2375 int SignedMask = static_cast<int>(Mask);
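// AMDGPU inline immediates cover signed integers in [-16, 64], so any mask in
// that range avoids a literal constant.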
2376 return SignedMask >= -16 && SignedMask <= 64;
2377 }
2378
2379 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2380 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2381 Register Reg, const MachineRegisterInfo &MRI,
2382 const TargetRegisterInfo &TRI) const {
2383 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2384 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2385 return RB;
2386
2387 // Ignore the type, since we don't use vcc in artifacts.
2388 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2389 return &RBI.getRegBankFromRegClass(*RC, LLT());
2390 return nullptr;
2391 }
2392
2393 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2394 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2395 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2396 const DebugLoc &DL = I.getDebugLoc();
2397 MachineBasicBlock &MBB = *I.getParent();
2398 const Register DstReg = I.getOperand(0).getReg();
2399 const Register SrcReg = I.getOperand(1).getReg();
2400
2401 const LLT DstTy = MRI->getType(DstReg);
2402 const LLT SrcTy = MRI->getType(SrcReg);
2403 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2404 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2405 const unsigned DstSize = DstTy.getSizeInBits();
2406 if (!DstTy.isScalar())
2407 return false;
2408
2409 // Artifact casts should never use vcc.
2410 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2411
2412 // FIXME: This should probably be illegal and split earlier.
2413 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2414 if (DstSize <= 32)
2415 return selectCOPY(I);
2416
2417 const TargetRegisterClass *SrcRC =
2418 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2419 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2420 const TargetRegisterClass *DstRC =
2421 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2422
2423 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2424 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2425 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2426 .addReg(SrcReg)
2427 .addImm(AMDGPU::sub0)
2428 .addReg(UndefReg)
2429 .addImm(AMDGPU::sub1);
2430 I.eraseFromParent();
2431
2432 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2433 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2434 }
2435
2436 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2437 // 64-bit should have been split up in RegBankSelect
2438
2439 // Try to use an and with a mask if it will save code size.
2440 unsigned Mask;
2441 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2442 MachineInstr *ExtI =
2443 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2444 .addImm(Mask)
2445 .addReg(SrcReg);
2446 I.eraseFromParent();
2447 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2448 }
2449
2450 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2451 MachineInstr *ExtI =
2452 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2453 .addReg(SrcReg)
2454 .addImm(0) // Offset
2455 .addImm(SrcSize); // Width
2456 I.eraseFromParent();
2457 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2458 }
2459
2460 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2461 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2462 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2463 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2464 return false;
2465
2466 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2467 const unsigned SextOpc = SrcSize == 8 ?
2468 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2469 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2470 .addReg(SrcReg);
2471 I.eraseFromParent();
2472 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2473 }
2474
2475 // Using a single 32-bit SALU to calculate the high half is smaller than
2476 // S_BFE with a literal constant operand.
2477 if (DstSize > 32 && SrcSize == 32) {
2478 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2479 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2480 if (Signed) {
2481 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2482 .addReg(SrcReg, 0, SubReg)
2483 .addImm(31)
2484 .setOperandDead(3); // Dead scc
2485 } else {
2486 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2487 .addImm(0);
2488 }
2489 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2490 .addReg(SrcReg, 0, SubReg)
2491 .addImm(AMDGPU::sub0)
2492 .addReg(HiReg)
2493 .addImm(AMDGPU::sub1);
2494 I.eraseFromParent();
2495 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2496 *MRI);
2497 }
2498
2499 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2500 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2501
2502 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2503 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2504 // We need a 64-bit register source, but the high bits don't matter.
2505 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2506 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2507 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2508
2509 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2510 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2511 .addReg(SrcReg, 0, SubReg)
2512 .addImm(AMDGPU::sub0)
2513 .addReg(UndefReg)
2514 .addImm(AMDGPU::sub1);
2515
2516 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2517 .addReg(ExtReg)
2518 .addImm(SrcSize << 16);
2519
2520 I.eraseFromParent();
2521 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2522 }
2523
2524 unsigned Mask;
2525 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2526 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2527 .addReg(SrcReg)
2528 .addImm(Mask)
2529 .setOperandDead(3); // Dead scc
2530 } else {
2531 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2532 .addReg(SrcReg)
2533 .addImm(SrcSize << 16);
2534 }
2535
2536 I.eraseFromParent();
2537 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2538 }
2539
2540 return false;
2541 }
2542
2543 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2544 Register &Out) {
2545 Register LShlSrc;
2546 if (mi_match(In, MRI,
2547 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2548 Out = LShlSrc;
2549 return true;
2550 }
2551 return false;
2552 }
2553
2554 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2555 if (!Subtarget->hasSALUFloatInsts())
2556 return false;
2557
2558 Register Dst = I.getOperand(0).getReg();
2559 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2560 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2561 return false;
2562
2563 Register Src = I.getOperand(1).getReg();
2564
2565 if (MRI->getType(Dst) == LLT::scalar(32) &&
2566 MRI->getType(Src) == LLT::scalar(16)) {
2567 if (isExtractHiElt(*MRI, Src, Src)) {
2568 MachineBasicBlock *BB = I.getParent();
2569 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2570 .addUse(Src);
2571 I.eraseFromParent();
2572 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2573 }
2574 }
2575
2576 return false;
2577 }
2578
2579 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2580 MachineBasicBlock *BB = I.getParent();
2581 MachineOperand &ImmOp = I.getOperand(1);
2582 Register DstReg = I.getOperand(0).getReg();
2583 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2584 bool IsFP = false;
2585
2586 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2587 if (ImmOp.isFPImm()) {
2588 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2589 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2590 IsFP = true;
2591 } else if (ImmOp.isCImm()) {
2592 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2593 } else {
2594 llvm_unreachable("Not supported by g_constants");
2595 }
2596
2597 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2598 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2599
2600 unsigned Opcode;
2601 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2602 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2603 } else if (Size == 64 &&
2604 AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2605 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2606 I.setDesc(TII.get(Opcode));
2607 I.addImplicitDefUseOperands(*MF);
2608 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2609 } else {
2610 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2611
2612 // We should never produce s1 values on banks other than VCC. If the user of
2613 // this already constrained the register, we may incorrectly think it's VCC
2614 // if it wasn't originally.
2615 if (Size == 1)
2616 return false;
2617 }
2618
2619 if (Size != 64) {
2620 I.setDesc(TII.get(Opcode));
2621 I.addImplicitDefUseOperands(*MF);
2622 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2623 }
2624
2625 const DebugLoc &DL = I.getDebugLoc();
2626
2627 APInt Imm(Size, I.getOperand(1).getImm());
2628
2629 MachineInstr *ResInst;
2630 if (IsSgpr && TII.isInlineConstant(Imm)) {
2631 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2632 .addImm(I.getOperand(1).getImm());
2633 } else {
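// Not inlinable (or a VALU destination): materialize the constant in two
// 32-bit halves and recombine them with a REG_SEQUENCE.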
2634 const TargetRegisterClass *RC = IsSgpr ?
2635 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2636 Register LoReg = MRI->createVirtualRegister(RC);
2637 Register HiReg = MRI->createVirtualRegister(RC);
2638
2639 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2640 .addImm(Imm.trunc(32).getZExtValue());
2641
2642 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2643 .addImm(Imm.ashr(32).getZExtValue());
2644
2645 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2646 .addReg(LoReg)
2647 .addImm(AMDGPU::sub0)
2648 .addReg(HiReg)
2649 .addImm(AMDGPU::sub1);
2650 }
2651
2652 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2653 // work for target-independent opcodes.
2654 I.eraseFromParent();
2655 const TargetRegisterClass *DstRC =
2656 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2657 if (!DstRC)
2658 return true;
2659 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2660 }
2661
2662 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2663 // Only manually handle the f64 SGPR case.
2664 //
2665 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2666 // the bit ops theoretically have a second result due to the implicit def of
2667 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2668 // that is easy by disabling the check. The result works, but uses a
2669 // nonsensical sreg32orlds_and_sreg_1 regclass.
2670 //
2671 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2672 // the variadic REG_SEQUENCE operands.
2673
2674 Register Dst = MI.getOperand(0).getReg();
2675 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2676 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2677 MRI->getType(Dst) != LLT::scalar(64))
2678 return false;
2679
2680 Register Src = MI.getOperand(1).getReg();
2681 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2682 if (Fabs)
2683 Src = Fabs->getOperand(1).getReg();
2684
2685 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2686 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2687 return false;
2688
2689 MachineBasicBlock *BB = MI.getParent();
2690 const DebugLoc &DL = MI.getDebugLoc();
2691 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2692 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2693 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2694 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2695
2696 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2697 .addReg(Src, 0, AMDGPU::sub0);
2698 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2699 .addReg(Src, 0, AMDGPU::sub1);
2700 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2701 .addImm(0x80000000);
2702
2703 // Set or toggle sign bit.
2704 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2705 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2706 .addReg(HiReg)
2707 .addReg(ConstReg)
2708 .setOperandDead(3); // Dead scc
2709 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2710 .addReg(LoReg)
2711 .addImm(AMDGPU::sub0)
2712 .addReg(OpReg)
2713 .addImm(AMDGPU::sub1);
2714 MI.eraseFromParent();
2715 return true;
2716 }
2717
2718 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2719 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2720 Register Dst = MI.getOperand(0).getReg();
2721 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2722 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2723 MRI->getType(Dst) != LLT::scalar(64))
2724 return false;
2725
2726 Register Src = MI.getOperand(1).getReg();
2727 MachineBasicBlock *BB = MI.getParent();
2728 const DebugLoc &DL = MI.getDebugLoc();
2729 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2730 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2731 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2732 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2733
2734 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2735 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2736 return false;
2737
2738 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2739 .addReg(Src, 0, AMDGPU::sub0);
2740 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2741 .addReg(Src, 0, AMDGPU::sub1);
2742 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2743 .addImm(0x7fffffff);
2744
2745 // Clear sign bit.
2746 // TODO: Should this use S_BITSET0_*?
2747 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2748 .addReg(HiReg)
2749 .addReg(ConstReg)
2750 .setOperandDead(3); // Dead scc
2751 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2752 .addReg(LoReg)
2753 .addImm(AMDGPU::sub0)
2754 .addReg(OpReg)
2755 .addImm(AMDGPU::sub1);
2756
2757 MI.eraseFromParent();
2758 return true;
2759 }
2760
2761 static bool isConstant(const MachineInstr &MI) {
2762 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2763 }
2764
2765 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2766 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2767
2768 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2769 const MachineInstr *PtrMI =
2770 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2771
2772 assert(PtrMI);
2773
2774 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2775 return;
2776
2777 GEPInfo GEPInfo;
2778
2779 for (unsigned i = 1; i != 3; ++i) {
2780 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2781 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2782 assert(OpDef);
2783 if (i == 2 && isConstant(*OpDef)) {
2784 // TODO: Could handle constant base + variable offset, but a combine
2785 // probably should have commuted it.
2786 assert(GEPInfo.Imm == 0);
2787 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2788 continue;
2789 }
2790 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2791 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2792 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2793 else
2794 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2795 }
2796
2797 AddrInfo.push_back(GEPInfo);
2798 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2799 }
2800
2801 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2802 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2803 }
2804
2805 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2806 if (!MI.hasOneMemOperand())
2807 return false;
2808
2809 const MachineMemOperand *MMO = *MI.memoperands_begin();
2810 const Value *Ptr = MMO->getValue();
2811
2812 // UndefValue means this is a load of a kernel input. These are uniform.
2813 // Sometimes LDS instructions have constant pointers.
2814 // If Ptr is null, then that means this mem operand contains a
2815 // PseudoSourceValue like GOT.
2816 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2817 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2818 return true;
2819
2820 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2821 return true;
2822
2823 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2824 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2825 AMDGPU::SGPRRegBankID;
2826
2827 const Instruction *I = dyn_cast<Instruction>(Ptr);
2828 return I && I->getMetadata("amdgpu.uniform");
2829 }
2830
2831 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2832 for (const GEPInfo &GEPInfo : AddrInfo) {
2833 if (!GEPInfo.VgprParts.empty())
2834 return true;
2835 }
2836 return false;
2837 }
2838
2839 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2840 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2841 unsigned AS = PtrTy.getAddressSpace();
2842 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2843 STI.ldsRequiresM0Init()) {
2844 MachineBasicBlock *BB = I.getParent();
2845
2846 // If DS instructions require M0 initialization, insert it before selecting.
2847 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2848 .addImm(-1);
2849 }
2850 }
2851
2852 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2853 MachineInstr &I) const {
2854 initM0(I);
2855 return selectImpl(I, *CoverageInfo);
2856 }
2857
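// Return true if Reg is defined, possibly through copies and bitwise ops, only
// by VALU compares (or llvm.amdgcn.class), so the value does not need an extra
// AND with exec before being used as a branch condition.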
2858 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2859 if (Reg.isPhysical())
2860 return false;
2861
2862 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2863 const unsigned Opcode = MI.getOpcode();
2864
2865 if (Opcode == AMDGPU::COPY)
2866 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2867
2868 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2869 Opcode == AMDGPU::G_XOR)
2870 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2871 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2872
2873 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2874 return GI->is(Intrinsic::amdgcn_class);
2875
2876 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2877 }
2878
2879 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2880 MachineBasicBlock *BB = I.getParent();
2881 MachineOperand &CondOp = I.getOperand(0);
2882 Register CondReg = CondOp.getReg();
2883 const DebugLoc &DL = I.getDebugLoc();
2884
2885 unsigned BrOpcode;
2886 Register CondPhysReg;
2887 const TargetRegisterClass *ConstrainRC;
2888
2889 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2890 // whether the branch is uniform when selecting the instruction. In
2891 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2892 // RegBankSelect knows what it's doing if the branch condition is scc, even
2893 // though it currently does not.
2894 if (!isVCC(CondReg, *MRI)) {
2895 if (MRI->getType(CondReg) != LLT::scalar(32))
2896 return false;
2897
2898 CondPhysReg = AMDGPU::SCC;
2899 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2900 ConstrainRC = &AMDGPU::SReg_32RegClass;
2901 } else {
2902 // FIXME: Should scc->vcc copies be ANDed with exec?
2903
2904 // Unless the value of CondReg is a result of a V_CMP* instruction, we need
2905 // to insert an AND with exec.
2906 if (!isVCmpResult(CondReg, *MRI)) {
2907 const bool Is64 = STI.isWave64();
2908 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2909 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2910
2911 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2912 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2913 .addReg(CondReg)
2914 .addReg(Exec)
2915 .setOperandDead(3); // Dead scc
2916 CondReg = TmpReg;
2917 }
2918
2919 CondPhysReg = TRI.getVCC();
2920 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2921 ConstrainRC = TRI.getBoolRC();
2922 }
2923
2924 if (!MRI->getRegClassOrNull(CondReg))
2925 MRI->setRegClass(CondReg, ConstrainRC);
2926
2927 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2928 .addReg(CondReg);
2929 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2930 .addMBB(I.getOperand(1).getMBB());
2931
2932 I.eraseFromParent();
2933 return true;
2934 }
2935
2936 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2937 MachineInstr &I) const {
2938 Register DstReg = I.getOperand(0).getReg();
2939 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2940 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2941 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2942 if (IsVGPR)
2943 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2944
2945 return RBI.constrainGenericRegister(
2946 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2947 }
2948
2949 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2950 Register DstReg = I.getOperand(0).getReg();
2951 Register SrcReg = I.getOperand(1).getReg();
2952 Register MaskReg = I.getOperand(2).getReg();
2953 LLT Ty = MRI->getType(DstReg);
2954 LLT MaskTy = MRI->getType(MaskReg);
2955 MachineBasicBlock *BB = I.getParent();
2956 const DebugLoc &DL = I.getDebugLoc();
2957
2958 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2959 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2960 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2961 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2962 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2963 return false;
2964
2965 // Try to avoid emitting a bit operation when we only need to touch half of
2966 // the 64-bit pointer.
2967 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2968 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2969 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2970
2971 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2972 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2973
2974 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2975 !CanCopyLow32 && !CanCopyHi32) {
2976 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2977 .addReg(SrcReg)
2978 .addReg(MaskReg)
2979 .setOperandDead(3); // Dead scc
2980 I.eraseFromParent();
2981 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2982 }
2983
2984 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2985 const TargetRegisterClass &RegRC
2986 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2987
2988 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2989 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2990 const TargetRegisterClass *MaskRC =
2991 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2992
2993 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2994 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2995 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2996 return false;
2997
2998 if (Ty.getSizeInBits() == 32) {
2999 assert(MaskTy.getSizeInBits() == 32 &&
3000 "ptrmask should have been narrowed during legalize");
3001
3002 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3003 .addReg(SrcReg)
3004 .addReg(MaskReg);
3005
3006 if (!IsVGPR)
3007 NewOp.setOperandDead(3); // Dead scc
3008 I.eraseFromParent();
3009 return true;
3010 }
3011
3012 Register HiReg = MRI->createVirtualRegister(&RegRC);
3013 Register LoReg = MRI->createVirtualRegister(&RegRC);
3014
3015 // Extract the subregisters from the source pointer.
3016 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3017 .addReg(SrcReg, 0, AMDGPU::sub0);
3018 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3019 .addReg(SrcReg, 0, AMDGPU::sub1);
3020
3021 Register MaskedLo, MaskedHi;
3022
3023 if (CanCopyLow32) {
3024 // If all the bits in the low half are 1, we only need a copy for it.
3025 MaskedLo = LoReg;
3026 } else {
3027 // Extract the mask subregister and apply the and.
3028 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3029 MaskedLo = MRI->createVirtualRegister(&RegRC);
3030
3031 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3032 .addReg(MaskReg, 0, AMDGPU::sub0);
3033 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3034 .addReg(LoReg)
3035 .addReg(MaskLo);
3036 }
3037
3038 if (CanCopyHi32) {
3039 // If all the bits in the high half are 1, we only need a copy for it.
3040 MaskedHi = HiReg;
3041 } else {
3042 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3043 MaskedHi = MRI->createVirtualRegister(&RegRC);
3044
3045 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3046 .addReg(MaskReg, 0, AMDGPU::sub1);
3047 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3048 .addReg(HiReg)
3049 .addReg(MaskHi);
3050 }
3051
3052 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3053 .addReg(MaskedLo)
3054 .addImm(AMDGPU::sub0)
3055 .addReg(MaskedHi)
3056 .addImm(AMDGPU::sub1);
3057 I.eraseFromParent();
3058 return true;
3059 }
3060
3061 /// Return the register to use for the index value, and the subregister to use
3062 /// for the indirectly accessed register.
3063 static std::pair<Register, unsigned>
3064 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3065 const TargetRegisterClass *SuperRC, Register IdxReg,
3066 unsigned EltSize, GISelKnownBits &KnownBits) {
3067 Register IdxBaseReg;
3068 int Offset;
3069
3070 std::tie(IdxBaseReg, Offset) =
3071 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3072 if (IdxBaseReg == AMDGPU::NoRegister) {
3073 // This will happen if the index is a known constant. This should ordinarily
3074 // be legalized out, but handle it as a register just in case.
3075 assert(Offset == 0);
3076 IdxBaseReg = IdxReg;
3077 }
3078
3079 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3080
3081 // Skip out of bounds offsets, or else we would end up using an undefined
3082 // register.
3083 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3084 return std::pair(IdxReg, SubRegs[0]);
3085 return std::pair(IdxBaseReg, SubRegs[Offset]);
3086 }
3087
3088 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3089 MachineInstr &MI) const {
3090 Register DstReg = MI.getOperand(0).getReg();
3091 Register SrcReg = MI.getOperand(1).getReg();
3092 Register IdxReg = MI.getOperand(2).getReg();
3093
3094 LLT DstTy = MRI->getType(DstReg);
3095 LLT SrcTy = MRI->getType(SrcReg);
3096
3097 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3098 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3099 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3100
3101   // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3102 // into a waterfall loop.
3103 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3104 return false;
3105
3106 const TargetRegisterClass *SrcRC =
3107 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3108 const TargetRegisterClass *DstRC =
3109 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3110 if (!SrcRC || !DstRC)
3111 return false;
3112 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3113 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3114 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3115 return false;
3116
3117 MachineBasicBlock *BB = MI.getParent();
3118 const DebugLoc &DL = MI.getDebugLoc();
3119 const bool Is64 = DstTy.getSizeInBits() == 64;
3120
3121 unsigned SubReg;
3122 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3123 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3124
3125 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3126 if (DstTy.getSizeInBits() != 32 && !Is64)
3127 return false;
3128
3129 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3130 .addReg(IdxReg);
3131
3132 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3133 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3134 .addReg(SrcReg, 0, SubReg)
3135 .addReg(SrcReg, RegState::Implicit);
3136 MI.eraseFromParent();
3137 return true;
3138 }
3139
3140 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3141 return false;
3142
3143 if (!STI.useVGPRIndexMode()) {
3144 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3145 .addReg(IdxReg);
3146 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3147 .addReg(SrcReg, 0, SubReg)
3148 .addReg(SrcReg, RegState::Implicit);
3149 MI.eraseFromParent();
3150 return true;
3151 }
3152
3153 const MCInstrDesc &GPRIDXDesc =
3154 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3155 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3156 .addReg(SrcReg)
3157 .addReg(IdxReg)
3158 .addImm(SubReg);
3159
3160 MI.eraseFromParent();
3161 return true;
3162 }
3163
3164 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3165 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3166 MachineInstr &MI) const {
3167 Register DstReg = MI.getOperand(0).getReg();
3168 Register VecReg = MI.getOperand(1).getReg();
3169 Register ValReg = MI.getOperand(2).getReg();
3170 Register IdxReg = MI.getOperand(3).getReg();
3171
3172 LLT VecTy = MRI->getType(DstReg);
3173 LLT ValTy = MRI->getType(ValReg);
3174 unsigned VecSize = VecTy.getSizeInBits();
3175 unsigned ValSize = ValTy.getSizeInBits();
3176
3177 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3178 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3179 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3180
3181 assert(VecTy.getElementType() == ValTy);
3182
3183   // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3184 // into a waterfall loop.
3185 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3186 return false;
3187
3188 const TargetRegisterClass *VecRC =
3189 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3190 const TargetRegisterClass *ValRC =
3191 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3192
3193 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3194 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3195 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3196 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3197 return false;
3198
3199 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3200 return false;
3201
3202 unsigned SubReg;
3203 std::tie(IdxReg, SubReg) =
3204 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3205
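  // On subtargets with VGPR index mode we can use the GPR_IDX pseudos for a
  // VGPR vector; otherwise the index goes through M0 with a movrel write.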
3206 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3207 STI.useVGPRIndexMode();
3208
3209 MachineBasicBlock *BB = MI.getParent();
3210 const DebugLoc &DL = MI.getDebugLoc();
3211
3212 if (!IndexMode) {
3213 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3214 .addReg(IdxReg);
3215
3216 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3217 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3218 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3219 .addReg(VecReg)
3220 .addReg(ValReg)
3221 .addImm(SubReg);
3222 MI.eraseFromParent();
3223 return true;
3224 }
3225
3226 const MCInstrDesc &GPRIDXDesc =
3227 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3228 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3229 .addReg(VecReg)
3230 .addReg(ValReg)
3231 .addReg(IdxReg)
3232 .addImm(SubReg);
3233
3234 MI.eraseFromParent();
3235 return true;
3236 }
3237
3238 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3239 assert(!AMDGPU::isGFX12Plus(STI));
3240 unsigned Opc;
3241 unsigned Size = MI.getOperand(3).getImm();
3242
3243 // The struct intrinsic variants add one additional operand over raw.
3244 const bool HasVIndex = MI.getNumOperands() == 9;
3245 Register VIndex;
3246 int OpOffset = 0;
3247 if (HasVIndex) {
3248 VIndex = MI.getOperand(4).getReg();
3249 OpOffset = 1;
3250 }
3251
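  // A voffset that is known to be zero can be dropped entirely, which lets us
  // pick the OFFSET/IDXEN forms below instead of OFFEN/BOTHEN.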
3252 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3253 std::optional<ValueAndVReg> MaybeVOffset =
3254 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3255 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3256
3257 switch (Size) {
3258 default:
3259 return false;
3260 case 1:
3261 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3262 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3263 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3264 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3265 break;
3266 case 2:
3267 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3268 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3269 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3270 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3271 break;
3272 case 4:
3273 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3274 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3275 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3276 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3277 break;
3278 }
3279
3280 MachineBasicBlock *MBB = MI.getParent();
3281 const DebugLoc &DL = MI.getDebugLoc();
3282 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3283 .add(MI.getOperand(2));
3284
3285 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3286
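  // The BOTHEN forms take a 64-bit vaddr built as (vindex, voffset); the other
  // forms take a single 32-bit register or no vaddr operand at all.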
3287 if (HasVIndex && HasVOffset) {
3288 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3289 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3290 .addReg(VIndex)
3291 .addImm(AMDGPU::sub0)
3292 .addReg(VOffset)
3293 .addImm(AMDGPU::sub1);
3294
3295 MIB.addReg(IdxReg);
3296 } else if (HasVIndex) {
3297 MIB.addReg(VIndex);
3298 } else if (HasVOffset) {
3299 MIB.addReg(VOffset);
3300 }
3301
3302 MIB.add(MI.getOperand(1)); // rsrc
3303 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3304 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3305 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3306 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3307 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3308
3309 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3310 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3311 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3312 MachinePointerInfo StorePtrI = LoadPtrI;
3313 StorePtrI.V = nullptr;
3314 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3315
3316 auto F = LoadMMO->getFlags() &
3317 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3318 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3319 Size, LoadMMO->getBaseAlign());
3320
3321 MachineMemOperand *StoreMMO =
3322 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3323 sizeof(int32_t), LoadMMO->getBaseAlign());
3324
3325 MIB.setMemRefs({LoadMMO, StoreMMO});
3326
3327 MI.eraseFromParent();
3328 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3329 }
3330
3331 /// Match a zero extend from a 32-bit value to 64-bits.
3332 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3333 Register ZExtSrc;
3334 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3335 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3336
3337 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3338 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3339 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3340 return Register();
3341
3342 assert(Def->getNumOperands() == 3 &&
3343 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3344 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3345 return Def->getOperand(1).getReg();
3346 }
3347
3348 return Register();
3349 }
3350
3351 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3352 unsigned Opc;
3353 unsigned Size = MI.getOperand(3).getImm();
3354
3355 switch (Size) {
3356 default:
3357 return false;
3358 case 1:
3359 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3360 break;
3361 case 2:
3362 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3363 break;
3364 case 4:
3365 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3366 break;
3367 }
3368
3369 MachineBasicBlock *MBB = MI.getParent();
3370 const DebugLoc &DL = MI.getDebugLoc();
3371 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3372 .add(MI.getOperand(2));
3373
3374 Register Addr = MI.getOperand(1).getReg();
3375 Register VOffset;
3376 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3377 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3378 if (!isSGPR(Addr)) {
3379 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3380 if (isSGPR(AddrDef->Reg)) {
3381 Addr = AddrDef->Reg;
3382 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3383 Register SAddr =
3384 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3385 if (isSGPR(SAddr)) {
3386 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3387 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3388 Addr = SAddr;
3389 VOffset = Off;
3390 }
3391 }
3392 }
3393 }
3394
3395 if (isSGPR(Addr)) {
3396 Opc = AMDGPU::getGlobalSaddrOp(Opc);
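    // The SADDR form still expects a VGPR offset operand, so materialize a
    // zero VGPR below if we did not manage to split one off the address.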
3397 if (!VOffset) {
3398 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3399 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3400 .addImm(0);
3401 }
3402 }
3403
3404 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3405 .addReg(Addr);
3406
3407 if (isSGPR(Addr))
3408 MIB.addReg(VOffset);
3409
3410 MIB.add(MI.getOperand(4)) // offset
3411 .add(MI.getOperand(5)); // cpol
3412
3413 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3414 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3415 LoadPtrI.Offset = MI.getOperand(4).getImm();
3416 MachinePointerInfo StorePtrI = LoadPtrI;
3417 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3418 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3419 auto F = LoadMMO->getFlags() &
3420 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3421 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3422 Size, LoadMMO->getBaseAlign());
3423 MachineMemOperand *StoreMMO =
3424 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3425 sizeof(int32_t), Align(4));
3426
3427 MIB.setMemRefs({LoadMMO, StoreMMO});
3428
3429 MI.eraseFromParent();
3430 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3431 }
3432
3433 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
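  // Operand 1 carries the target opcode as an immediate (chosen when this
  // pseudo was formed), so rewrite the instruction in place and drop it.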
3434 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3435 MI.removeOperand(1);
3436 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3437 return true;
3438 }
3439
3440 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3441 unsigned Opc;
3442 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3443 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3444 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3445 break;
3446 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3447 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3448 break;
3449 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3450 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3451 break;
3452 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3453 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3454 break;
3455 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3456 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3457 break;
3458 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3459 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3460 break;
3461 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3462 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3463 break;
3464 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3465 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3466 break;
3467 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3468 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3469 break;
3470 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3471 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3472 break;
3473 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3474 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3475 break;
3476 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3477 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3478 break;
3479 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3480 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3481 break;
3482 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3483 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3484 break;
3485 default:
3486 llvm_unreachable("unhandled smfmac intrinsic");
3487 }
3488
3489 auto VDst_In = MI.getOperand(4);
3490
3491 MI.setDesc(TII.get(Opc));
3492 MI.removeOperand(4); // VDst_In
3493 MI.removeOperand(1); // Intrinsic ID
3494 MI.addOperand(VDst_In); // Readd VDst_In to the end
3495 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3496 return true;
3497 }
3498
3499 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3500 Register DstReg = MI.getOperand(0).getReg();
3501 Register SrcReg = MI.getOperand(1).getReg();
3502 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3503 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3504 MachineBasicBlock *MBB = MI.getParent();
3505 const DebugLoc &DL = MI.getDebugLoc();
3506
3507 if (IsVALU) {
3508 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3509 .addImm(Subtarget->getWavefrontSizeLog2())
3510 .addReg(SrcReg);
3511 } else {
3512 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3513 .addReg(SrcReg)
3514 .addImm(Subtarget->getWavefrontSizeLog2())
3515 .setOperandDead(3); // Dead scc
3516 }
3517
3518 const TargetRegisterClass &RC =
3519 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3520 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3521 return false;
3522
3523 MI.eraseFromParent();
3524 return true;
3525 }
3526
3527 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3528 Register SrcReg = MI.getOperand(0).getReg();
3529 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3530 return false;
3531
3532 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3533 Register SP =
3534 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3535 Register WaveAddr = getWaveAddress(DefMI);
3536 MachineBasicBlock *MBB = MI.getParent();
3537 const DebugLoc &DL = MI.getDebugLoc();
3538
3539 if (!WaveAddr) {
3540 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3541 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3542 .addReg(SrcReg)
3543 .addImm(Subtarget->getWavefrontSizeLog2())
3544 .setOperandDead(3); // Dead scc
3545 }
3546
3547 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3548 .addReg(WaveAddr);
3549
3550 MI.eraseFromParent();
3551 return true;
3552 }
3553
3554 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3555 if (I.isPHI())
3556 return selectPHI(I);
3557
3558 if (!I.isPreISelOpcode()) {
3559 if (I.isCopy())
3560 return selectCOPY(I);
3561 return true;
3562 }
3563
3564 switch (I.getOpcode()) {
3565 case TargetOpcode::G_AND:
3566 case TargetOpcode::G_OR:
3567 case TargetOpcode::G_XOR:
3568 if (selectImpl(I, *CoverageInfo))
3569 return true;
3570 return selectG_AND_OR_XOR(I);
3571 case TargetOpcode::G_ADD:
3572 case TargetOpcode::G_SUB:
3573 if (selectImpl(I, *CoverageInfo))
3574 return true;
3575 return selectG_ADD_SUB(I);
3576 case TargetOpcode::G_UADDO:
3577 case TargetOpcode::G_USUBO:
3578 case TargetOpcode::G_UADDE:
3579 case TargetOpcode::G_USUBE:
3580 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3581 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3582 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3583 return selectG_AMDGPU_MAD_64_32(I);
3584 case TargetOpcode::G_INTTOPTR:
3585 case TargetOpcode::G_BITCAST:
3586 case TargetOpcode::G_PTRTOINT:
3587 return selectCOPY(I);
3588 case TargetOpcode::G_CONSTANT:
3589 case TargetOpcode::G_FCONSTANT:
3590 return selectG_CONSTANT(I);
3591 case TargetOpcode::G_FNEG:
3592 if (selectImpl(I, *CoverageInfo))
3593 return true;
3594 return selectG_FNEG(I);
3595 case TargetOpcode::G_FABS:
3596 if (selectImpl(I, *CoverageInfo))
3597 return true;
3598 return selectG_FABS(I);
3599 case TargetOpcode::G_EXTRACT:
3600 return selectG_EXTRACT(I);
3601 case TargetOpcode::G_MERGE_VALUES:
3602 case TargetOpcode::G_CONCAT_VECTORS:
3603 return selectG_MERGE_VALUES(I);
3604 case TargetOpcode::G_UNMERGE_VALUES:
3605 return selectG_UNMERGE_VALUES(I);
3606 case TargetOpcode::G_BUILD_VECTOR:
3607 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3608 return selectG_BUILD_VECTOR(I);
3609 case TargetOpcode::G_PTR_ADD:
3610 if (selectImpl(I, *CoverageInfo))
3611 return true;
3612 return selectG_PTR_ADD(I);
3613 case TargetOpcode::G_IMPLICIT_DEF:
3614 return selectG_IMPLICIT_DEF(I);
3615 case TargetOpcode::G_FREEZE:
3616 return selectCOPY(I);
3617 case TargetOpcode::G_INSERT:
3618 return selectG_INSERT(I);
3619 case TargetOpcode::G_INTRINSIC:
3620 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3621 return selectG_INTRINSIC(I);
3622 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3623 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3624 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3625 case TargetOpcode::G_ICMP:
3626 case TargetOpcode::G_FCMP:
3627 if (selectG_ICMP_or_FCMP(I))
3628 return true;
3629 return selectImpl(I, *CoverageInfo);
3630 case TargetOpcode::G_LOAD:
3631 case TargetOpcode::G_STORE:
3632 case TargetOpcode::G_ATOMIC_CMPXCHG:
3633 case TargetOpcode::G_ATOMICRMW_XCHG:
3634 case TargetOpcode::G_ATOMICRMW_ADD:
3635 case TargetOpcode::G_ATOMICRMW_SUB:
3636 case TargetOpcode::G_ATOMICRMW_AND:
3637 case TargetOpcode::G_ATOMICRMW_OR:
3638 case TargetOpcode::G_ATOMICRMW_XOR:
3639 case TargetOpcode::G_ATOMICRMW_MIN:
3640 case TargetOpcode::G_ATOMICRMW_MAX:
3641 case TargetOpcode::G_ATOMICRMW_UMIN:
3642 case TargetOpcode::G_ATOMICRMW_UMAX:
3643 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3644 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3645 case TargetOpcode::G_ATOMICRMW_FADD:
3646 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3647 case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3648 return selectG_LOAD_STORE_ATOMICRMW(I);
3649 case TargetOpcode::G_SELECT:
3650 return selectG_SELECT(I);
3651 case TargetOpcode::G_TRUNC:
3652 return selectG_TRUNC(I);
3653 case TargetOpcode::G_SEXT:
3654 case TargetOpcode::G_ZEXT:
3655 case TargetOpcode::G_ANYEXT:
3656 case TargetOpcode::G_SEXT_INREG:
3657 // This is a workaround. For extension from type i1, `selectImpl()` uses
3658     // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as type
3659     // i1 can only be held in an SGPR class.
3660 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3661 selectImpl(I, *CoverageInfo))
3662 return true;
3663 return selectG_SZA_EXT(I);
3664 case TargetOpcode::G_FPEXT:
3665 if (selectG_FPEXT(I))
3666 return true;
3667 return selectImpl(I, *CoverageInfo);
3668 case TargetOpcode::G_BRCOND:
3669 return selectG_BRCOND(I);
3670 case TargetOpcode::G_GLOBAL_VALUE:
3671 return selectG_GLOBAL_VALUE(I);
3672 case TargetOpcode::G_PTRMASK:
3673 return selectG_PTRMASK(I);
3674 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3675 return selectG_EXTRACT_VECTOR_ELT(I);
3676 case TargetOpcode::G_INSERT_VECTOR_ELT:
3677 return selectG_INSERT_VECTOR_ELT(I);
3678 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3679 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3680 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3681 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3682 const AMDGPU::ImageDimIntrinsicInfo *Intr =
3683 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3684 assert(Intr && "not an image intrinsic with image pseudo");
3685 return selectImageIntrinsic(I, Intr);
3686 }
3687 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3688 return selectBVHIntrinsic(I);
3689 case AMDGPU::G_SBFX:
3690 case AMDGPU::G_UBFX:
3691 return selectG_SBFX_UBFX(I);
3692 case AMDGPU::G_SI_CALL:
3693 I.setDesc(TII.get(AMDGPU::SI_CALL));
3694 return true;
3695 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3696 return selectWaveAddress(I);
3697 case AMDGPU::G_STACKRESTORE:
3698 return selectStackRestore(I);
3699 default:
3700 return selectImpl(I, *CoverageInfo);
3701 }
3702 return false;
3703 }
3704
3705 InstructionSelector::ComplexRendererFns
3706 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3707 return {{
3708 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3709 }};
3710
3711 }
3712
3713 std::pair<Register, unsigned>
3714 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3715 bool IsCanonicalizing,
3716 bool AllowAbs, bool OpSel) const {
3717 Register Src = Root.getReg();
3718 unsigned Mods = 0;
3719 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3720
3721 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3722 Src = MI->getOperand(1).getReg();
3723 Mods |= SISrcMods::NEG;
3724 MI = getDefIgnoringCopies(Src, *MRI);
3725 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3726 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3727 // denormal mode, but we're implicitly canonicalizing in a source operand.
3728 const ConstantFP *LHS =
3729 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3730 if (LHS && LHS->isZero()) {
3731 Mods |= SISrcMods::NEG;
3732 Src = MI->getOperand(2).getReg();
3733 }
3734 }
3735
3736 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3737 Src = MI->getOperand(1).getReg();
3738 Mods |= SISrcMods::ABS;
3739 }
3740
3741 if (OpSel)
3742 Mods |= SISrcMods::OP_SEL_0;
3743
3744 return std::pair(Src, Mods);
3745 }
3746
3747 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3748 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3749 bool ForceVGPR) const {
3750 if ((Mods != 0 || ForceVGPR) &&
3751 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3752
3753 // If we looked through copies to find source modifiers on an SGPR operand,
3754 // we now have an SGPR register source. To avoid potentially violating the
3755 // constant bus restriction, we need to insert a copy to a VGPR.
3756 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3757 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3758 TII.get(AMDGPU::COPY), VGPRSrc)
3759 .addReg(Src);
3760 Src = VGPRSrc;
3761 }
3762
3763 return Src;
3764 }
3765
3766 ///
3767 /// This will select either an SGPR or VGPR operand and will save us from
3768 /// having to write an extra tablegen pattern.
3769 InstructionSelector::ComplexRendererFns
3770 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3771 return {{
3772 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3773 }};
3774 }
3775
3776 InstructionSelector::ComplexRendererFns
3777 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3778 Register Src;
3779 unsigned Mods;
3780 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3781
3782 return {{
3783 [=](MachineInstrBuilder &MIB) {
3784 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3785 },
3786 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3787 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3788 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3789 }};
3790 }
3791
3792 InstructionSelector::ComplexRendererFns
3793 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3794 Register Src;
3795 unsigned Mods;
3796 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3797 /*IsCanonicalizing=*/true,
3798 /*AllowAbs=*/false);
3799
3800 return {{
3801 [=](MachineInstrBuilder &MIB) {
3802 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3803 },
3804 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3805 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3806 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3807 }};
3808 }
3809
3810 InstructionSelector::ComplexRendererFns
3811 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3812 return {{
3813 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3814 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3815 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3816 }};
3817 }
3818
3819 InstructionSelector::ComplexRendererFns
3820 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3821 Register Src;
3822 unsigned Mods;
3823 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3824
3825 return {{
3826 [=](MachineInstrBuilder &MIB) {
3827 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3828 },
3829 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3830 }};
3831 }
3832
3833 InstructionSelector::ComplexRendererFns
3834 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3835 MachineOperand &Root) const {
3836 Register Src;
3837 unsigned Mods;
3838 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3839
3840 return {{
3841 [=](MachineInstrBuilder &MIB) {
3842 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3843 },
3844 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3845 }};
3846 }
3847
3848 InstructionSelector::ComplexRendererFns
3849 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3850 Register Src;
3851 unsigned Mods;
3852 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3853 /*AllowAbs=*/false);
3854
3855 return {{
3856 [=](MachineInstrBuilder &MIB) {
3857 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3858 },
3859 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3860 }};
3861 }
3862
3863 InstructionSelector::ComplexRendererFns
3864 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3865 Register Reg = Root.getReg();
3866 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3867 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3868 return {};
3869 return {{
3870 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3871 }};
3872 }
3873
3874 std::pair<Register, unsigned>
3875 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3876 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3877 unsigned Mods = 0;
3878 MachineInstr *MI = MRI.getVRegDef(Src);
3879
3880 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3881 // It's possible to see an f32 fneg here, but unlikely.
3882 // TODO: Treat f32 fneg as only high bit.
3883 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3884 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3885 Src = MI->getOperand(1).getReg();
3886 MI = MRI.getVRegDef(Src);
3887 }
3888
3889 // TODO: Handle G_FSUB 0 as fneg
3890
3891 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3892 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3893
3894 // Packed instructions do not have abs modifiers.
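  // OP_SEL_1 corresponds to op_sel_hi, which defaults to set for packed
  // operands so the high half of each source feeds the high half of the result.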
3895 Mods |= SISrcMods::OP_SEL_1;
3896
3897 return std::pair(Src, Mods);
3898 }
3899
3900 InstructionSelector::ComplexRendererFns
3901 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3902 MachineRegisterInfo &MRI
3903 = Root.getParent()->getParent()->getParent()->getRegInfo();
3904
3905 Register Src;
3906 unsigned Mods;
3907 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3908
3909 return {{
3910 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3911 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3912 }};
3913 }
3914
3915 InstructionSelector::ComplexRendererFns
3916 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3917 MachineRegisterInfo &MRI
3918 = Root.getParent()->getParent()->getParent()->getRegInfo();
3919
3920 Register Src;
3921 unsigned Mods;
3922 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3923
3924 return {{
3925 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3926 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3927 }};
3928 }
3929
3930 InstructionSelector::ComplexRendererFns
3931 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3932   // A literal i1 value set in the intrinsic represents the SrcMods for the next operand.
3933   // The value is in the Imm operand as an i1 sign-extended to int64_t.
3934   // 1 (i.e. -1) promotes packed values to signed; 0 treats them as unsigned.
3935 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3936 "expected i1 value");
3937 unsigned Mods = SISrcMods::OP_SEL_1;
3938 if (Root.getImm() == -1)
3939 Mods ^= SISrcMods::NEG;
3940 return {{
3941 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3942 }};
3943 }
3944
3945 InstructionSelector::ComplexRendererFns
3946 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3947 MachineOperand &Root) const {
3948 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3949 "expected i1 value");
3950 unsigned Mods = SISrcMods::OP_SEL_1;
3951 if (Root.getImm() != 0)
3952 Mods |= SISrcMods::OP_SEL_0;
3953
3954 return {{
3955 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3956 }};
3957 }
3958
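/// Concatenate the given elements (32 bits each) into a single wide VGPR tuple
/// using a REG_SEQUENCE, returning the register that holds the result.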
3959 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3960 MachineInstr *InsertPt,
3961 MachineRegisterInfo &MRI) {
3962 const TargetRegisterClass *DstRegClass;
3963 switch (Elts.size()) {
3964 case 8:
3965 DstRegClass = &AMDGPU::VReg_256RegClass;
3966 break;
3967 case 4:
3968 DstRegClass = &AMDGPU::VReg_128RegClass;
3969 break;
3970 case 2:
3971 DstRegClass = &AMDGPU::VReg_64RegClass;
3972 break;
3973 default:
3974 llvm_unreachable("unhandled Reg sequence size");
3975 }
3976
3977 MachineIRBuilder B(*InsertPt);
3978 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3979 .addDef(MRI.createVirtualRegister(DstRegClass));
3980 for (unsigned i = 0; i < Elts.size(); ++i) {
3981 MIB.addReg(Elts[i]);
3982 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3983 }
3984 return MIB->getOperand(0).getReg();
3985 }
3986
3987 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3988 SmallVectorImpl<Register> &Elts, Register &Src,
3989 MachineInstr *InsertPt,
3990 MachineRegisterInfo &MRI) {
3991 if (ModOpcode == TargetOpcode::G_FNEG) {
3992 Mods |= SISrcMods::NEG;
3993 // Check if all elements also have abs modifier
3994 SmallVector<Register, 8> NegAbsElts;
3995 for (auto El : Elts) {
3996 Register FabsSrc;
3997 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3998 break;
3999 NegAbsElts.push_back(FabsSrc);
4000 }
4001 if (Elts.size() != NegAbsElts.size()) {
4002 // Neg
4003 Src = buildRegSequence(Elts, InsertPt, MRI);
4004 } else {
4005 // Neg and Abs
4006 Mods |= SISrcMods::NEG_HI;
4007 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4008 }
4009 } else {
4010 assert(ModOpcode == TargetOpcode::G_FABS);
4011 // Abs
4012 Mods |= SISrcMods::NEG_HI;
4013 Src = buildRegSequence(Elts, InsertPt, MRI);
4014 }
4015 }
4016
4017 InstructionSelector::ComplexRendererFns
4018 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4019 Register Src = Root.getReg();
4020 unsigned Mods = SISrcMods::OP_SEL_1;
4021 unsigned ModOpcode;
4022 SmallVector<Register, 8> EltsF32;
4023
4024 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4025 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4026 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
4027 // Based on first element decide which mod we match, neg or abs
4028 if (EltsF32.empty())
4029 ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
4030 : AMDGPU::G_FABS;
4031 if (ElF32->getOpcode() != ModOpcode)
4032 break;
4033 EltsF32.push_back(ElF32->getOperand(1).getReg());
4034 }
4035
4036 // All elements had ModOpcode modifier
4037 if (BV->getNumSources() == EltsF32.size()) {
4038 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4039 *MRI);
4040 }
4041 }
4042
4043 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4044 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4045 }
4046
4047 InstructionSelector::ComplexRendererFns
4048 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4049 Register Src = Root.getReg();
4050 unsigned Mods = SISrcMods::OP_SEL_1;
4051 SmallVector<Register, 8> EltsV2F16;
4052
4053 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4054 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4055 Register FNegSrc;
4056 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4057 break;
4058 EltsV2F16.push_back(FNegSrc);
4059 }
4060
4061 // All elements had ModOpcode modifier
4062 if (CV->getNumSources() == EltsV2F16.size()) {
4063 Mods |= SISrcMods::NEG;
4064 Mods |= SISrcMods::NEG_HI;
4065 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4066 }
4067 }
4068
4069 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4070 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4071 }
4072
4073 InstructionSelector::ComplexRendererFns
4074 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4075 Register Src = Root.getReg();
4076 unsigned Mods = SISrcMods::OP_SEL_1;
4077 unsigned ModOpcode;
4078 SmallVector<Register, 8> EltsV2F16;
4079
4080 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4081 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4082 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4083 // Based on first element decide which mod we match, neg or abs
4084 if (EltsV2F16.empty())
4085 ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
4086 : AMDGPU::G_FABS;
4087 if (ElV2F16->getOpcode() != ModOpcode)
4088 break;
4089 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4090 }
4091
4092 // All elements had ModOpcode modifier
4093 if (CV->getNumSources() == EltsV2F16.size()) {
4094 MachineIRBuilder B(*Root.getParent());
4095 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4096 *MRI);
4097 }
4098 }
4099
4100 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4101 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4102 }
4103
4104 InstructionSelector::ComplexRendererFns
4105 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4106 std::optional<FPValueAndVReg> FPValReg;
4107 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4108 if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) {
4109 return {{[=](MachineInstrBuilder &MIB) {
4110 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4111 }}};
4112 }
4113     // Non-inlineable splat floats should not fall through to the integer
4114     // immediate checks.
4115 return {};
4116 }
4117
4118 APInt ICst;
4119 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4120 if (TII.isInlineConstant(ICst)) {
4121 return {
4122 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4123 }
4124 }
4125
4126 return {};
4127 }
4128
4129 InstructionSelector::ComplexRendererFns
4130 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4131 Register Src =
4132 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4133 unsigned Key = 0;
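  // A uniform right-shift by a whole number of bytes selects which byte of the
  // 32-bit index register is used; fold it into the index_key operand.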
4134
4135 Register ShiftSrc;
4136 std::optional<ValueAndVReg> ShiftAmt;
4137 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4138 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4139 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4140 Key = ShiftAmt->Value.getZExtValue() / 8;
4141 Src = ShiftSrc;
4142 }
4143
4144 return {{
4145 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4146 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4147 }};
4148 }
4149
4150 InstructionSelector::ComplexRendererFns
4151 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4152
4153 Register Src =
4154 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4155 unsigned Key = 0;
4156
4157 Register ShiftSrc;
4158 std::optional<ValueAndVReg> ShiftAmt;
4159 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4160 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4161 ShiftAmt->Value.getZExtValue() == 16) {
4162 Src = ShiftSrc;
4163 Key = 1;
4164 }
4165
4166 return {{
4167 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4168 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4169 }};
4170 }
4171
4172 InstructionSelector::ComplexRendererFns
4173 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4174 Register Src;
4175 unsigned Mods;
4176 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4177
4178 // FIXME: Handle op_sel
4179 return {{
4180 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4181 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4182 }};
4183 }
4184
4185 InstructionSelector::ComplexRendererFns
4186 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4187 Register Src;
4188 unsigned Mods;
4189 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4190 /*IsCanonicalizing=*/true,
4191 /*AllowAbs=*/false,
4192 /*OpSel=*/false);
4193
4194 return {{
4195 [=](MachineInstrBuilder &MIB) {
4196 MIB.addReg(
4197 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4198 },
4199 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4200 }};
4201 }
4202
4203 InstructionSelector::ComplexRendererFns
4204 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4205 Register Src;
4206 unsigned Mods;
4207 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4208 /*IsCanonicalizing=*/true,
4209 /*AllowAbs=*/false,
4210 /*OpSel=*/true);
4211
4212 return {{
4213 [=](MachineInstrBuilder &MIB) {
4214 MIB.addReg(
4215 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4216 },
4217 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4218 }};
4219 }
4220
4221 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4222 Register &Base,
4223 Register *SOffset,
4224 int64_t *Offset) const {
4225 MachineInstr *MI = Root.getParent();
4226 MachineBasicBlock *MBB = MI->getParent();
4227
4228 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4229 // then we can select all ptr + 32-bit offsets.
4230 SmallVector<GEPInfo, 4> AddrInfo;
4231 getAddrModeInfo(*MI, *MRI, AddrInfo);
4232
4233 if (AddrInfo.empty())
4234 return false;
4235
4236 const GEPInfo &GEPI = AddrInfo[0];
4237 std::optional<int64_t> EncodedImm =
4238 AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
4239
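  // When both an SGPR offset and an immediate are requested, the first GEP
  // info must supply an encodable immediate and the second a base plus a
  // zero-extended 32-bit SGPR offset.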
4240 if (SOffset && Offset) {
4241 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4242 AddrInfo.size() > 1) {
4243 const GEPInfo &GEPI2 = AddrInfo[1];
4244 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4245 if (Register OffsetReg =
4246 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4247 Base = GEPI2.SgprParts[0];
4248 *SOffset = OffsetReg;
4249 *Offset = *EncodedImm;
4250 return true;
4251 }
4252 }
4253 }
4254 return false;
4255 }
4256
4257 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4258 Base = GEPI.SgprParts[0];
4259 *Offset = *EncodedImm;
4260 return true;
4261 }
4262
4263 // SGPR offset is unsigned.
4264 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4265 GEPI.Imm != 0) {
4266     // If we make it this far we have a load with a 32-bit immediate offset.
4267     // It is OK to select this using an SGPR offset, because we have already
4268 // failed trying to select this load into one of the _IMM variants since
4269 // the _IMM Patterns are considered before the _SGPR patterns.
4270 Base = GEPI.SgprParts[0];
4271 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4272 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4273 .addImm(GEPI.Imm);
4274 return true;
4275 }
4276
4277 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4278 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4279 Base = GEPI.SgprParts[0];
4280 *SOffset = OffsetReg;
4281 return true;
4282 }
4283 }
4284
4285 return false;
4286 }
4287
4288 InstructionSelector::ComplexRendererFns
4289 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4290 Register Base;
4291 int64_t Offset;
4292 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4293 return std::nullopt;
4294
4295 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4296 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4297 }
4298
4299 InstructionSelector::ComplexRendererFns
4300 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4301 SmallVector<GEPInfo, 4> AddrInfo;
4302 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4303
4304 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4305 return std::nullopt;
4306
4307 const GEPInfo &GEPInfo = AddrInfo[0];
4308 Register PtrReg = GEPInfo.SgprParts[0];
4309 std::optional<int64_t> EncodedImm =
4310 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4311 if (!EncodedImm)
4312 return std::nullopt;
4313
4314 return {{
4315 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4316 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4317 }};
4318 }
4319
4320 InstructionSelector::ComplexRendererFns
4321 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4322 Register Base, SOffset;
4323 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4324 return std::nullopt;
4325
4326 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4327 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4328 }
4329
4330 InstructionSelector::ComplexRendererFns
4331 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4332 Register Base, SOffset;
4333 int64_t Offset;
4334 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4335 return std::nullopt;
4336
4337 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4338 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4339 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4340 }
4341
4342 std::pair<Register, int>
4343 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4344 uint64_t FlatVariant) const {
4345 MachineInstr *MI = Root.getParent();
4346
4347 auto Default = std::pair(Root.getReg(), 0);
4348
4349 if (!STI.hasFlatInstOffsets())
4350 return Default;
4351
4352 Register PtrBase;
4353 int64_t ConstOffset;
4354 std::tie(PtrBase, ConstOffset) =
4355 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4356
4357 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4358 !isFlatScratchBaseLegal(Root.getReg())))
4359 return Default;
4360
4361 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4362 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4363 return Default;
4364
4365 return std::pair(PtrBase, ConstOffset);
4366 }
4367
4368 InstructionSelector::ComplexRendererFns
4369 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4370 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4371
4372 return {{
4373 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4374 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4375 }};
4376 }
4377
4378 InstructionSelector::ComplexRendererFns
4379 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4380 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4381
4382 return {{
4383 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4384 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4385 }};
4386 }
4387
4388 InstructionSelector::ComplexRendererFns
4389 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4390 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4391
4392 return {{
4393 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4394 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4395 }};
4396 }
4397
4398 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4399 InstructionSelector::ComplexRendererFns
4400 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4401 Register Addr = Root.getReg();
4402 Register PtrBase;
4403 int64_t ConstOffset;
4404 int64_t ImmOffset = 0;
4405
4406 // Match the immediate offset first, which canonically is moved as low as
4407 // possible.
4408 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4409
4410 if (ConstOffset != 0) {
4411 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4412 SIInstrFlags::FlatGlobal)) {
4413 Addr = PtrBase;
4414 ImmOffset = ConstOffset;
4415 } else {
4416 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4417 if (isSGPR(PtrBaseDef->Reg)) {
4418 if (ConstOffset > 0) {
4419 // Offset is too large.
4420 //
4421 // saddr + large_offset -> saddr +
4422 // (voffset = large_offset & ~MaxOffset) +
4423 // (large_offset & MaxOffset);
4424 int64_t SplitImmOffset, RemainderOffset;
4425 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4426 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4427
4428 if (isUInt<32>(RemainderOffset)) {
4429 MachineInstr *MI = Root.getParent();
4430 MachineBasicBlock *MBB = MI->getParent();
4431 Register HighBits =
4432 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4433
4434 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4435 HighBits)
4436 .addImm(RemainderOffset);
4437
4438 return {{
4439 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4440 [=](MachineInstrBuilder &MIB) {
4441 MIB.addReg(HighBits);
4442 }, // voffset
4443 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4444 }};
4445 }
4446 }
4447
4448 // We are adding a 64 bit SGPR and a constant. If constant bus limit
4449 // is 1 we would need to perform 1 or 2 extra moves for each half of
4450 // the constant and it is better to do a scalar add and then issue a
4451 // single VALU instruction to materialize zero. Otherwise it is less
4452 // instructions to perform VALU adds with immediates or inline literals.
4453 unsigned NumLiterals =
4454 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4455 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4456 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4457 return std::nullopt;
4458 }
4459 }
4460 }
4461
4462 // Match the variable offset.
4463 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4464 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4465 // Look through the SGPR->VGPR copy.
4466 Register SAddr =
4467 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4468
4469 if (isSGPR(SAddr)) {
4470 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4471
4472 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4473 // inserted later.
4474 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4475 return {{[=](MachineInstrBuilder &MIB) { // saddr
4476 MIB.addReg(SAddr);
4477 },
4478 [=](MachineInstrBuilder &MIB) { // voffset
4479 MIB.addReg(VOffset);
4480 },
4481 [=](MachineInstrBuilder &MIB) { // offset
4482 MIB.addImm(ImmOffset);
4483 }}};
4484 }
4485 }
4486 }
4487
4488 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4489 // drop this.
4490 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4491 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4492 return std::nullopt;
4493
4494 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4495 // moves required to copy a 64-bit SGPR to VGPR.
4496 MachineInstr *MI = Root.getParent();
4497 MachineBasicBlock *MBB = MI->getParent();
4498 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4499
4500 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4501 .addImm(0);
4502
4503 return {{
4504 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4505 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4506 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4507 }};
4508 }
4509
4510 InstructionSelector::ComplexRendererFns
4511 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4512 Register Addr = Root.getReg();
4513 Register PtrBase;
4514 int64_t ConstOffset;
4515 int64_t ImmOffset = 0;
4516
4517 // Match the immediate offset first, which canonically is moved as low as
4518 // possible.
4519 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4520
4521 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4522 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4523 SIInstrFlags::FlatScratch)) {
4524 Addr = PtrBase;
4525 ImmOffset = ConstOffset;
4526 }
4527
4528 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4529 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4530 int FI = AddrDef->MI->getOperand(1).getIndex();
4531 return {{
4532 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4533 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4534 }};
4535 }
4536
4537 Register SAddr = AddrDef->Reg;
4538
4539 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4540 Register LHS = AddrDef->MI->getOperand(1).getReg();
4541 Register RHS = AddrDef->MI->getOperand(2).getReg();
4542 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4543 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4544
4545 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4546 isSGPR(RHSDef->Reg)) {
4547 int FI = LHSDef->MI->getOperand(1).getIndex();
4548 MachineInstr &I = *Root.getParent();
4549 MachineBasicBlock *BB = I.getParent();
4550 const DebugLoc &DL = I.getDebugLoc();
4551 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4552
4553 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4554 .addFrameIndex(FI)
4555 .addReg(RHSDef->Reg)
4556 .setOperandDead(3); // Dead scc
4557 }
4558 }
4559
4560 if (!isSGPR(SAddr))
4561 return std::nullopt;
4562
4563 return {{
4564 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4565 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4566 }};
4567 }
4568
4569 // Check whether the flat scratch SVS swizzle bug affects this access.
4570 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4571 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4572 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4573 return false;
4574
4575 // The bug affects the swizzling of SVS accesses if there is any carry out
4576 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4577 // voffset to (soffset + inst_offset).
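  // For example, if the low two bits of voffset could be 3 and the low two
  // bits of (soffset + inst_offset) could be 2, the sum 5 carries out of
  // bit 1, so the access has to be rejected.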
4578 auto VKnown = KB->getKnownBits(VAddr);
4579 auto SKnown = KnownBits::computeForAddSub(
4580 true, false, KB->getKnownBits(SAddr),
4581 KnownBits::makeConstant(APInt(32, ImmOffset)));
4582 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4583 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4584 return (VMax & 3) + (SMax & 3) >= 4;
4585 }
4586
4587 InstructionSelector::ComplexRendererFns
4588 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4589 Register Addr = Root.getReg();
4590 Register PtrBase;
4591 int64_t ConstOffset;
4592 int64_t ImmOffset = 0;
4593
4594 // Match the immediate offset first, which canonically is moved as low as
4595 // possible.
4596 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4597
4598 Register OrigAddr = Addr;
4599 if (ConstOffset != 0 &&
4600 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4601 Addr = PtrBase;
4602 ImmOffset = ConstOffset;
4603 }
4604
4605 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4606 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4607 return std::nullopt;
4608
4609 Register RHS = AddrDef->MI->getOperand(2).getReg();
4610 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4611 return std::nullopt;
4612
4613 Register LHS = AddrDef->MI->getOperand(1).getReg();
4614 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4615
4616 if (OrigAddr != Addr) {
4617 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4618 return std::nullopt;
4619 } else {
4620 if (!isFlatScratchBaseLegalSV(OrigAddr))
4621 return std::nullopt;
4622 }
4623
4624 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4625 return std::nullopt;
4626
4627 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4628 int FI = LHSDef->MI->getOperand(1).getIndex();
4629 return {{
4630 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4631 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4632 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4633 }};
4634 }
4635
4636 if (!isSGPR(LHS))
4637 return std::nullopt;
4638
4639 return {{
4640 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4641 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4642 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4643 }};
4644 }
4645
4646 InstructionSelector::ComplexRendererFns
4647 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4648 MachineInstr *MI = Root.getParent();
4649 MachineBasicBlock *MBB = MI->getParent();
4650 MachineFunction *MF = MBB->getParent();
4651 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4652
4653 int64_t Offset = 0;
4654 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4655 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4656 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4657
4658 // TODO: Should this be inside the render function? The iterator seems to
4659 // move.
4660 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
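    // Split the constant: the low bits that fit in the MUBUF immediate field
    // remain as the instruction offset, and the remaining high bits are
    // materialized into a VGPR that is used as vaddr.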
4661 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4662 HighBits)
4663 .addImm(Offset & ~MaxOffset);
4664
4665 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4666 MIB.addReg(Info->getScratchRSrcReg());
4667 },
4668 [=](MachineInstrBuilder &MIB) { // vaddr
4669 MIB.addReg(HighBits);
4670 },
4671 [=](MachineInstrBuilder &MIB) { // soffset
4672 // Use constant zero for soffset and rely on eliminateFrameIndex
4673 // to choose the appropriate frame register if need be.
4674 MIB.addImm(0);
4675 },
4676 [=](MachineInstrBuilder &MIB) { // offset
4677 MIB.addImm(Offset & MaxOffset);
4678 }}};
4679 }
4680
4681 assert(Offset == 0 || Offset == -1);
4682
4683 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4684 // offsets.
4685 std::optional<int> FI;
4686 Register VAddr = Root.getReg();
4687 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4688 Register PtrBase;
4689 int64_t ConstOffset;
4690 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4691 if (ConstOffset != 0) {
4692 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4693 (!STI.privateMemoryResourceIsRangeChecked() ||
4694 KB->signBitIsZero(PtrBase))) {
4695 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4696 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4697 FI = PtrBaseDef->getOperand(1).getIndex();
4698 else
4699 VAddr = PtrBase;
4700 Offset = ConstOffset;
4701 }
4702 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4703 FI = RootDef->getOperand(1).getIndex();
4704 }
4705 }
4706
4707 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4708 MIB.addReg(Info->getScratchRSrcReg());
4709 },
4710 [=](MachineInstrBuilder &MIB) { // vaddr
4711 if (FI)
4712 MIB.addFrameIndex(*FI);
4713 else
4714 MIB.addReg(VAddr);
4715 },
4716 [=](MachineInstrBuilder &MIB) { // soffset
4717 // Use constant zero for soffset and rely on eliminateFrameIndex
4718 // to choose the appropriate frame register if need be.
4719 MIB.addImm(0);
4720 },
4721 [=](MachineInstrBuilder &MIB) { // offset
4722 MIB.addImm(Offset);
4723 }}};
4724 }
4725
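// The DS offset field is a 16-bit unsigned byte offset. On targets where the
// full offset range is not usable, the base must additionally be known
// non-negative.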
4726 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4727 int64_t Offset) const {
4728 if (!isUInt<16>(Offset))
4729 return false;
4730
4731 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4732 return true;
4733
4734 // On Southern Islands, instructions with a negative base value and an offset
4735 // don't seem to work.
4736 return KB->signBitIsZero(Base);
4737 }
4738
4739 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4740 int64_t Offset1,
4741 unsigned Size) const {
4742 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4743 return false;
4744 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4745 return false;
4746
4747 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4748 return true;
4749
4750 // On Southern Islands, instructions with a negative base value and an offset
4751 // don't seem to work.
4752 return KB->signBitIsZero(Base);
4753 }
4754
4755 // Return whether the operation has the NoUnsignedWrap property.
4756 static bool isNoUnsignedWrap(MachineInstr *Addr) {
4757 return Addr->getOpcode() == TargetOpcode::G_OR ||
4758 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4759 Addr->getFlag(MachineInstr::NoUWrap));
4760 }
4761
4762 // Check that the base address of a flat scratch load/store in the form of
4763 // `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned, per the
4764 // hardware requirement). We always treat the first operand as the base address here.
4765 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4766 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4767
4768 if (isNoUnsignedWrap(AddrMI))
4769 return true;
4770
4771 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4772 // values.
4773 if (STI.hasSignedScratchOffsets())
4774 return true;
4775
4776 Register LHS = AddrMI->getOperand(1).getReg();
4777 Register RHS = AddrMI->getOperand(2).getReg();
4778
4779 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4780 std::optional<ValueAndVReg> RhsValReg =
4781 getIConstantVRegValWithLookThrough(RHS, *MRI);
4782 // If the immediate offset is negative and within a certain range, the base
4783 // address cannot also be negative. If the base is also negative, the sum
4784 // would be either negative or much larger than the valid range of scratch
4785 // memory a thread can access.
4786 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4787 RhsValReg->Value.getSExtValue() > -0x40000000)
4788 return true;
4789 }
4790
4791 return KB->signBitIsZero(LHS);
4792 }
4793
4794 // Check that the address values in the SGPR/VGPR are legal for a flat scratch
4795 // access in the form of: SGPR + VGPR.
4796 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4797 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4798
4799 if (isNoUnsignedWrap(AddrMI))
4800 return true;
4801
4802 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4803 // values.
4804 if (STI.hasSignedScratchOffsets())
4805 return true;
4806
4807 Register LHS = AddrMI->getOperand(1).getReg();
4808 Register RHS = AddrMI->getOperand(2).getReg();
4809 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4810 }
4811
4812 // Check that the address values in the SGPR/VGPR are legal for a flat scratch
4813 // access in the form of: SGPR + VGPR + Imm.
4814 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4815 Register Addr) const {
4816 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4817 // values.
4818 if (STI.hasSignedScratchOffsets())
4819 return true;
4820
4821 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4822 Register Base = AddrMI->getOperand(1).getReg();
4823 std::optional<DefinitionAndSourceRegister> BaseDef =
4824 getDefSrcRegIgnoringCopies(Base, *MRI);
4825 std::optional<ValueAndVReg> RHSOffset =
4826 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4827 assert(RHSOffset);
4828
4829 // If the immediate offset is negative and within a certain range, the base
4830 // address cannot also be negative. If the base is also negative, the sum
4831 // would be either negative or much larger than the valid range of scratch
4832 // memory a thread can access.
4833 if (isNoUnsignedWrap(BaseDef->MI) &&
4834 (isNoUnsignedWrap(AddrMI) ||
4835 (RHSOffset->Value.getSExtValue() < 0 &&
4836 RHSOffset->Value.getSExtValue() > -0x40000000)))
4837 return true;
4838
4839 Register LHS = BaseDef->MI->getOperand(1).getReg();
4840 Register RHS = BaseDef->MI->getOperand(2).getReg();
4841 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4842 }
4843
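// Return true if the G_AND mask is redundant for a shift amount of ShAmtBits
// bits, i.e. the mask cannot change the low ShAmtBits bits of the value.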
4844 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4845 unsigned ShAmtBits) const {
4846 assert(MI.getOpcode() == TargetOpcode::G_AND);
4847
4848 std::optional<APInt> RHS =
4849 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4850 if (!RHS)
4851 return false;
4852
4853 if (RHS->countr_one() >= ShAmtBits)
4854 return true;
4855
4856 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4857 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4858 }
4859
4860 InstructionSelector::ComplexRendererFns
4861 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4862 MachineOperand &Root) const {
4863 Register Reg = Root.getReg();
4864 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4865
4866 std::optional<DefinitionAndSourceRegister> Def =
4867 getDefSrcRegIgnoringCopies(Reg, *MRI);
4868 assert(Def && "this shouldn't be an optional result");
4869 Reg = Def->Reg;
4870
4871 if (Register WaveBase = getWaveAddress(Def->MI)) {
4872 return {{
4873 [=](MachineInstrBuilder &MIB) { // rsrc
4874 MIB.addReg(Info->getScratchRSrcReg());
4875 },
4876 [=](MachineInstrBuilder &MIB) { // soffset
4877 MIB.addReg(WaveBase);
4878 },
4879 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4880 }};
4881 }
4882
4883 int64_t Offset = 0;
4884
4885 // FIXME: Copy check is a hack
4886 Register BasePtr;
4887 if (mi_match(Reg, *MRI,
4888 m_GPtrAdd(m_Reg(BasePtr),
4889 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4890 if (!TII.isLegalMUBUFImmOffset(Offset))
4891 return {};
4892 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4893 Register WaveBase = getWaveAddress(BasePtrDef);
4894 if (!WaveBase)
4895 return {};
4896
4897 return {{
4898 [=](MachineInstrBuilder &MIB) { // rsrc
4899 MIB.addReg(Info->getScratchRSrcReg());
4900 },
4901 [=](MachineInstrBuilder &MIB) { // soffset
4902 MIB.addReg(WaveBase);
4903 },
4904 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4905 }};
4906 }
4907
4908 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4909 !TII.isLegalMUBUFImmOffset(Offset))
4910 return {};
4911
4912 return {{
4913 [=](MachineInstrBuilder &MIB) { // rsrc
4914 MIB.addReg(Info->getScratchRSrcReg());
4915 },
4916 [=](MachineInstrBuilder &MIB) { // soffset
4917 MIB.addImm(0);
4918 },
4919 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4920 }};
4921 }
4922
4923 std::pair<Register, unsigned>
4924 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4925 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4926 if (!RootDef)
4927 return std::pair(Root.getReg(), 0);
4928
4929 int64_t ConstAddr = 0;
4930
4931 Register PtrBase;
4932 int64_t Offset;
4933 std::tie(PtrBase, Offset) =
4934 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4935
4936 if (Offset) {
4937 if (isDSOffsetLegal(PtrBase, Offset)) {
4938 // (add n0, c0)
4939 return std::pair(PtrBase, Offset);
4940 }
4941 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4942 // TODO
4943
4944
4945 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4946 // TODO
4947
4948 }
4949
4950 return std::pair(Root.getReg(), 0);
4951 }
4952
4953 InstructionSelector::ComplexRendererFns
4954 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4955 Register Reg;
4956 unsigned Offset;
4957 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4958 return {{
4959 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4960 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4961 }};
4962 }
4963
4964 InstructionSelector::ComplexRendererFns
4965 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4966 return selectDSReadWrite2(Root, 4);
4967 }
4968
4969 InstructionSelector::ComplexRendererFns
4970 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4971 return selectDSReadWrite2(Root, 8);
4972 }
4973
4974 InstructionSelector::ComplexRendererFns
4975 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4976 unsigned Size) const {
4977 Register Reg;
4978 unsigned Offset;
4979 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
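  // The two DS offsets are encoded in units of the element size and must
  // address adjacent elements, hence Offset and Offset + 1.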
4980 return {{
4981 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4982 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4983 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4984 }};
4985 }
4986
4987 std::pair<Register, unsigned>
4988 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4989 unsigned Size) const {
4990 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4991 if (!RootDef)
4992 return std::pair(Root.getReg(), 0);
4993
4994 int64_t ConstAddr = 0;
4995
4996 Register PtrBase;
4997 int64_t Offset;
4998 std::tie(PtrBase, Offset) =
4999 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5000
5001 if (Offset) {
5002 int64_t OffsetValue0 = Offset;
5003 int64_t OffsetValue1 = Offset + Size;
5004 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5005 // (add n0, c0)
5006 return std::pair(PtrBase, OffsetValue0 / Size);
5007 }
5008 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5009 // TODO
5010
5011 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5012 // TODO
5013
5014 }
5015
5016 return std::pair(Root.getReg(), 0);
5017 }
5018
5019 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
5020 /// the base value with the constant offset. There may be intervening copies
5021 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
5022 /// not match the pattern.
5023 std::pair<Register, int64_t>
5024 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5025 Register Root, const MachineRegisterInfo &MRI) const {
5026 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
5027 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5028 return {Root, 0};
5029
5030 MachineOperand &RHS = RootI->getOperand(2);
5031 std::optional<ValueAndVReg> MaybeOffset =
5032 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5033 if (!MaybeOffset)
5034 return {Root, 0};
5035 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5036 }
5037
5038 static void addZeroImm(MachineInstrBuilder &MIB) {
5039 MIB.addImm(0);
5040 }
5041
5042 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5043 /// BasePtr is not valid, a null base pointer will be used.
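/// The 128-bit descriptor is assembled as: dwords 0-1 hold the base pointer
/// (or zero), dword 2 holds FormatLo, and dword 3 holds FormatHi.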
5044 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5045 uint32_t FormatLo, uint32_t FormatHi,
5046 Register BasePtr) {
5047 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5048 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5049 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5050 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5051
5052 B.buildInstr(AMDGPU::S_MOV_B32)
5053 .addDef(RSrc2)
5054 .addImm(FormatLo);
5055 B.buildInstr(AMDGPU::S_MOV_B32)
5056 .addDef(RSrc3)
5057 .addImm(FormatHi);
5058
5059 // Build the half of the subregister with the constants before building the
5060 // full 128-bit register. If we are building multiple resource descriptors,
5061 // this will allow CSEing of the 2-component register.
5062 B.buildInstr(AMDGPU::REG_SEQUENCE)
5063 .addDef(RSrcHi)
5064 .addReg(RSrc2)
5065 .addImm(AMDGPU::sub0)
5066 .addReg(RSrc3)
5067 .addImm(AMDGPU::sub1);
5068
5069 Register RSrcLo = BasePtr;
5070 if (!BasePtr) {
5071 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5072 B.buildInstr(AMDGPU::S_MOV_B64)
5073 .addDef(RSrcLo)
5074 .addImm(0);
5075 }
5076
5077 B.buildInstr(AMDGPU::REG_SEQUENCE)
5078 .addDef(RSrc)
5079 .addReg(RSrcLo)
5080 .addImm(AMDGPU::sub0_sub1)
5081 .addReg(RSrcHi)
5082 .addImm(AMDGPU::sub2_sub3);
5083
5084 return RSrc;
5085 }
5086
5087 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5088 const SIInstrInfo &TII, Register BasePtr) {
5089 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5090
5091 // FIXME: Why are half the "default" bits ignored based on the addressing
5092 // mode?
5093 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5094 }
5095
5096 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5097 const SIInstrInfo &TII, Register BasePtr) {
5098 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5099
5100 // FIXME: Why are half the "default" bits ignored based on the addressing
5101 // mode?
5102 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5103 }
5104
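/// Decompose a MUBUF address into its components: the base pointer (N0), an
/// optional constant byte offset, and, if the base is itself a G_PTR_ADD, its
/// two addends (N2, N3).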
5105 AMDGPUInstructionSelector::MUBUFAddressData
5106 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5107 MUBUFAddressData Data;
5108 Data.N0 = Src;
5109
5110 Register PtrBase;
5111 int64_t Offset;
5112
5113 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5114 if (isUInt<32>(Offset)) {
5115 Data.N0 = PtrBase;
5116 Data.Offset = Offset;
5117 }
5118
5119 if (MachineInstr *InputAdd
5120 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5121 Data.N2 = InputAdd->getOperand(1).getReg();
5122 Data.N3 = InputAdd->getOperand(2).getReg();
5123
5124 // FIXME: Need to fix extra SGPR->VGPR copies inserted
5125 // FIXME: Don't know that this was defined by operand 0
5126 //
5127 // TODO: Remove this when we have copy folding optimizations after
5128 // RegBankSelect.
5129 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5130 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5131 }
5132
5133 return Data;
5134 }
5135
5136 /// Return whether the addr64 mubuf mode should be used for the given address.
5137 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5138 // (ptr_add N2, N3) -> addr64, or
5139 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5140 if (Addr.N2)
5141 return true;
5142
5143 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5144 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5145 }
5146
5147 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
5148 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5149 /// component.
5150 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5151 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5152 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5153 return;
5154
5155 // Illegal offset, store it in soffset.
5156 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5157 B.buildInstr(AMDGPU::S_MOV_B32)
5158 .addDef(SOffset)
5159 .addImm(ImmOffset);
5160 ImmOffset = 0;
5161 }
5162
5163 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5164 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5165 Register &SOffset, int64_t &Offset) const {
5166 // FIXME: Predicates should stop this from reaching here.
5167 // addr64 bit was removed for volcanic islands.
5168 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5169 return false;
5170
5171 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5172 if (!shouldUseAddr64(AddrData))
5173 return false;
5174
5175 Register N0 = AddrData.N0;
5176 Register N2 = AddrData.N2;
5177 Register N3 = AddrData.N3;
5178 Offset = AddrData.Offset;
5179
5180 // Base pointer for the SRD.
5181 Register SRDPtr;
5182
5183 if (N2) {
5184 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5185 assert(N3);
5186 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5187 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5188 // addr64, and construct the default resource from a 0 address.
5189 VAddr = N0;
5190 } else {
5191 SRDPtr = N3;
5192 VAddr = N2;
5193 }
5194 } else {
5195 // N2 is not divergent.
5196 SRDPtr = N2;
5197 VAddr = N3;
5198 }
5199 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5200 // Use the default null pointer in the resource
5201 VAddr = N0;
5202 } else {
5203 // N0 -> offset, or
5204 // (N0 + C1) -> offset
5205 SRDPtr = N0;
5206 }
5207
5208 MachineIRBuilder B(*Root.getParent());
5209 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5210 splitIllegalMUBUFOffset(B, SOffset, Offset);
5211 return true;
5212 }
5213
5214 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5215 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5216 int64_t &Offset) const {
5217
5218 // FIXME: Pattern should not reach here.
5219 if (STI.useFlatForGlobal())
5220 return false;
5221
5222 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5223 if (shouldUseAddr64(AddrData))
5224 return false;
5225
5226 // N0 -> offset, or
5227 // (N0 + C1) -> offset
5228 Register SRDPtr = AddrData.N0;
5229 Offset = AddrData.Offset;
5230
5231 // TODO: Look through extensions for 32-bit soffset.
5232 MachineIRBuilder B(*Root.getParent());
5233
5234 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5235 splitIllegalMUBUFOffset(B, SOffset, Offset);
5236 return true;
5237 }
5238
5239 InstructionSelector::ComplexRendererFns
5240 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5241 Register VAddr;
5242 Register RSrcReg;
5243 Register SOffset;
5244 int64_t Offset = 0;
5245
5246 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5247 return {};
5248
5249 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5250 // pattern.
5251 return {{
5252 [=](MachineInstrBuilder &MIB) { // rsrc
5253 MIB.addReg(RSrcReg);
5254 },
5255 [=](MachineInstrBuilder &MIB) { // vaddr
5256 MIB.addReg(VAddr);
5257 },
5258 [=](MachineInstrBuilder &MIB) { // soffset
5259 if (SOffset)
5260 MIB.addReg(SOffset);
5261 else if (STI.hasRestrictedSOffset())
5262 MIB.addReg(AMDGPU::SGPR_NULL);
5263 else
5264 MIB.addImm(0);
5265 },
5266 [=](MachineInstrBuilder &MIB) { // offset
5267 MIB.addImm(Offset);
5268 },
5269 addZeroImm, // cpol
5270 addZeroImm, // tfe
5271 addZeroImm // swz
5272 }};
5273 }
5274
5275 InstructionSelector::ComplexRendererFns
5276 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5277 Register RSrcReg;
5278 Register SOffset;
5279 int64_t Offset = 0;
5280
5281 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5282 return {};
5283
5284 return {{
5285 [=](MachineInstrBuilder &MIB) { // rsrc
5286 MIB.addReg(RSrcReg);
5287 },
5288 [=](MachineInstrBuilder &MIB) { // soffset
5289 if (SOffset)
5290 MIB.addReg(SOffset);
5291 else if (STI.hasRestrictedSOffset())
5292 MIB.addReg(AMDGPU::SGPR_NULL);
5293 else
5294 MIB.addImm(0);
5295 },
5296 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5297 addZeroImm, // cpol
5298 addZeroImm, // tfe
5299 addZeroImm, // swz
5300 }};
5301 }
5302
5303 InstructionSelector::ComplexRendererFns
5304 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5305
5306 Register SOffset = Root.getReg();
5307
5308 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5309 SOffset = AMDGPU::SGPR_NULL;
5310
5311 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5312 }
5313
5314 /// Get an immediate that must be 32 bits, and treated as zero extended.
5315 static std::optional<uint64_t>
5316 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5317 // getIConstantVRegVal sexts any values, so see if that matters.
5318 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5319 if (!OffsetVal || !isInt<32>(*OffsetVal))
5320 return std::nullopt;
5321 return Lo_32(*OffsetVal);
5322 }
5323
5324 InstructionSelector::ComplexRendererFns
5325 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5326 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5327 if (!OffsetVal)
5328 return {};
5329
5330 std::optional<int64_t> EncodedImm =
5331 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5332 if (!EncodedImm)
5333 return {};
5334
5335 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5336 }
5337
5338 InstructionSelector::ComplexRendererFns
5339 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5340 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5341
5342 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5343 if (!OffsetVal)
5344 return {};
5345
5346 std::optional<int64_t> EncodedImm =
5347 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5348 if (!EncodedImm)
5349 return {};
5350
5351 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5352 }
5353
5354 InstructionSelector::ComplexRendererFns
5355 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5356 // Match the (soffset + offset) pair as a 32-bit register base and
5357 // an immediate offset.
5358 Register SOffset;
5359 unsigned Offset;
5360 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5361 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5362 if (!SOffset)
5363 return std::nullopt;
5364
5365 std::optional<int64_t> EncodedOffset =
5366 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5367 if (!EncodedOffset)
5368 return std::nullopt;
5369
5370 assert(MRI->getType(SOffset) == LLT::scalar(32));
5371 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5372 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5373 }
5374
5375 // Variant of stripBitCast that returns the instruction instead of a
5376 // MachineOperand.
5377 static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5378 if (MI->getOpcode() == AMDGPU::G_BITCAST)
5379 return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5380 return MI;
5381 }
5382
5383 // Figure out if this is really an extract of the high 16 bits of a dword;
5384 // returns nullptr if it isn't.
5385 static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5386 MachineRegisterInfo &MRI) {
5387 Inst = stripBitCast(Inst, MRI);
5388
5389 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5390 return nullptr;
5391
5392 MachineInstr *TruncOp =
5393 getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5394 TruncOp = stripBitCast(TruncOp, MRI);
5395
5396 // G_LSHR x, (G_CONSTANT i32 16)
5397 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5398 auto SrlAmount = getIConstantVRegValWithLookThrough(
5399 TruncOp->getOperand(2).getReg(), MRI);
5400 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5401 MachineInstr *SrlOp =
5402 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5403 return stripBitCast(SrlOp, MRI);
5404 }
5405 }
5406
5407 // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5408 // 1, 0 swaps the low/high 16 bits.
5409 // 1, 1 sets the high 16 bits to be the same as the low 16.
5410 // In either case, it selects the high elements.
5411 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5412 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5413 LLT::fixed_vector(2, 16));
5414
5415 ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5416 assert(Mask.size() == 2);
5417
5418 if (Mask[0] == 1 && Mask[1] <= 1) {
5419 MachineInstr *LHS =
5420 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5421 return stripBitCast(LHS, MRI);
5422 }
5423 }
5424
5425 return nullptr;
5426 }
5427
5428 std::pair<Register, unsigned>
5429 AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5430 bool &Matched) const {
5431 Matched = false;
5432
5433 Register Src;
5434 unsigned Mods;
5435 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5436
5437 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5438 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5439 MachineOperand *MO = &MI->getOperand(1);
5440 Src = MO->getReg();
5441 MI = getDefIgnoringCopies(Src, *MRI);
5442
5443 assert(MRI->getType(Src) == LLT::scalar(16));
5444
5445 // See through bitcasts.
5446 // FIXME: Would be nice to use stripBitCast here.
5447 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5448 MO = &MI->getOperand(1);
5449 Src = MO->getReg();
5450 MI = getDefIgnoringCopies(Src, *MRI);
5451 }
5452
5453 const auto CheckAbsNeg = [&]() {
5454 // Be careful about folding modifiers if we already have an abs. fneg is
5455 // applied last, so we don't want to apply an earlier fneg.
5456 if ((Mods & SISrcMods::ABS) == 0) {
5457 unsigned ModsTmp;
5458 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5459 MI = getDefIgnoringCopies(Src, *MRI);
5460
5461 if ((ModsTmp & SISrcMods::NEG) != 0)
5462 Mods ^= SISrcMods::NEG;
5463
5464 if ((ModsTmp & SISrcMods::ABS) != 0)
5465 Mods |= SISrcMods::ABS;
5466 }
5467 };
5468
5469 CheckAbsNeg();
5470
5471 // op_sel/op_sel_hi decide the source type and source.
5472 // If the source's op_sel_hi is set, it indicates to do a conversion from
5473 // fp16. If the source's op_sel is set, it picks the high half of the
5474 // source register.
5475
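    // op_sel_hi (OP_SEL_1) marks this as an f16 source to be converted to
    // f32; OP_SEL_0 is added below if the value comes from the high half of
    // the 32-bit register.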
5476 Mods |= SISrcMods::OP_SEL_1;
5477
5478 if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5479 Mods |= SISrcMods::OP_SEL_0;
5480 MI = ExtractHiEltMI;
5481 MO = &MI->getOperand(0);
5482 Src = MO->getReg();
5483
5484 CheckAbsNeg();
5485 }
5486
5487 Matched = true;
5488 }
5489
5490 return {Src, Mods};
5491 }
5492
5493 InstructionSelector::ComplexRendererFns
5494 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5495 MachineOperand &Root) const {
5496 Register Src;
5497 unsigned Mods;
5498 bool Matched;
5499 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5500 if (!Matched)
5501 return {};
5502
5503 return {{
5504 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5505 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5506 }};
5507 }
5508
5509 InstructionSelector::ComplexRendererFns
5510 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5511 Register Src;
5512 unsigned Mods;
5513 bool Matched;
5514 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5515
5516 return {{
5517 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5518 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5519 }};
5520 }
5521
5522 bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5523 MachineInstr &I, Intrinsic::ID IntrID) const {
5524 MachineBasicBlock *MBB = I.getParent();
5525 const DebugLoc &DL = I.getDebugLoc();
5526 Register CCReg = I.getOperand(0).getReg();
5527
5528 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5529
5530 if (HasM0) {
5531 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5532 .addReg(I.getOperand(2).getReg());
5533 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5534 if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5535 return false;
5536 } else {
5537 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5538 .addImm(I.getOperand(2).getImm());
5539 }
5540
5541 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5542
5543 I.eraseFromParent();
5544 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5545 *MRI);
5546 }
5547
5548 unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5549 if (HasInlineConst) {
5550 switch (IntrID) {
5551 default:
5552 llvm_unreachable("not a named barrier op");
5553 case Intrinsic::amdgcn_s_barrier_init:
5554 return AMDGPU::S_BARRIER_INIT_IMM;
5555 case Intrinsic::amdgcn_s_barrier_join:
5556 return AMDGPU::S_BARRIER_JOIN_IMM;
5557 case Intrinsic::amdgcn_s_wakeup_barrier:
5558 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5559 case Intrinsic::amdgcn_s_get_barrier_state:
5560 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5561 };
5562 } else {
5563 switch (IntrID) {
5564 default:
5565 llvm_unreachable("not a named barrier op");
5566 case Intrinsic::amdgcn_s_barrier_init:
5567 return AMDGPU::S_BARRIER_INIT_M0;
5568 case Intrinsic::amdgcn_s_barrier_join:
5569 return AMDGPU::S_BARRIER_JOIN_M0;
5570 case Intrinsic::amdgcn_s_wakeup_barrier:
5571 return AMDGPU::S_WAKEUP_BARRIER_M0;
5572 case Intrinsic::amdgcn_s_get_barrier_state:
5573 return AMDGPU::S_GET_BARRIER_STATE_M0;
5574 };
5575 }
5576 }
5577
5578 bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5579 MachineInstr &I, Intrinsic::ID IntrID) const {
5580 MachineBasicBlock *MBB = I.getParent();
5581 const DebugLoc &DL = I.getDebugLoc();
5582 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5583 ? I.getOperand(2)
5584 : I.getOperand(1);
5585 std::optional<int64_t> BarValImm =
5586 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5587 Register M0Val;
5588 Register TmpReg0;
5589
5590 // For S_BARRIER_INIT, member count will always be read from M0[16:22]
5591 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5592 Register MemberCount = I.getOperand(2).getReg();
5593 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5594 // TODO: This should be expanded during legalization so that the S_LSHL
5595 // and S_OR can be constant-folded
5596 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5597 .addImm(16)
5598 .addReg(MemberCount);
5599 M0Val = TmpReg0;
5600 }
5601
5602 // If not inlinable, get reference to barrier depending on the instruction
5603 if (!BarValImm) {
5604 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5605 // If the reference to the barrier id is not an inlinable constant then it must be
5606 // referenced with M0[4:0]. Perform an OR with the member count to include
5607 // it in M0 for S_BARRIER_INIT.
5608 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5609 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5610 .addReg(BarOp.getReg())
5611 .addReg(TmpReg0);
5612 M0Val = TmpReg1;
5613 } else {
5614 M0Val = BarOp.getReg();
5615 }
5616 }
5617
5618 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5619 if (M0Val) {
5620 auto CopyMIB =
5621 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5622 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5623 }
5624
5625 MachineInstrBuilder MIB;
5626 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5627 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5628
5629 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5630 MIB.addDef(I.getOperand(0).getReg());
5631
5632 if (BarValImm)
5633 MIB.addImm(*BarValImm);
5634
5635 I.eraseFromParent();
5636 return true;
5637 }
5638
5639 bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5640 MachineBasicBlock *BB = I.getParent();
5641 const DebugLoc &DL = I.getDebugLoc();
5642 Register CCReg = I.getOperand(0).getReg();
5643
5644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5645 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5646
5647 I.eraseFromParent();
5648 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5649 *MRI);
5650 }
5651
5652 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5653 const MachineInstr &MI,
5654 int OpIdx) const {
5655 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5656 "Expected G_CONSTANT");
5657 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5658 }
5659
5660 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5661 const MachineInstr &MI,
5662 int OpIdx) const {
5663 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5664 "Expected G_CONSTANT");
5665 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5666 }
5667
5668 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5669 const MachineInstr &MI,
5670 int OpIdx) const {
5671 assert(OpIdx == -1);
5672
5673 const MachineOperand &Op = MI.getOperand(1);
5674 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5675 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5676 else {
5677 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5678 MIB.addImm(Op.getCImm()->getSExtValue());
5679 }
5680 }
5681
5682 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5683 const MachineInstr &MI,
5684 int OpIdx) const {
5685 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5686 "Expected G_CONSTANT");
5687 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5688 }
5689
5690 /// This only really exists to satisfy DAG type checking machinery, so is a
5691 /// no-op here.
5692 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5693 const MachineInstr &MI,
5694 int OpIdx) const {
5695 MIB.addImm(MI.getOperand(OpIdx).getImm());
5696 }
5697
5698 void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5699 const MachineInstr &MI,
5700 int OpIdx) const {
5701 assert(OpIdx >= 0 && "expected to match an immediate operand");
5702 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5703 }
5704
5705 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5706 const MachineInstr &MI,
5707 int OpIdx) const {
5708 assert(OpIdx >= 0 && "expected to match an immediate operand");
5709 MIB.addImm(MI.getOperand(OpIdx).getImm() &
5710 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5711 : AMDGPU::CPol::ALL_pregfx12));
5712 }
5713
5714 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5715 const MachineInstr &MI,
5716 int OpIdx) const {
5717 assert(OpIdx >= 0 && "expected to match an immediate operand");
5718 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5719 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5720 : AMDGPU::CPol::SWZ_pregfx12);
5721 MIB.addImm(Swizzle);
5722 }
5723
5724 void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5725 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5726 assert(OpIdx >= 0 && "expected to match an immediate operand");
5727 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5728 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5729 : AMDGPU::CPol::ALL_pregfx12);
5730 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5731 }
5732
5733 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5734 const MachineInstr &MI,
5735 int OpIdx) const {
5736 MIB.addFrameIndex(MI.getOperand(1).getIndex());
5737 }
5738
5739 void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5740 const MachineInstr &MI,
5741 int OpIdx) const {
5742 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5743 int ExpVal = APF.getExactLog2Abs();
5744 assert(ExpVal != INT_MIN);
5745 MIB.addImm(ExpVal);
5746 }
5747
5748 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
5749 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
5750 }
5751
5752 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
5753 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
5754 }
5755
5756 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
5757 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
5758 }
5759
5760 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5761 return TII.isInlineConstant(Imm);
5762 }
5763