//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
                                        CodeGenCoverage &CoverageInfo) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  InstructionSelector::setupMF(MF, KB, CoverageInfo);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
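      // Compare the masked value against zero so the VCC-bank destination
      // ends up holding a proper per-lane wave mask.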
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

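  // Split the 64-bit immediate and return the 32-bit half selected by the
  // subregister index.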
  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

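  // VCC-bank booleans are wave masks, so they need the 64-bit opcodes when
  // the subtarget runs in wave64 mode.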
  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

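  // A VCC-bank carry-out selects the VALU form; otherwise use the SALU form
  // and thread the carry through SCC.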
  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
    MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  auto ConstSrc1 =
      getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;

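      // Both sources are constants, so fold the packed v2s16 value into a
      // single s_mov_b32.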
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
        .addImm(Lo16 | (Hi16 << 16));
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.RemoveOperand(2);
    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
      .addReg(ShiftSrc0)
      .addImm(16);

    MI.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  Optional<ValueAndVReg> ConstSelect =
    getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    Optional<ValueAndVReg> ConstVal =
      getConstantVRegValWithLookThrough(Val, *MRI, true, true);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
                             .add(I.getOperand(2))
                             .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
                           .add(I.getOperand(2))
                           .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
                           .add(I.getOperand(2))
                           .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;

  if (Size != STI.getWavefrontSize())
    return false;

  Optional<ValueAndVReg> Arg =
      getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);

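  // A constant argument folds directly: 0 produces an all-zero mask, and -1
  // (true for every lane) becomes a copy of exec.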
  if (Arg.hasValue()) {
    const int64_t Value = Arg.getValue().Value.getSExtValue();
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) { // all ones
      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
    } else
      return false;
  } else {
    Register SrcReg = I.getOperand(2).getReg();
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
    M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
    AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);

  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    MIB.addImm(MFI->getLDSSize());
  } else {
    Module *M = MF->getFunction().getParent();
    const GlobalValue *GV
      = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
  }

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
    .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
    MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
          "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);

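  // Pack the DS_ORDERED_COUNT offset field: offset0 carries the byte-scaled
  // ordered-count index, offset1 carries the control bits (wave_release,
  // wave_done, shader type, instruction, and the dword count on GFX10+).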
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

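    // The hardware adds M0[21:16] into the resource id offset, so shift the
    // variable part of the offset into that bit position.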
    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);
    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
               .addImm(Offset)
               .addImm(IsGDS ? -1 : 0)
               .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
  if (TM.getOptLevel() > CodeGenOpt::None) {
    unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
    if (WGSize <= STI.getWavefrontSize()) {
      MachineBasicBlock *MBB = MI.getParent();
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
      MI.eraseFromParent();
      return true;
    }
  }
  return selectImpl(MI, *CoverageInfo);
}

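// Decode the texfailctrl immediate: bit 0 enables TFE and bit 1 enables LWE;
// any other set bit makes the control invalid.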
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

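// Decode the cachepolicy immediate: bit 0 is GLC, bit 1 is SLC, and bit 2 is
// DLC; any remaining set bit makes the policy invalid.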
static bool parseCachePolicy(uint64_t Value,
                             bool *GLC, bool *SLC, bool *DLC) {
  if (GLC) {
    *GLC = (Value & 0x1) ? 1 : 0;
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = (Value & 0x2) ? 1 : 0;
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = (Value & 0x4) ? 1 : 0;
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
    MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
      AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
      AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);

  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = false;

  bool Unorm;
  if (!BaseOpcode->Sampler)
    Unorm = true;
  else
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients
  if (IsA16 && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    // One memoperand is mandatory, except for getresinfo.
    // FIXME: Check this in verifier.
    if (!MI.memoperands_empty()) {
      const MachineMemOperand *MMO = *MI.memoperands_begin();

      // Infer d16 from the memory size, as the register type will be mangled by
      // unpacked subtargets, or by TFE.
      IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
    }

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      if (IsD16 && !STI.hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
    }
  }

1561 // Optimize _L to _LZ when _L is zero
1562 if (LZMappingInfo) {
1563 // The legalizer replaced the register with an immediate 0 if we need to
1564 // change the opcode.
1565 const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
1566 if (Lod.isImm()) {
1567 assert(Lod.getImm() == 0);
1568 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
1569 }
1570 }
1571
1572 // Optimize _mip away, when 'lod' is zero
1573 if (MIPMappingInfo) {
1574 const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
1575 if (Lod.isImm()) {
1576 assert(Lod.getImm() == 0);
1577 IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
1578 }
1579 }
1580
1581 // Set G16 opcode
1582 if (IsG16 && !IsA16) {
1583 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1584 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1585 assert(G16MappingInfo);
1586 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1587 }
1588
1589 // TODO: Check this in verifier.
1590 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1591
1592 bool GLC = false;
1593 bool SLC = false;
1594 bool DLC = false;
1595 if (BaseOpcode->Atomic) {
1596 GLC = true; // TODO no-return optimization
1597 if (!parseCachePolicy(
1598 MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
1599 &SLC, IsGFX10Plus ? &DLC : nullptr))
1600 return false;
1601 } else {
1602 if (!parseCachePolicy(
1603 MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
1604 &SLC, IsGFX10Plus ? &DLC : nullptr))
1605 return false;
1606 }
1607
1608 int NumVAddrRegs = 0;
1609 int NumVAddrDwords = 0;
1610 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1611 // Skip the $noregs and 0s inserted during legalization.
1612 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1613 if (!AddrOp.isReg())
1614 continue; // XXX - Break?
1615
1616 Register Addr = AddrOp.getReg();
1617 if (!Addr)
1618 break;
1619
1620 ++NumVAddrRegs;
1621 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1622 }
1623
1624 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1625 // NSA, these should have been packed into a single value in the first
1626 // address register.
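// Multiple remaining address registers that are each a single dword wide
// mean the addresses were left unpacked for the NSA encoding.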
1627 const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1628 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1629 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1630 return false;
1631 }
1632
1633 if (IsTexFail)
1634 ++NumVDataDwords;
1635
1636 int Opcode = -1;
1637 if (IsGFX10Plus) {
1638 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1639 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1640 : AMDGPU::MIMGEncGfx10Default,
1641 NumVDataDwords, NumVAddrDwords);
1642 } else {
1643 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1644 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1645 NumVDataDwords, NumVAddrDwords);
1646 if (Opcode == -1)
1647 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1648 NumVDataDwords, NumVAddrDwords);
1649 }
1650 assert(Opcode != -1);
1651
1652 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1653 .cloneMemRefs(MI);
1654
1655 if (VDataOut) {
1656 if (BaseOpcode->AtomicX2) {
1657 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1658
1659 Register TmpReg = MRI->createVirtualRegister(
1660 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1661 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1662
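// The X2 atomic defines a destination twice as wide as the value of
// interest; the returned value is in the low half, so copy out just that
// subregister.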
1663 MIB.addDef(TmpReg);
1664 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1665 .addReg(TmpReg, RegState::Kill, SubReg);
1666
1667 } else {
1668 MIB.addDef(VDataOut); // vdata output
1669 }
1670 }
1671
1672 if (VDataIn)
1673 MIB.addReg(VDataIn); // vdata input
1674
1675 for (int I = 0; I != NumVAddrRegs; ++I) {
1676 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1677 if (SrcOp.isReg()) {
1678 assert(SrcOp.getReg() != 0);
1679 MIB.addReg(SrcOp.getReg());
1680 }
1681 }
1682
1683 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1684 if (BaseOpcode->Sampler)
1685 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1686
1687 MIB.addImm(DMask); // dmask
1688
1689 if (IsGFX10Plus)
1690 MIB.addImm(DimInfo->Encoding);
1691 MIB.addImm(Unorm);
1692 if (IsGFX10Plus)
1693 MIB.addImm(DLC);
1694
1695 MIB.addImm(GLC);
1696 MIB.addImm(SLC);
1697 MIB.addImm(IsA16 && // a16 or r128
1698 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1699 if (IsGFX10Plus)
1700 MIB.addImm(IsA16 ? -1 : 0);
1701
1702 MIB.addImm(TFE); // tfe
1703 MIB.addImm(LWE); // lwe
1704 if (!IsGFX10Plus)
1705 MIB.addImm(DimInfo->DA ? -1 : 0);
1706 if (BaseOpcode->HasD16)
1707 MIB.addImm(IsD16 ? -1 : 0);
1708
1709 MI.eraseFromParent();
1710 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1711 }
1712
1713 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1714 MachineInstr &I) const {
1715 unsigned IntrinsicID = I.getIntrinsicID();
1716 switch (IntrinsicID) {
1717 case Intrinsic::amdgcn_end_cf:
1718 return selectEndCfIntrinsic(I);
1719 case Intrinsic::amdgcn_ds_ordered_add:
1720 case Intrinsic::amdgcn_ds_ordered_swap:
1721 return selectDSOrderedIntrinsic(I, IntrinsicID);
1722 case Intrinsic::amdgcn_ds_gws_init:
1723 case Intrinsic::amdgcn_ds_gws_barrier:
1724 case Intrinsic::amdgcn_ds_gws_sema_v:
1725 case Intrinsic::amdgcn_ds_gws_sema_br:
1726 case Intrinsic::amdgcn_ds_gws_sema_p:
1727 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1728 return selectDSGWSIntrinsic(I, IntrinsicID);
1729 case Intrinsic::amdgcn_ds_append:
1730 return selectDSAppendConsume(I, true);
1731 case Intrinsic::amdgcn_ds_consume:
1732 return selectDSAppendConsume(I, false);
1733 case Intrinsic::amdgcn_s_barrier:
1734 return selectSBarrier(I);
1735 case Intrinsic::amdgcn_global_atomic_fadd:
1736 return selectGlobalAtomicFaddIntrinsic(I);
1737 default: {
1738 return selectImpl(I, *CoverageInfo);
1739 }
1740 }
1741 }
1742
1743 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1744 if (selectImpl(I, *CoverageInfo))
1745 return true;
1746
1747 MachineBasicBlock *BB = I.getParent();
1748 const DebugLoc &DL = I.getDebugLoc();
1749
1750 Register DstReg = I.getOperand(0).getReg();
1751 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1752 assert(Size <= 32 || Size == 64);
1753 const MachineOperand &CCOp = I.getOperand(1);
1754 Register CCReg = CCOp.getReg();
1755 if (!isVCC(CCReg, *MRI)) {
1756 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1757 AMDGPU::S_CSELECT_B32;
1758 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1759 .addReg(CCReg);
1760
1761 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1762 // bank, because it does not cover the register class that we use to represent
1763 // it. So we need to set the register class manually here.
1764 if (!MRI->getRegClassOrNull(CCReg))
1765 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1766 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1767 .add(I.getOperand(2))
1768 .add(I.getOperand(3));
1769
1770 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1771 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1772 I.eraseFromParent();
1773 return Ret;
1774 }
1775
1776 // Wide VGPR select should have been split in RegBankSelect.
1777 if (Size > 32)
1778 return false;
1779
1780 MachineInstr *Select =
1781 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1782 .addImm(0)
1783 .add(I.getOperand(3))
1784 .addImm(0)
1785 .add(I.getOperand(2))
1786 .add(I.getOperand(1));
1787
1788 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1789 I.eraseFromParent();
1790 return Ret;
1791 }
1792
1793 static int sizeToSubRegIndex(unsigned Size) {
1794 switch (Size) {
1795 case 32:
1796 return AMDGPU::sub0;
1797 case 64:
1798 return AMDGPU::sub0_sub1;
1799 case 96:
1800 return AMDGPU::sub0_sub1_sub2;
1801 case 128:
1802 return AMDGPU::sub0_sub1_sub2_sub3;
1803 case 256:
1804 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1805 default:
1806 if (Size < 32)
1807 return AMDGPU::sub0;
1808 if (Size > 256)
1809 return -1;
1810 return sizeToSubRegIndex(PowerOf2Ceil(Size));
1811 }
1812 }
1813
1814 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1815 Register DstReg = I.getOperand(0).getReg();
1816 Register SrcReg = I.getOperand(1).getReg();
1817 const LLT DstTy = MRI->getType(DstReg);
1818 const LLT SrcTy = MRI->getType(SrcReg);
1819 const LLT S1 = LLT::scalar(1);
1820
1821 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1822 const RegisterBank *DstRB;
1823 if (DstTy == S1) {
1824 // This is a special case. We don't treat s1 for legalization artifacts as
1825 // vcc booleans.
1826 DstRB = SrcRB;
1827 } else {
1828 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1829 if (SrcRB != DstRB)
1830 return false;
1831 }
1832
1833 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1834
1835 unsigned DstSize = DstTy.getSizeInBits();
1836 unsigned SrcSize = SrcTy.getSizeInBits();
1837
1838 const TargetRegisterClass *SrcRC
1839 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1840 const TargetRegisterClass *DstRC
1841 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1842 if (!SrcRC || !DstRC)
1843 return false;
1844
1845 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1846 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1847 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1848 return false;
1849 }
1850
1851 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1852 MachineBasicBlock *MBB = I.getParent();
1853 const DebugLoc &DL = I.getDebugLoc();
1854
1855 Register LoReg = MRI->createVirtualRegister(DstRC);
1856 Register HiReg = MRI->createVirtualRegister(DstRC);
1857 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1858 .addReg(SrcReg, 0, AMDGPU::sub0);
1859 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1860 .addReg(SrcReg, 0, AMDGPU::sub1);
1861
1862 if (IsVALU && STI.hasSDWA()) {
1863 // Write the low 16-bits of the high element into the high 16-bits of the
1864 // low element.
1865 MachineInstr *MovSDWA =
1866 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1867 .addImm(0) // $src0_modifiers
1868 .addReg(HiReg) // $src0
1869 .addImm(0) // $clamp
1870 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
1871 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1872 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
1873 .addReg(LoReg, RegState::Implicit);
1874 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1875 } else {
1876 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1877 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1878 Register ImmReg = MRI->createVirtualRegister(DstRC);
1879 if (IsVALU) {
1880 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1881 .addImm(16)
1882 .addReg(HiReg);
1883 } else {
1884 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1885 .addReg(HiReg)
1886 .addImm(16);
1887 }
1888
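// Without SDWA, mask the low element to its low 16 bits and OR in the high
// element shifted into the upper half.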
1889 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1890 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1891 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1892
1893 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1894 .addImm(0xffff);
1895 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1896 .addReg(LoReg)
1897 .addReg(ImmReg);
1898 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1899 .addReg(TmpReg0)
1900 .addReg(TmpReg1);
1901 }
1902
1903 I.eraseFromParent();
1904 return true;
1905 }
1906
1907 if (!DstTy.isScalar())
1908 return false;
1909
1910 if (SrcSize > 32) {
1911 int SubRegIdx = sizeToSubRegIndex(DstSize);
1912 if (SubRegIdx == -1)
1913 return false;
1914
1915 // Deal with weird cases where the class only partially supports the subreg
1916 // index.
1917 const TargetRegisterClass *SrcWithSubRC
1918 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1919 if (!SrcWithSubRC)
1920 return false;
1921
1922 if (SrcWithSubRC != SrcRC) {
1923 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1924 return false;
1925 }
1926
1927 I.getOperand(1).setSubReg(SubRegIdx);
1928 }
1929
1930 I.setDesc(TII.get(TargetOpcode::COPY));
1931 return true;
1932 }
1933
1934 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1935 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1936 Mask = maskTrailingOnes<unsigned>(Size);
1937 int SignedMask = static_cast<int>(Mask);
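// Values in [-16, 64] are integer inline immediates and don't need a literal.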
1938 return SignedMask >= -16 && SignedMask <= 64;
1939 }
1940
1941 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1942 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1943 Register Reg, const MachineRegisterInfo &MRI,
1944 const TargetRegisterInfo &TRI) const {
1945 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1946 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1947 return RB;
1948
1949 // Ignore the type, since we don't use vcc in artifacts.
1950 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1951 return &RBI.getRegBankFromRegClass(*RC, LLT());
1952 return nullptr;
1953 }
1954
1955 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1956 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1957 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1958 const DebugLoc &DL = I.getDebugLoc();
1959 MachineBasicBlock &MBB = *I.getParent();
1960 const Register DstReg = I.getOperand(0).getReg();
1961 const Register SrcReg = I.getOperand(1).getReg();
1962
1963 const LLT DstTy = MRI->getType(DstReg);
1964 const LLT SrcTy = MRI->getType(SrcReg);
1965 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1966 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1967 const unsigned DstSize = DstTy.getSizeInBits();
1968 if (!DstTy.isScalar())
1969 return false;
1970
1971 // Artifact casts should never use vcc.
1972 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1973
1974 // FIXME: This should probably be illegal and split earlier.
1975 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1976 if (DstSize <= 32)
1977 return selectCOPY(I);
1978
1979 const TargetRegisterClass *SrcRC =
1980 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1981 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1982 const TargetRegisterClass *DstRC =
1983 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1984
1985 Register UndefReg = MRI->createVirtualRegister(SrcRC);
1986 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1987 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1988 .addReg(SrcReg)
1989 .addImm(AMDGPU::sub0)
1990 .addReg(UndefReg)
1991 .addImm(AMDGPU::sub1);
1992 I.eraseFromParent();
1993
1994 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1995 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1996 }
1997
1998 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1999 // 64-bit should have been split up in RegBankSelect
2000
2001 // Try to use an and with a mask if it will save code size.
2002 unsigned Mask;
2003 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2004 MachineInstr *ExtI =
2005 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2006 .addImm(Mask)
2007 .addReg(SrcReg);
2008 I.eraseFromParent();
2009 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2010 }
2011
2012 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2013 MachineInstr *ExtI =
2014 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2015 .addReg(SrcReg)
2016 .addImm(0) // Offset
2017 .addImm(SrcSize); // Width
2018 I.eraseFromParent();
2019 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2020 }
2021
2022 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2023 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2024 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2025 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2026 return false;
2027
2028 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2029 const unsigned SextOpc = SrcSize == 8 ?
2030 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2031 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2032 .addReg(SrcReg);
2033 I.eraseFromParent();
2034 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2035 }
2036
2037 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2038 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2039
2040 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2041 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2042 // We need a 64-bit register source, but the high bits don't matter.
2043 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2044 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2045 unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
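// For G_SEXT_INREG the source was constrained to a 64-bit class above, so
// take its low subregister; otherwise the 32-bit source is used whole.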
2046
2047 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2048 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2049 .addReg(SrcReg, 0, SubReg)
2050 .addImm(AMDGPU::sub0)
2051 .addReg(UndefReg)
2052 .addImm(AMDGPU::sub1);
2053
2054 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2055 .addReg(ExtReg)
2056 .addImm(SrcSize << 16);
2057
2058 I.eraseFromParent();
2059 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2060 }
2061
2062 unsigned Mask;
2063 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2064 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2065 .addReg(SrcReg)
2066 .addImm(Mask);
2067 } else {
2068 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2069 .addReg(SrcReg)
2070 .addImm(SrcSize << 16);
2071 }
2072
2073 I.eraseFromParent();
2074 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2075 }
2076
2077 return false;
2078 }
2079
2080 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2081 MachineBasicBlock *BB = I.getParent();
2082 MachineOperand &ImmOp = I.getOperand(1);
2083 Register DstReg = I.getOperand(0).getReg();
2084 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2085
2086 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2087 if (ImmOp.isFPImm()) {
2088 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2089 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2090 } else if (ImmOp.isCImm()) {
2091 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2092 } else {
2093 llvm_unreachable("Not supported by g_constants");
2094 }
2095
2096 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2097 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2098
2099 unsigned Opcode;
2100 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2101 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2102 } else {
2103 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2104
2105 // We should never produce s1 values on banks other than VCC. If the user of
2106 // this already constrained the register, we may incorrectly think it's VCC
2107 // if it wasn't originally.
2108 if (Size == 1)
2109 return false;
2110 }
2111
2112 if (Size != 64) {
2113 I.setDesc(TII.get(Opcode));
2114 I.addImplicitDefUseOperands(*MF);
2115 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2116 }
2117
2118 const DebugLoc &DL = I.getDebugLoc();
2119
2120 APInt Imm(Size, I.getOperand(1).getImm());
2121
2122 MachineInstr *ResInst;
2123 if (IsSgpr && TII.isInlineConstant(Imm)) {
2124 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2125 .addImm(I.getOperand(1).getImm());
2126 } else {
2127 const TargetRegisterClass *RC = IsSgpr ?
2128 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2129 Register LoReg = MRI->createVirtualRegister(RC);
2130 Register HiReg = MRI->createVirtualRegister(RC);
2131
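// Materialize the 64-bit constant as two 32-bit moves and recombine the
// halves with a REG_SEQUENCE.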
2132 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2133 .addImm(Imm.trunc(32).getZExtValue());
2134
2135 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2136 .addImm(Imm.ashr(32).getZExtValue());
2137
2138 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2139 .addReg(LoReg)
2140 .addImm(AMDGPU::sub0)
2141 .addReg(HiReg)
2142 .addImm(AMDGPU::sub1);
2143 }
2144
2145 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2146 // work for target-independent opcodes.
2147 I.eraseFromParent();
2148 const TargetRegisterClass *DstRC =
2149 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2150 if (!DstRC)
2151 return true;
2152 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2153 }
2154
2155 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2156 // Only manually handle the f64 SGPR case.
2157 //
2158 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2159 // the bit ops theoretically have a second result due to the implicit def of
2160 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2161 // that is easy by disabling the check. The result works, but uses a
2162 // nonsensical sreg32orlds_and_sreg_1 regclass.
2163 //
2164 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2165 // the variadic REG_SEQUENCE operands.
2166
2167 Register Dst = MI.getOperand(0).getReg();
2168 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2169 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2170 MRI->getType(Dst) != LLT::scalar(64))
2171 return false;
2172
2173 Register Src = MI.getOperand(1).getReg();
2174 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2175 if (Fabs)
2176 Src = Fabs->getOperand(1).getReg();
2177
2178 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2179 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2180 return false;
2181
2182 MachineBasicBlock *BB = MI.getParent();
2183 const DebugLoc &DL = MI.getDebugLoc();
2184 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2185 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2186 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2187 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2188
2189 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2190 .addReg(Src, 0, AMDGPU::sub0);
2191 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2192 .addReg(Src, 0, AMDGPU::sub1);
2193 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2194 .addImm(0x80000000);
2195
2196 // Set or toggle sign bit.
2197 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2198 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2199 .addReg(HiReg)
2200 .addReg(ConstReg);
2201 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2202 .addReg(LoReg)
2203 .addImm(AMDGPU::sub0)
2204 .addReg(OpReg)
2205 .addImm(AMDGPU::sub1);
2206 MI.eraseFromParent();
2207 return true;
2208 }
2209
2210 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2211 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2212 Register Dst = MI.getOperand(0).getReg();
2213 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2214 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2215 MRI->getType(Dst) != LLT::scalar(64))
2216 return false;
2217
2218 Register Src = MI.getOperand(1).getReg();
2219 MachineBasicBlock *BB = MI.getParent();
2220 const DebugLoc &DL = MI.getDebugLoc();
2221 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2222 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2223 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2224 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2225
2226 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2227 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2228 return false;
2229
2230 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2231 .addReg(Src, 0, AMDGPU::sub0);
2232 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2233 .addReg(Src, 0, AMDGPU::sub1);
2234 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2235 .addImm(0x7fffffff);
2236
2237 // Clear sign bit.
2238 // TODO: Should this use S_BITSET0_*?
2239 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2240 .addReg(HiReg)
2241 .addReg(ConstReg);
2242 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2243 .addReg(LoReg)
2244 .addImm(AMDGPU::sub0)
2245 .addReg(OpReg)
2246 .addImm(AMDGPU::sub1);
2247
2248 MI.eraseFromParent();
2249 return true;
2250 }
2251
2252 static bool isConstant(const MachineInstr &MI) {
2253 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2254 }
2255
2256 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2257 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2258
2259 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2260
2261 assert(PtrMI);
2262
2263 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2264 return;
2265
2266 GEPInfo GEPInfo(*PtrMI);
2267
2268 for (unsigned i = 1; i != 3; ++i) {
2269 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2270 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2271 assert(OpDef);
2272 if (i == 2 && isConstant(*OpDef)) {
2273 // TODO: Could handle constant base + variable offset, but a combine
2274 // probably should have commuted it.
2275 assert(GEPInfo.Imm == 0);
2276 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2277 continue;
2278 }
2279 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2280 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2281 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2282 else
2283 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2284 }
2285
2286 AddrInfo.push_back(GEPInfo);
2287 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2288 }
2289
2290 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2291 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2292 }
2293
2294 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2295 if (!MI.hasOneMemOperand())
2296 return false;
2297
2298 const MachineMemOperand *MMO = *MI.memoperands_begin();
2299 const Value *Ptr = MMO->getValue();
2300
2301 // UndefValue means this is a load of a kernel input. These are uniform.
2302 // Sometimes LDS instructions have constant pointers.
2303 // If Ptr is null, then that means this mem operand contains a
2304 // PseudoSourceValue like GOT.
2305 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2306 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2307 return true;
2308
2309 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2310 return true;
2311
2312 const Instruction *I = dyn_cast<Instruction>(Ptr);
2313 return I && I->getMetadata("amdgpu.uniform");
2314 }
2315
2316 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2317 for (const GEPInfo &GEPInfo : AddrInfo) {
2318 if (!GEPInfo.VgprParts.empty())
2319 return true;
2320 }
2321 return false;
2322 }
2323
2324 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2325 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2326 unsigned AS = PtrTy.getAddressSpace();
2327 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2328 STI.ldsRequiresM0Init()) {
2329 MachineBasicBlock *BB = I.getParent();
2330
2331 // If DS instructions require M0 initialization, insert it before selecting.
2332 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2333 .addImm(-1);
2334 }
2335 }
2336
2337 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2338 MachineInstr &I) const {
2339 initM0(I);
2340 return selectImpl(I, *CoverageInfo);
2341 }
2342
2343 // TODO: No rtn optimization.
2344 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2345 MachineInstr &MI) const {
2346 Register PtrReg = MI.getOperand(1).getReg();
2347 const LLT PtrTy = MRI->getType(PtrReg);
2348 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2349 STI.useFlatForGlobal())
2350 return selectImpl(MI, *CoverageInfo);
2351
2352 Register DstReg = MI.getOperand(0).getReg();
2353 const LLT Ty = MRI->getType(DstReg);
2354 const bool Is64 = Ty.getSizeInBits() == 64;
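// The RTN cmpswap pseudo defines a register twice the width of the result;
// the value of interest is returned in the low half and extracted below.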
2355 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2356 Register TmpReg = MRI->createVirtualRegister(
2357 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2358
2359 const DebugLoc &DL = MI.getDebugLoc();
2360 MachineBasicBlock *BB = MI.getParent();
2361
2362 Register VAddr, RSrcReg, SOffset;
2363 int64_t Offset = 0;
2364
2365 unsigned Opcode;
2366 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2367 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2368 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2369 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2370 RSrcReg, SOffset, Offset)) {
2371 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2372 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2373 } else
2374 return selectImpl(MI, *CoverageInfo);
2375
2376 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2377 .addReg(MI.getOperand(2).getReg());
2378
2379 if (VAddr)
2380 MIB.addReg(VAddr);
2381
2382 MIB.addReg(RSrcReg);
2383 if (SOffset)
2384 MIB.addReg(SOffset);
2385 else
2386 MIB.addImm(0);
2387
2388 MIB.addImm(Offset);
2389 MIB.addImm(1); // glc
2390 MIB.addImm(0); // slc
2391 MIB.cloneMemRefs(MI);
2392
2393 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2394 .addReg(TmpReg, RegState::Kill, SubReg);
2395
2396 MI.eraseFromParent();
2397
2398 MRI->setRegClass(
2399 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2400 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2401 }
2402
2403 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2404 MachineBasicBlock *BB = I.getParent();
2405 MachineOperand &CondOp = I.getOperand(0);
2406 Register CondReg = CondOp.getReg();
2407 const DebugLoc &DL = I.getDebugLoc();
2408
2409 unsigned BrOpcode;
2410 Register CondPhysReg;
2411 const TargetRegisterClass *ConstrainRC;
2412
2413 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2414 // whether the branch is uniform when selecting the instruction. In
2415 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2416 // RegBankSelect knows what it's doing if the branch condition is scc, even
2417 // though it currently does not.
2418 if (!isVCC(CondReg, *MRI)) {
2419 if (MRI->getType(CondReg) != LLT::scalar(32))
2420 return false;
2421
2422 CondPhysReg = AMDGPU::SCC;
2423 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2424 ConstrainRC = &AMDGPU::SReg_32RegClass;
2425 } else {
2426 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2427 // We sort of know that a VCC producer, based on the register bank, ands
2428 // inactive lanes with 0. What if there was a logical operation with vcc
2429 // producers in different blocks/with different exec masks?
2430 // FIXME: Should scc->vcc copies and with exec?
2431 CondPhysReg = TRI.getVCC();
2432 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2433 ConstrainRC = TRI.getBoolRC();
2434 }
2435
2436 if (!MRI->getRegClassOrNull(CondReg))
2437 MRI->setRegClass(CondReg, ConstrainRC);
2438
2439 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2440 .addReg(CondReg);
2441 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2442 .addMBB(I.getOperand(1).getMBB());
2443
2444 I.eraseFromParent();
2445 return true;
2446 }
2447
2448 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2449 MachineInstr &I) const {
2450 Register DstReg = I.getOperand(0).getReg();
2451 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2452 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2453 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2454 if (IsVGPR)
2455 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2456
2457 return RBI.constrainGenericRegister(
2458 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2459 }
2460
2461 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2462 Register DstReg = I.getOperand(0).getReg();
2463 Register SrcReg = I.getOperand(1).getReg();
2464 Register MaskReg = I.getOperand(2).getReg();
2465 LLT Ty = MRI->getType(DstReg);
2466 LLT MaskTy = MRI->getType(MaskReg);
2467
2468 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2469 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2470 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2471 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2472 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2473 return false;
2474
2475 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2476 const TargetRegisterClass &RegRC
2477 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2478
2479 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2480 *MRI);
2481 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2482 *MRI);
2483 const TargetRegisterClass *MaskRC =
2484 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2485
2486 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2487 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2488 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2489 return false;
2490
2491 MachineBasicBlock *BB = I.getParent();
2492 const DebugLoc &DL = I.getDebugLoc();
2493 if (Ty.getSizeInBits() == 32) {
2494 assert(MaskTy.getSizeInBits() == 32 &&
2495 "ptrmask should have been narrowed during legalize");
2496
2497 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2498 .addReg(SrcReg)
2499 .addReg(MaskReg);
2500 I.eraseFromParent();
2501 return true;
2502 }
2503
2504 Register HiReg = MRI->createVirtualRegister(&RegRC);
2505 Register LoReg = MRI->createVirtualRegister(&RegRC);
2506
2507 // Extract the subregisters from the source pointer.
2508 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2509 .addReg(SrcReg, 0, AMDGPU::sub0);
2510 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2511 .addReg(SrcReg, 0, AMDGPU::sub1);
2512
2513 Register MaskedLo, MaskedHi;
2514
2515 // Try to avoid emitting a bit operation when we only need to touch half of
2516 // the 64-bit pointer.
2517 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2518
2519 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2520 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2521 if ((MaskOnes & MaskLo32) == MaskLo32) {
2522 // If all the bits in the low half are 1, we only need a copy for it.
2523 MaskedLo = LoReg;
2524 } else {
2525 // Extract the mask subregister and apply the and.
2526 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2527 MaskedLo = MRI->createVirtualRegister(&RegRC);
2528
2529 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2530 .addReg(MaskReg, 0, AMDGPU::sub0);
2531 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2532 .addReg(LoReg)
2533 .addReg(MaskLo);
2534 }
2535
2536 if ((MaskOnes & MaskHi32) == MaskHi32) {
2537 // If all the bits in the high half are 1, we only need a copy for it.
2538 MaskedHi = HiReg;
2539 } else {
2540 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2541 MaskedHi = MRI->createVirtualRegister(&RegRC);
2542
2543 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2544 .addReg(MaskReg, 0, AMDGPU::sub1);
2545 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2546 .addReg(HiReg)
2547 .addReg(MaskHi);
2548 }
2549
2550 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2551 .addReg(MaskedLo)
2552 .addImm(AMDGPU::sub0)
2553 .addReg(MaskedHi)
2554 .addImm(AMDGPU::sub1);
2555 I.eraseFromParent();
2556 return true;
2557 }
2558
2559 /// Return the register to use for the index value, and the subregister to use
2560 /// for the indirectly accessed register.
2561 static std::pair<Register, unsigned>
2562 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2563 const SIRegisterInfo &TRI,
2564 const TargetRegisterClass *SuperRC,
2565 Register IdxReg,
2566 unsigned EltSize) {
2567 Register IdxBaseReg;
2568 int Offset;
2569
2570 std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2571 if (IdxBaseReg == AMDGPU::NoRegister) {
2572 // This will happen if the index is a known constant. This should ordinarily
2573 // be legalized out, but handle it as a register just in case.
2574 assert(Offset == 0);
2575 IdxBaseReg = IdxReg;
2576 }
2577
2578 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2579
2580 // Skip out of bounds offsets, or else we would end up using an undefined
2581 // register.
2582 if (static_cast<unsigned>(Offset) >= SubRegs.size())
2583 return std::make_pair(IdxReg, SubRegs[0]);
2584 return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2585 }
2586
2587 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2588 MachineInstr &MI) const {
2589 Register DstReg = MI.getOperand(0).getReg();
2590 Register SrcReg = MI.getOperand(1).getReg();
2591 Register IdxReg = MI.getOperand(2).getReg();
2592
2593 LLT DstTy = MRI->getType(DstReg);
2594 LLT SrcTy = MRI->getType(SrcReg);
2595
2596 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2597 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2598 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2599
2600 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2601 // into a waterfall loop.
2602 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2603 return false;
2604
2605 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2606 *MRI);
2607 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2608 *MRI);
2609 if (!SrcRC || !DstRC)
2610 return false;
2611 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2612 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2613 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2614 return false;
2615
2616 MachineBasicBlock *BB = MI.getParent();
2617 const DebugLoc &DL = MI.getDebugLoc();
2618 const bool Is64 = DstTy.getSizeInBits() == 64;
2619
2620 unsigned SubReg;
2621 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2622 DstTy.getSizeInBits() / 8);
2623
2624 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2625 if (DstTy.getSizeInBits() != 32 && !Is64)
2626 return false;
2627
2628 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2629 .addReg(IdxReg);
2630
2631 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2632 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2633 .addReg(SrcReg, 0, SubReg)
2634 .addReg(SrcReg, RegState::Implicit);
2635 MI.eraseFromParent();
2636 return true;
2637 }
2638
2639 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2640 return false;
2641
2642 if (!STI.useVGPRIndexMode()) {
2643 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2644 .addReg(IdxReg);
2645 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2646 .addReg(SrcReg, 0, SubReg)
2647 .addReg(SrcReg, RegState::Implicit);
2648 MI.eraseFromParent();
2649 return true;
2650 }
2651
2652 const MCInstrDesc &GPRIDXDesc =
2653 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2654 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2655 .addReg(SrcReg)
2656 .addReg(IdxReg)
2657 .addImm(SubReg);
2658
2659 MI.eraseFromParent();
2660 return true;
2661 }
2662
2663 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2664 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2665 MachineInstr &MI) const {
2666 Register DstReg = MI.getOperand(0).getReg();
2667 Register VecReg = MI.getOperand(1).getReg();
2668 Register ValReg = MI.getOperand(2).getReg();
2669 Register IdxReg = MI.getOperand(3).getReg();
2670
2671 LLT VecTy = MRI->getType(DstReg);
2672 LLT ValTy = MRI->getType(ValReg);
2673 unsigned VecSize = VecTy.getSizeInBits();
2674 unsigned ValSize = ValTy.getSizeInBits();
2675
2676 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2677 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2678 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2679
2680 assert(VecTy.getElementType() == ValTy);
2681
2682 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2683 // into a waterfall loop.
2684 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2685 return false;
2686
2687 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2688 *MRI);
2689 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2690 *MRI);
2691
2692 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2693 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2694 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2695 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2696 return false;
2697
2698 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2699 return false;
2700
2701 unsigned SubReg;
2702 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2703 ValSize / 8);
2704
2705 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2706 STI.useVGPRIndexMode();
2707
2708 MachineBasicBlock *BB = MI.getParent();
2709 const DebugLoc &DL = MI.getDebugLoc();
2710
2711 if (!IndexMode) {
2712 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2713 .addReg(IdxReg);
2714
2715 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2716 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2717 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2718 .addReg(VecReg)
2719 .addReg(ValReg)
2720 .addImm(SubReg);
2721 MI.eraseFromParent();
2722 return true;
2723 }
2724
2725 const MCInstrDesc &GPRIDXDesc =
2726 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2727 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2728 .addReg(VecReg)
2729 .addReg(ValReg)
2730 .addReg(IdxReg)
2731 .addImm(SubReg);
2732
2733 MI.eraseFromParent();
2734 return true;
2735 }
2736
2737 static bool isZeroOrUndef(int X) {
2738 return X == 0 || X == -1;
2739 }
2740
2741 static bool isOneOrUndef(int X) {
2742 return X == 1 || X == -1;
2743 }
2744
2745 static bool isZeroOrOneOrUndef(int X) {
2746 return X == 0 || X == 1 || X == -1;
2747 }
2748
2749 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2750 // 32-bit register.
2751 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2752 ArrayRef<int> Mask) {
2753 NewMask[0] = Mask[0];
2754 NewMask[1] = Mask[1];
2755 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2756 return Src0;
2757
2758 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2759 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2760
2761 // Shift the mask inputs to be 0/1.
2762 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2763 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2764 return Src1;
2765 }
2766
2767 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2768 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2769 MachineInstr &MI) const {
2770 Register DstReg = MI.getOperand(0).getReg();
2771 Register Src0Reg = MI.getOperand(1).getReg();
2772 Register Src1Reg = MI.getOperand(2).getReg();
2773 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2774
2775 const LLT V2S16 = LLT::vector(2, 16);
2776 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2777 return false;
2778
2779 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2780 return false;
2781
2782 assert(ShufMask.size() == 2);
2783 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2784
2785 MachineBasicBlock *MBB = MI.getParent();
2786 const DebugLoc &DL = MI.getDebugLoc();
2787
2788 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2789 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2790 const TargetRegisterClass &RC = IsVALU ?
2791 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2792
2793 // Handle the degenerate case which should have folded out.
2794 if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2795 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2796
2797 MI.eraseFromParent();
2798 return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2799 }
2800
2801 // A legal VOP3P mask only reads one of the sources.
2802 int Mask[2];
2803 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2804
2805 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2806 !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2807 return false;
2808
2809 // TODO: This also should have been folded out
2810 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2811 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2812 .addReg(SrcVec);
2813
2814 MI.eraseFromParent();
2815 return true;
2816 }
2817
2818 if (Mask[0] == 1 && Mask[1] == -1) {
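// <1, u>: move the high half down into the low half.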
2819 if (IsVALU) {
2820 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2821 .addImm(16)
2822 .addReg(SrcVec);
2823 } else {
2824 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2825 .addReg(SrcVec)
2826 .addImm(16);
2827 }
2828 } else if (Mask[0] == -1 && Mask[1] == 0) {
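// <u, 0>: move the low half up into the high half.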
2829 if (IsVALU) {
2830 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2831 .addImm(16)
2832 .addReg(SrcVec);
2833 } else {
2834 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2835 .addReg(SrcVec)
2836 .addImm(16);
2837 }
2838 } else if (Mask[0] == 0 && Mask[1] == 0) {
2839 if (IsVALU) {
2840 // Write low half of the register into the high half.
2841 MachineInstr *MovSDWA =
2842 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2843 .addImm(0) // $src0_modifiers
2844 .addReg(SrcVec) // $src0
2845 .addImm(0) // $clamp
2846 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2847 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2848 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2849 .addReg(SrcVec, RegState::Implicit);
2850 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2851 } else {
2852 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2853 .addReg(SrcVec)
2854 .addReg(SrcVec);
2855 }
2856 } else if (Mask[0] == 1 && Mask[1] == 1) {
2857 if (IsVALU) {
2858 // Write high half of the register into the low half.
2859 MachineInstr *MovSDWA =
2860 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2861 .addImm(0) // $src0_modifiers
2862 .addReg(SrcVec) // $src0
2863 .addImm(0) // $clamp
2864 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
2865 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2866 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
2867 .addReg(SrcVec, RegState::Implicit);
2868 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2869 } else {
2870 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2871 .addReg(SrcVec)
2872 .addReg(SrcVec);
2873 }
2874 } else if (Mask[0] == 1 && Mask[1] == 0) {
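// <1, 0>: swap the two halves with a 16-bit rotate.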
2875 if (IsVALU) {
2876 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2877 .addReg(SrcVec)
2878 .addReg(SrcVec)
2879 .addImm(16);
2880 } else {
2881 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2882 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2883 .addReg(SrcVec)
2884 .addImm(16);
2885 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2886 .addReg(TmpReg)
2887 .addReg(SrcVec);
2888 }
2889 } else
2890 llvm_unreachable("all shuffle masks should be handled");
2891
2892 MI.eraseFromParent();
2893 return true;
2894 }
2895
2896 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2897 MachineInstr &MI) const {
2898
2899 MachineBasicBlock *MBB = MI.getParent();
2900 const DebugLoc &DL = MI.getDebugLoc();
2901
2902 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2903 Function &F = MBB->getParent()->getFunction();
2904 DiagnosticInfoUnsupported
2905 NoFpRet(F, "return versions of fp atomics not supported",
2906 MI.getDebugLoc(), DS_Error);
2907 F.getContext().diagnose(NoFpRet);
2908 return false;
2909 }
2910
2911 // FIXME: This is only needed because tablegen requires the number of dst operands
2912 // in the match and replace patterns to be the same. Otherwise these patterns
2913 // could be exported from the SDag path.
2914 MachineOperand &VDataIn = MI.getOperand(1);
2915 MachineOperand &VIndex = MI.getOperand(3);
2916 MachineOperand &VOffset = MI.getOperand(4);
2917 MachineOperand &SOffset = MI.getOperand(5);
2918 int16_t Offset = MI.getOperand(6).getImm();
2919
2920 bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
2921 bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
2922
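// Pick the MUBUF addressing variant: BOTHEN uses both vindex and voffset,
// OFFEN only voffset, IDXEN only vindex, and OFFSET neither.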
2923 unsigned Opcode;
2924 if (HasVOffset) {
2925 Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
2926 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
2927 } else {
2928 Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
2929 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
2930 }
2931
2932 if (MRI->getType(VDataIn.getReg()).isVector()) {
2933 switch (Opcode) {
2934 case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
2935 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
2936 break;
2937 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
2938 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
2939 break;
2940 case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
2941 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
2942 break;
2943 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
2944 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
2945 break;
2946 }
2947 }
2948
2949 auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
2950 I.add(VDataIn);
2951
2952 if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
2953 Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
2954 Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
2955 BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
2956 .addReg(VIndex.getReg())
2957 .addImm(AMDGPU::sub0)
2958 .addReg(VOffset.getReg())
2959 .addImm(AMDGPU::sub1);
2960
2961 I.addReg(IdxReg);
2962 } else if (HasVIndex) {
2963 I.add(VIndex);
2964 } else if (HasVOffset) {
2965 I.add(VOffset);
2966 }
2967
2968 I.add(MI.getOperand(2)); // rsrc
2969 I.add(SOffset);
2970 I.addImm(Offset);
2971 renderExtractSLC(I, MI, 7);
2972 I.cloneMemRefs(MI);
2973
2974 MI.eraseFromParent();
2975
2976 return true;
2977 }
2978
2979 bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
2980 MachineInstr &MI) const{
2981
2982 MachineBasicBlock *MBB = MI.getParent();
2983 const DebugLoc &DL = MI.getDebugLoc();
2984
2985 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2986 Function &F = MBB->getParent()->getFunction();
2987 DiagnosticInfoUnsupported
2988 NoFpRet(F, "return versions of fp atomics not supported",
2989 MI.getDebugLoc(), DS_Error);
2990 F.getContext().diagnose(NoFpRet);
2991 return false;
2992 }
2993
2994 // FIXME: This is only needed because tablegen requires the number of dst operands
2995 // in the match and replace patterns to be the same. Otherwise these patterns
2996 // could be exported from the SDag path.
2997 auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
2998
2999 Register Data = MI.getOperand(3).getReg();
3000 const unsigned Opc = MRI->getType(Data).isVector() ?
3001 AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3002 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3003 .addReg(Addr.first)
3004 .addReg(Data)
3005 .addImm(Addr.second)
3006 .addImm(0) // SLC
3007 .cloneMemRefs(MI);
3008
3009 MI.eraseFromParent();
3010 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3011 }
3012
3013 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3014 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3015 MI.RemoveOperand(1);
3016 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3017 return true;
3018 }
3019
3020 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3021 if (I.isPHI())
3022 return selectPHI(I);
3023
3024 if (!I.isPreISelOpcode()) {
3025 if (I.isCopy())
3026 return selectCOPY(I);
3027 return true;
3028 }
3029
3030 switch (I.getOpcode()) {
3031 case TargetOpcode::G_AND:
3032 case TargetOpcode::G_OR:
3033 case TargetOpcode::G_XOR:
3034 if (selectImpl(I, *CoverageInfo))
3035 return true;
3036 return selectG_AND_OR_XOR(I);
3037 case TargetOpcode::G_ADD:
3038 case TargetOpcode::G_SUB:
3039 if (selectImpl(I, *CoverageInfo))
3040 return true;
3041 return selectG_ADD_SUB(I);
3042 case TargetOpcode::G_UADDO:
3043 case TargetOpcode::G_USUBO:
3044 case TargetOpcode::G_UADDE:
3045 case TargetOpcode::G_USUBE:
3046 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3047 case TargetOpcode::G_INTTOPTR:
3048 case TargetOpcode::G_BITCAST:
3049 case TargetOpcode::G_PTRTOINT:
3050 return selectCOPY(I);
3051 case TargetOpcode::G_CONSTANT:
3052 case TargetOpcode::G_FCONSTANT:
3053 return selectG_CONSTANT(I);
3054 case TargetOpcode::G_FNEG:
3055 if (selectImpl(I, *CoverageInfo))
3056 return true;
3057 return selectG_FNEG(I);
3058 case TargetOpcode::G_FABS:
3059 if (selectImpl(I, *CoverageInfo))
3060 return true;
3061 return selectG_FABS(I);
3062 case TargetOpcode::G_EXTRACT:
3063 return selectG_EXTRACT(I);
3064 case TargetOpcode::G_MERGE_VALUES:
3065 case TargetOpcode::G_BUILD_VECTOR:
3066 case TargetOpcode::G_CONCAT_VECTORS:
3067 return selectG_MERGE_VALUES(I);
3068 case TargetOpcode::G_UNMERGE_VALUES:
3069 return selectG_UNMERGE_VALUES(I);
3070 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3071 return selectG_BUILD_VECTOR_TRUNC(I);
3072 case TargetOpcode::G_PTR_ADD:
3073 return selectG_PTR_ADD(I);
3074 case TargetOpcode::G_IMPLICIT_DEF:
3075 return selectG_IMPLICIT_DEF(I);
3076 case TargetOpcode::G_FREEZE:
3077 return selectCOPY(I);
3078 case TargetOpcode::G_INSERT:
3079 return selectG_INSERT(I);
3080 case TargetOpcode::G_INTRINSIC:
3081 return selectG_INTRINSIC(I);
3082 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3083 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3084 case TargetOpcode::G_ICMP:
3085 if (selectG_ICMP(I))
3086 return true;
3087 return selectImpl(I, *CoverageInfo);
3088 case TargetOpcode::G_LOAD:
3089 case TargetOpcode::G_STORE:
3090 case TargetOpcode::G_ATOMIC_CMPXCHG:
3091 case TargetOpcode::G_ATOMICRMW_XCHG:
3092 case TargetOpcode::G_ATOMICRMW_ADD:
3093 case TargetOpcode::G_ATOMICRMW_SUB:
3094 case TargetOpcode::G_ATOMICRMW_AND:
3095 case TargetOpcode::G_ATOMICRMW_OR:
3096 case TargetOpcode::G_ATOMICRMW_XOR:
3097 case TargetOpcode::G_ATOMICRMW_MIN:
3098 case TargetOpcode::G_ATOMICRMW_MAX:
3099 case TargetOpcode::G_ATOMICRMW_UMIN:
3100 case TargetOpcode::G_ATOMICRMW_UMAX:
3101 case TargetOpcode::G_ATOMICRMW_FADD:
3102 case AMDGPU::G_AMDGPU_ATOMIC_INC:
3103 case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3104 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3105 case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3106 return selectG_LOAD_STORE_ATOMICRMW(I);
3107 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3108 return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3109 case TargetOpcode::G_SELECT:
3110 return selectG_SELECT(I);
3111 case TargetOpcode::G_TRUNC:
3112 return selectG_TRUNC(I);
3113 case TargetOpcode::G_SEXT:
3114 case TargetOpcode::G_ZEXT:
3115 case TargetOpcode::G_ANYEXT:
3116 case TargetOpcode::G_SEXT_INREG:
3117 if (selectImpl(I, *CoverageInfo))
3118 return true;
3119 return selectG_SZA_EXT(I);
3120 case TargetOpcode::G_BRCOND:
3121 return selectG_BRCOND(I);
3122 case TargetOpcode::G_GLOBAL_VALUE:
3123 return selectG_GLOBAL_VALUE(I);
3124 case TargetOpcode::G_PTRMASK:
3125 return selectG_PTRMASK(I);
3126 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3127 return selectG_EXTRACT_VECTOR_ELT(I);
3128 case TargetOpcode::G_INSERT_VECTOR_ELT:
3129 return selectG_INSERT_VECTOR_ELT(I);
3130 case TargetOpcode::G_SHUFFLE_VECTOR:
3131 return selectG_SHUFFLE_VECTOR(I);
3132 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3133 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3134 const AMDGPU::ImageDimIntrinsicInfo *Intr
3135 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3136 assert(Intr && "not an image intrinsic with image pseudo");
3137 return selectImageIntrinsic(I, Intr);
3138 }
3139 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3140 return selectBVHIntrinsic(I);
3141 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3142 return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3143 default:
3144 return selectImpl(I, *CoverageInfo);
3145 }
3146 return false;
3147 }
3148
3149 InstructionSelector::ComplexRendererFns
3150 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3151 return {{
3152 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3153 }};
3154
3155 }
3156
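// Look through G_FNEG/G_FABS to fold floating-point source modifiers (the NEG
// and ABS bits) into a VOP3 operand. If folding a modifier leaves an SGPR
// source, a copy to a VGPR is inserted to avoid constant bus violations.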
3157 std::pair<Register, unsigned>
3158 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3159 bool AllowAbs) const {
3160 Register Src = Root.getReg();
3161 Register OrigSrc = Src;
3162 unsigned Mods = 0;
3163 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3164
3165 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3166 Src = MI->getOperand(1).getReg();
3167 Mods |= SISrcMods::NEG;
3168 MI = getDefIgnoringCopies(Src, *MRI);
3169 }
3170
3171 if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3172 Src = MI->getOperand(1).getReg();
3173 Mods |= SISrcMods::ABS;
3174 }
3175
3176 if (Mods != 0 &&
3177 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3178 MachineInstr *UseMI = Root.getParent();
3179
3180 // If we looked through copies to find source modifiers on an SGPR operand,
3181 // we now have an SGPR register source. To avoid potentially violating the
3182 // constant bus restriction, we need to insert a copy to a VGPR.
3183 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3184 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3185 TII.get(AMDGPU::COPY), VGPRSrc)
3186 .addReg(Src);
3187 Src = VGPRSrc;
3188 }
3189
3190 return std::make_pair(Src, Mods);
3191 }
3192
3193 ///
3194 /// This will select either an SGPR or VGPR operand and will save us from
3195 /// having to write an extra tablegen pattern.
3196 InstructionSelector::ComplexRendererFns
3197 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3198 return {{
3199 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3200 }};
3201 }
3202
3203 InstructionSelector::ComplexRendererFns
3204 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3205 Register Src;
3206 unsigned Mods;
3207 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3208
3209 return {{
3210 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3211 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3212 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3213 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3214 }};
3215 }
3216
3217 InstructionSelector::ComplexRendererFns
3218 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3219 Register Src;
3220 unsigned Mods;
3221 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3222
3223 return {{
3224 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3225 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3226 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3227 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3228 }};
3229 }
3230
3231 InstructionSelector::ComplexRendererFns
3232 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3233 return {{
3234 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3235 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3236 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3237 }};
3238 }
3239
3240 InstructionSelector::ComplexRendererFns
3241 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3242 Register Src;
3243 unsigned Mods;
3244 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3245
3246 return {{
3247 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3248 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3249 }};
3250 }
3251
3252 InstructionSelector::ComplexRendererFns
3253 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3254 Register Src;
3255 unsigned Mods;
3256 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3257
3258 return {{
3259 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3261 }};
3262 }
3263
3264 InstructionSelector::ComplexRendererFns
3265 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3266 Register Reg = Root.getReg();
3267 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3268 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3269 Def->getOpcode() == AMDGPU::G_FABS))
3270 return {};
3271 return {{
3272 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3273 }};
3274 }
3275
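// Packed (VOP3P) variant: a whole-vector v2f16 fneg is folded into the NEG and
// NEG_HI bits, and OP_SEL_1 is always set; packed instructions have no abs
// modifier.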
3276 std::pair<Register, unsigned>
3277 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3278 Register Src, const MachineRegisterInfo &MRI) const {
3279 unsigned Mods = 0;
3280 MachineInstr *MI = MRI.getVRegDef(Src);
3281
3282 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3283 // It's possible to see an f32 fneg here, but unlikely.
3284 // TODO: Treat f32 fneg as only high bit.
3285 MRI.getType(Src) == LLT::vector(2, 16)) {
3286 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3287 Src = MI->getOperand(1).getReg();
3288 MI = MRI.getVRegDef(Src);
3289 }
3290
3291 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3292
3293 // Packed instructions do not have abs modifiers.
3294 Mods |= SISrcMods::OP_SEL_1;
3295
3296 return std::make_pair(Src, Mods);
3297 }
3298
3299 InstructionSelector::ComplexRendererFns
3300 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3301 MachineRegisterInfo &MRI
3302 = Root.getParent()->getParent()->getParent()->getRegInfo();
3303
3304 Register Src;
3305 unsigned Mods;
3306 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3307
3308 return {{
3309 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3310 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3311 }};
3312 }
3313
3314 InstructionSelector::ComplexRendererFns
3315 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3316 Register Src;
3317 unsigned Mods;
3318 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3319 if (!isKnownNeverNaN(Src, *MRI))
3320 return None;
3321
3322 return {{
3323 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3324 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3325 }};
3326 }
3327
3328 InstructionSelector::ComplexRendererFns
3329 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3330 // FIXME: Handle op_sel
3331 return {{
3332 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3333 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3334 }};
3335 }
3336
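// SMRD addressing: match (SGPR base + constant) where the constant can be
// encoded in the scalar-load immediate offset field for this subtarget. The
// Imm32 variant uses the 32-bit literal offset form where available.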
3337 InstructionSelector::ComplexRendererFns
3338 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3339 SmallVector<GEPInfo, 4> AddrInfo;
3340 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3341
3342 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3343 return None;
3344
3345 const GEPInfo &GEPInfo = AddrInfo[0];
3346 Optional<int64_t> EncodedImm =
3347 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3348 if (!EncodedImm)
3349 return None;
3350
3351 unsigned PtrReg = GEPInfo.SgprParts[0];
3352 return {{
3353 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3354 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3355 }};
3356 }
3357
3358 InstructionSelector::ComplexRendererFns
3359 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3360 SmallVector<GEPInfo, 4> AddrInfo;
3361 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3362
3363 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3364 return None;
3365
3366 const GEPInfo &GEPInfo = AddrInfo[0];
3367 Register PtrReg = GEPInfo.SgprParts[0];
3368 Optional<int64_t> EncodedImm =
3369 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3370 if (!EncodedImm)
3371 return None;
3372
3373 return {{
3374 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3375 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3376 }};
3377 }
3378
3379 InstructionSelector::ComplexRendererFns
3380 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3381 MachineInstr *MI = Root.getParent();
3382 MachineBasicBlock *MBB = MI->getParent();
3383
3384 SmallVector<GEPInfo, 4> AddrInfo;
3385 getAddrModeInfo(*MI, *MRI, AddrInfo);
3386
3387 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3388 // then we can select all ptr + 32-bit offsets not just immediate offsets.
3389 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3390 return None;
3391
3392 const GEPInfo &GEPInfo = AddrInfo[0];
3393 // SGPR offset is unsigned.
3394 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3395 return None;
3396
3397   // If we make it this far we have a load with a 32-bit immediate offset.
3398   // It is OK to select this using an SGPR offset, because we have already
3399 // failed trying to select this load into one of the _IMM variants since
3400 // the _IMM Patterns are considered before the _SGPR patterns.
3401 Register PtrReg = GEPInfo.SgprParts[0];
3402 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3403 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3404 .addImm(GEPInfo.Imm);
3405 return {{
3406 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3407 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3408 }};
3409 }
3410
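// FLAT addressing: split a constant offset off the pointer when the subtarget
// supports instruction offsets and the value is legal for the memory
// operand's address space and the requested signedness.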
3411 template <bool Signed>
3412 std::pair<Register, int>
3413 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3414 MachineInstr *MI = Root.getParent();
3415
3416 auto Default = std::make_pair(Root.getReg(), 0);
3417
3418 if (!STI.hasFlatInstOffsets())
3419 return Default;
3420
3421 Register PtrBase;
3422 int64_t ConstOffset;
3423 std::tie(PtrBase, ConstOffset) =
3424 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3425 if (ConstOffset == 0)
3426 return Default;
3427
3428 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3429 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
3430 return Default;
3431
3432 return std::make_pair(PtrBase, ConstOffset);
3433 }
3434
3435 InstructionSelector::ComplexRendererFns
3436 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3437 auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
3438
3439 return {{
3440 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3441 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3442 }};
3443 }
3444
3445 InstructionSelector::ComplexRendererFns
3446 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3447 auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
3448
3449 return {{
3450 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3451 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3452 }};
3453 }
3454
3455 /// Match a zero extend from a 32-bit value to 64-bits.
3456 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3457 Register ZExtSrc;
3458 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3459 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3460
3461 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3462 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3463 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3464     return Register();
3465
3466 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3467 return Def->getOperand(1).getReg();
3468 }
3469
3470 return Register();
3471 }
3472
3473 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3474 InstructionSelector::ComplexRendererFns
3475 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3476 Register Addr = Root.getReg();
3477 Register PtrBase;
3478 int64_t ConstOffset;
3479 int64_t ImmOffset = 0;
3480
3481 // Match the immediate offset first, which canonically is moved as low as
3482 // possible.
3483 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3484
3485 if (ConstOffset != 0) {
3486 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
3487 Addr = PtrBase;
3488 ImmOffset = ConstOffset;
3489 } else if (ConstOffset > 0) {
3490 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3491 if (!PtrBaseDef)
3492 return None;
3493
3494 if (isSGPR(PtrBaseDef->Reg)) {
3495 // Offset is too large.
3496 //
3497 // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
3498 // + (large_offset & MaxOffset);
3499 int64_t SplitImmOffset, RemainderOffset;
3500 std::tie(SplitImmOffset, RemainderOffset)
3501 = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
3502
3503 if (isUInt<32>(RemainderOffset)) {
3504 MachineInstr *MI = Root.getParent();
3505 MachineBasicBlock *MBB = MI->getParent();
3506 Register HighBits
3507 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3508
3509 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3510 HighBits)
3511 .addImm(RemainderOffset);
3512
3513 return {{
3514 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
3515 [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
3516 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3517 }};
3518 }
3519 }
3520 }
3521 }
3522
3523 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3524 if (!AddrDef)
3525 return None;
3526
3527 // Match the variable offset.
3528 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
3529 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3530 // drop this.
3531 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3532 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
3533 return None;
3534
3535 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3536 // moves required to copy a 64-bit SGPR to VGPR.
3537 const Register SAddr = AddrDef->Reg;
3538 if (!isSGPR(SAddr))
3539 return None;
3540
3541 MachineInstr *MI = Root.getParent();
3542 MachineBasicBlock *MBB = MI->getParent();
3543 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3544
3545 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3546 VOffset)
3547 .addImm(0);
3548
3549 return {{
3550 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3551 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
3552 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3553 }};
3554 }
3555
3556 // Look through the SGPR->VGPR copy.
3557 Register SAddr =
3558 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3559 if (!SAddr || !isSGPR(SAddr))
3560 return None;
3561
3562 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3563
3564 // It's possible voffset is an SGPR here, but the copy to VGPR will be
3565 // inserted later.
3566 Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
3567 if (!VOffset)
3568 return None;
3569
3570 return {{[=](MachineInstrBuilder &MIB) { // saddr
3571 MIB.addReg(SAddr);
3572 },
3573 [=](MachineInstrBuilder &MIB) { // voffset
3574 MIB.addReg(VOffset);
3575 },
3576 [=](MachineInstrBuilder &MIB) { // offset
3577 MIB.addImm(ImmOffset);
3578 }}};
3579 }
3580
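// Scratch (private) SADDR addressing: fold a legal constant offset, use a
// frame index directly as saddr when possible, or materialize a frame-index
// plus SGPR sum with S_ADD_U32.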
3581 InstructionSelector::ComplexRendererFns
3582 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3583 Register Addr = Root.getReg();
3584 Register PtrBase;
3585 int64_t ConstOffset;
3586 int64_t ImmOffset = 0;
3587
3588 // Match the immediate offset first, which canonically is moved as low as
3589 // possible.
3590 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3591
3592 if (ConstOffset != 0 &&
3593 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
3594 Addr = PtrBase;
3595 ImmOffset = ConstOffset;
3596 }
3597
3598 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3599 if (!AddrDef)
3600 return None;
3601
3602 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3603 int FI = AddrDef->MI->getOperand(1).getIndex();
3604 return {{
3605 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3606 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3607 }};
3608 }
3609
3610 Register SAddr = AddrDef->Reg;
3611
3612 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3613 Register LHS = AddrDef->MI->getOperand(1).getReg();
3614 Register RHS = AddrDef->MI->getOperand(2).getReg();
3615 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3616 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3617
3618 if (LHSDef && RHSDef &&
3619 LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3620 isSGPR(RHSDef->Reg)) {
3621 int FI = LHSDef->MI->getOperand(1).getIndex();
3622 MachineInstr &I = *Root.getParent();
3623 MachineBasicBlock *BB = I.getParent();
3624 const DebugLoc &DL = I.getDebugLoc();
3625 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3626
3627 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
3628 .addFrameIndex(FI)
3629 .addReg(RHSDef->Reg);
3630 }
3631 }
3632
3633 if (!isSGPR(SAddr))
3634 return None;
3635
3636 return {{
3637 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3638 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3639 }};
3640 }
3641
3642 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3643 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3644 return PSV && PSV->isStack();
3645 }
3646
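// MUBUF scratch, offen form: either materialize the 4096-aligned high bits of
// a constant address into a VGPR, or fold a frame index / base register plus
// a legal immediate offset into the vaddr and offset operands.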
3647 InstructionSelector::ComplexRendererFns
3648 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3649 MachineInstr *MI = Root.getParent();
3650 MachineBasicBlock *MBB = MI->getParent();
3651 MachineFunction *MF = MBB->getParent();
3652 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3653
3654 int64_t Offset = 0;
3655 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3656 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3657 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3658
3659 // TODO: Should this be inside the render function? The iterator seems to
3660 // move.
3661 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3662 HighBits)
3663 .addImm(Offset & ~4095);
3664
3665 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3666 MIB.addReg(Info->getScratchRSrcReg());
3667 },
3668 [=](MachineInstrBuilder &MIB) { // vaddr
3669 MIB.addReg(HighBits);
3670 },
3671 [=](MachineInstrBuilder &MIB) { // soffset
3672 // Use constant zero for soffset and rely on eliminateFrameIndex
3673 // to choose the appropriate frame register if need be.
3674 MIB.addImm(0);
3675 },
3676 [=](MachineInstrBuilder &MIB) { // offset
3677 MIB.addImm(Offset & 4095);
3678 }}};
3679 }
3680
3681 assert(Offset == 0 || Offset == -1);
3682
3683 // Try to fold a frame index directly into the MUBUF vaddr field, and any
3684 // offsets.
3685 Optional<int> FI;
3686 Register VAddr = Root.getReg();
3687 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3688 if (isBaseWithConstantOffset(Root, *MRI)) {
3689 const MachineOperand &LHS = RootDef->getOperand(1);
3690 const MachineOperand &RHS = RootDef->getOperand(2);
3691 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3692 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3693 if (LHSDef && RHSDef) {
3694 int64_t PossibleOffset =
3695 RHSDef->getOperand(1).getCImm()->getSExtValue();
3696 if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3697 (!STI.privateMemoryResourceIsRangeChecked() ||
3698 KnownBits->signBitIsZero(LHS.getReg()))) {
3699 if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3700 FI = LHSDef->getOperand(1).getIndex();
3701 else
3702 VAddr = LHS.getReg();
3703 Offset = PossibleOffset;
3704 }
3705 }
3706 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3707 FI = RootDef->getOperand(1).getIndex();
3708 }
3709 }
3710
3711 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3712 MIB.addReg(Info->getScratchRSrcReg());
3713 },
3714 [=](MachineInstrBuilder &MIB) { // vaddr
3715 if (FI.hasValue())
3716 MIB.addFrameIndex(FI.getValue());
3717 else
3718 MIB.addReg(VAddr);
3719 },
3720 [=](MachineInstrBuilder &MIB) { // soffset
3721 // Use constant zero for soffset and rely on eliminateFrameIndex
3722 // to choose the appropriate frame register if need be.
3723 MIB.addImm(0);
3724 },
3725 [=](MachineInstrBuilder &MIB) { // offset
3726 MIB.addImm(Offset);
3727 }}};
3728 }
3729
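// DS instructions take an unsigned 16-bit byte offset; on subtargets without
// a usable DS offset, the base must additionally be known non-negative.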
3730 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3731 int64_t Offset) const {
3732 if (!isUInt<16>(Offset))
3733 return false;
3734
3735 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3736 return true;
3737
3738   // On Southern Islands, instructions with a negative base value and an offset
3739   // don't seem to work.
3740 return KnownBits->signBitIsZero(Base);
3741 }
3742
3743 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3744 int64_t Offset1,
3745 unsigned Size) const {
3746 if (Offset0 % Size != 0 || Offset1 % Size != 0)
3747 return false;
3748 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3749 return false;
3750
3751 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3752 return true;
3753
3754   // On Southern Islands, instructions with a negative base value and an offset
3755   // don't seem to work.
3756 return KnownBits->signBitIsZero(Base);
3757 }
3758
3759 InstructionSelector::ComplexRendererFns
3760 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3761 MachineOperand &Root) const {
3762 MachineInstr *MI = Root.getParent();
3763 MachineBasicBlock *MBB = MI->getParent();
3764
3765 int64_t Offset = 0;
3766 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3767 !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3768 return {};
3769
3770 const MachineFunction *MF = MBB->getParent();
3771 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3772 const MachineMemOperand *MMO = *MI->memoperands_begin();
3773 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3774
3775 return {{
3776 [=](MachineInstrBuilder &MIB) { // rsrc
3777 MIB.addReg(Info->getScratchRSrcReg());
3778 },
3779 [=](MachineInstrBuilder &MIB) { // soffset
3780 if (isStackPtrRelative(PtrInfo))
3781 MIB.addReg(Info->getStackPtrOffsetReg());
3782 else
3783 MIB.addImm(0);
3784 },
3785 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3786 }};
3787 }
3788
3789 std::pair<Register, unsigned>
3790 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3791 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3792 if (!RootDef)
3793 return std::make_pair(Root.getReg(), 0);
3794
3795 int64_t ConstAddr = 0;
3796
3797 Register PtrBase;
3798 int64_t Offset;
3799 std::tie(PtrBase, Offset) =
3800 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3801
3802 if (Offset) {
3803 if (isDSOffsetLegal(PtrBase, Offset)) {
3804 // (add n0, c0)
3805 return std::make_pair(PtrBase, Offset);
3806 }
3807 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3808 // TODO
3809
3810
3811 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3812 // TODO
3813
3814 }
3815
3816 return std::make_pair(Root.getReg(), 0);
3817 }
3818
3819 InstructionSelector::ComplexRendererFns
3820 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3821 Register Reg;
3822 unsigned Offset;
3823 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3824 return {{
3825 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3826 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3827 }};
3828 }
3829
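// ds_read2/ds_write2 forms take two 8-bit offsets in units of the element
// size; the selectors below match a base plus a constant that splits into two
// consecutive in-range offsets (offset0 and offset0 + 1).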
3830 InstructionSelector::ComplexRendererFns
3831 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3832 return selectDSReadWrite2(Root, 4);
3833 }
3834
3835 InstructionSelector::ComplexRendererFns
3836 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3837 return selectDSReadWrite2(Root, 8);
3838 }
3839
3840 InstructionSelector::ComplexRendererFns
3841 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3842 unsigned Size) const {
3843 Register Reg;
3844 unsigned Offset;
3845 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3846 return {{
3847 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3848 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3849 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3850 }};
3851 }
3852
3853 std::pair<Register, unsigned>
3854 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3855 unsigned Size) const {
3856 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3857 if (!RootDef)
3858 return std::make_pair(Root.getReg(), 0);
3859
3860 int64_t ConstAddr = 0;
3861
3862 Register PtrBase;
3863 int64_t Offset;
3864 std::tie(PtrBase, Offset) =
3865 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3866
3867 if (Offset) {
3868 int64_t OffsetValue0 = Offset;
3869 int64_t OffsetValue1 = Offset + Size;
3870 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
3871 // (add n0, c0)
3872 return std::make_pair(PtrBase, OffsetValue0 / Size);
3873 }
3874 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3875 // TODO
3876
3877 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3878 // TODO
3879
3880 }
3881
3882 return std::make_pair(Root.getReg(), 0);
3883 }
3884
3885 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3886 /// the base value with the constant offset. There may be intervening copies
3887 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3888 /// not match the pattern.
3889 std::pair<Register, int64_t>
3890 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3891 Register Root, const MachineRegisterInfo &MRI) const {
3892 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
3893 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3894 return {Root, 0};
3895
3896 MachineOperand &RHS = RootI->getOperand(2);
3897 Optional<ValueAndVReg> MaybeOffset
3898 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3899 if (!MaybeOffset)
3900 return {Root, 0};
3901 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
3902 }
3903
3904 static void addZeroImm(MachineInstrBuilder &MIB) {
3905 MIB.addImm(0);
3906 }
3907
3908 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3909 /// BasePtr is not valid, a null base pointer will be used.
3910 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3911 uint32_t FormatLo, uint32_t FormatHi,
3912 Register BasePtr) {
3913 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3914 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3915 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3916 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3917
3918 B.buildInstr(AMDGPU::S_MOV_B32)
3919 .addDef(RSrc2)
3920 .addImm(FormatLo);
3921 B.buildInstr(AMDGPU::S_MOV_B32)
3922 .addDef(RSrc3)
3923 .addImm(FormatHi);
3924
3925 // Build the half of the subregister with the constants before building the
3926 // full 128-bit register. If we are building multiple resource descriptors,
3927 // this will allow CSEing of the 2-component register.
3928 B.buildInstr(AMDGPU::REG_SEQUENCE)
3929 .addDef(RSrcHi)
3930 .addReg(RSrc2)
3931 .addImm(AMDGPU::sub0)
3932 .addReg(RSrc3)
3933 .addImm(AMDGPU::sub1);
3934
3935 Register RSrcLo = BasePtr;
3936 if (!BasePtr) {
3937 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3938 B.buildInstr(AMDGPU::S_MOV_B64)
3939 .addDef(RSrcLo)
3940 .addImm(0);
3941 }
3942
3943 B.buildInstr(AMDGPU::REG_SEQUENCE)
3944 .addDef(RSrc)
3945 .addReg(RSrcLo)
3946 .addImm(AMDGPU::sub0_sub1)
3947 .addReg(RSrcHi)
3948 .addImm(AMDGPU::sub2_sub3);
3949
3950 return RSrc;
3951 }
3952
3953 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3954 const SIInstrInfo &TII, Register BasePtr) {
3955 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3956
3957 // FIXME: Why are half the "default" bits ignored based on the addressing
3958 // mode?
3959 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3960 }
3961
3962 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3963 const SIInstrInfo &TII, Register BasePtr) {
3964 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3965
3966 // FIXME: Why are half the "default" bits ignored based on the addressing
3967 // mode?
3968 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3969 }
3970
3971 AMDGPUInstructionSelector::MUBUFAddressData
3972 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3973 MUBUFAddressData Data;
3974 Data.N0 = Src;
3975
3976 Register PtrBase;
3977 int64_t Offset;
3978
3979 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3980 if (isUInt<32>(Offset)) {
3981 Data.N0 = PtrBase;
3982 Data.Offset = Offset;
3983 }
3984
3985 if (MachineInstr *InputAdd
3986 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3987 Data.N2 = InputAdd->getOperand(1).getReg();
3988 Data.N3 = InputAdd->getOperand(2).getReg();
3989
3990     // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
3991     // FIXME: Don't know that this was defined by operand 0.
3992 //
3993 // TODO: Remove this when we have copy folding optimizations after
3994 // RegBankSelect.
3995 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3996 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3997 }
3998
3999 return Data;
4000 }
4001
4002 /// Return whether the addr64 MUBUF mode should be used for the given address.
4003 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4004 // (ptr_add N2, N3) -> addr64, or
4005 // (ptr_add (ptr_add N2, N3), C1) -> addr64
4006 if (Addr.N2)
4007 return true;
4008
4009 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4010 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4011 }
4012
4013 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4014 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4015 /// component.
4016 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4017 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4018 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4019 return;
4020
4021 // Illegal offset, store it in soffset.
4022 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4023 B.buildInstr(AMDGPU::S_MOV_B32)
4024 .addDef(SOffset)
4025 .addImm(ImmOffset);
4026 ImmOffset = 0;
4027 }
4028
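// MUBUF addr64 form: decide which parts of the address feed vaddr versus the
// resource descriptor base, build the SRD, and move any offset that does not
// fit the immediate field into soffset.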
4029 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4030 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4031 Register &SOffset, int64_t &Offset) const {
4032 // FIXME: Predicates should stop this from reaching here.
4033 // addr64 bit was removed for volcanic islands.
4034 if (!STI.hasAddr64() || STI.useFlatForGlobal())
4035 return false;
4036
4037 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4038 if (!shouldUseAddr64(AddrData))
4039 return false;
4040
4041 Register N0 = AddrData.N0;
4042 Register N2 = AddrData.N2;
4043 Register N3 = AddrData.N3;
4044 Offset = AddrData.Offset;
4045
4046 // Base pointer for the SRD.
4047 Register SRDPtr;
4048
4049 if (N2) {
4050 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4051 assert(N3);
4052 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4053 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4054 // addr64, and construct the default resource from a 0 address.
4055 VAddr = N0;
4056 } else {
4057 SRDPtr = N3;
4058 VAddr = N2;
4059 }
4060 } else {
4061 // N2 is not divergent.
4062 SRDPtr = N2;
4063 VAddr = N3;
4064 }
4065 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4066 // Use the default null pointer in the resource
4067 VAddr = N0;
4068 } else {
4069 // N0 -> offset, or
4070 // (N0 + C1) -> offset
4071 SRDPtr = N0;
4072 }
4073
4074 MachineIRBuilder B(*Root.getParent());
4075 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4076 splitIllegalMUBUFOffset(B, SOffset, Offset);
4077 return true;
4078 }
4079
4080 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4081 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4082 int64_t &Offset) const {
4083
4084 // FIXME: Pattern should not reach here.
4085 if (STI.useFlatForGlobal())
4086 return false;
4087
4088 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4089 if (shouldUseAddr64(AddrData))
4090 return false;
4091
4092 // N0 -> offset, or
4093 // (N0 + C1) -> offset
4094 Register SRDPtr = AddrData.N0;
4095 Offset = AddrData.Offset;
4096
4097 // TODO: Look through extensions for 32-bit soffset.
4098 MachineIRBuilder B(*Root.getParent());
4099
4100 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4101 splitIllegalMUBUFOffset(B, SOffset, Offset);
4102 return true;
4103 }
4104
4105 InstructionSelector::ComplexRendererFns
4106 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4107 Register VAddr;
4108 Register RSrcReg;
4109 Register SOffset;
4110 int64_t Offset = 0;
4111
4112 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4113 return {};
4114
4115 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4116 // pattern.
4117 return {{
4118 [=](MachineInstrBuilder &MIB) { // rsrc
4119 MIB.addReg(RSrcReg);
4120 },
4121 [=](MachineInstrBuilder &MIB) { // vaddr
4122 MIB.addReg(VAddr);
4123 },
4124 [=](MachineInstrBuilder &MIB) { // soffset
4125 if (SOffset)
4126 MIB.addReg(SOffset);
4127 else
4128 MIB.addImm(0);
4129 },
4130 [=](MachineInstrBuilder &MIB) { // offset
4131 MIB.addImm(Offset);
4132 },
4133 addZeroImm, // glc
4134 addZeroImm, // slc
4135 addZeroImm, // tfe
4136 addZeroImm, // dlc
4137 addZeroImm // swz
4138 }};
4139 }
4140
4141 InstructionSelector::ComplexRendererFns
4142 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4143 Register RSrcReg;
4144 Register SOffset;
4145 int64_t Offset = 0;
4146
4147 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4148 return {};
4149
4150 return {{
4151 [=](MachineInstrBuilder &MIB) { // rsrc
4152 MIB.addReg(RSrcReg);
4153 },
4154 [=](MachineInstrBuilder &MIB) { // soffset
4155 if (SOffset)
4156 MIB.addReg(SOffset);
4157 else
4158 MIB.addImm(0);
4159 },
4160 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4161 addZeroImm, // glc
4162 addZeroImm, // slc
4163 addZeroImm, // tfe
4164 addZeroImm, // dlc
4165 addZeroImm // swz
4166 }};
4167 }
4168
4169 InstructionSelector::ComplexRendererFns
4170 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4171 Register VAddr;
4172 Register RSrcReg;
4173 Register SOffset;
4174 int64_t Offset = 0;
4175
4176 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4177 return {};
4178
4179 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4180 // pattern.
4181 return {{
4182 [=](MachineInstrBuilder &MIB) { // rsrc
4183 MIB.addReg(RSrcReg);
4184 },
4185 [=](MachineInstrBuilder &MIB) { // vaddr
4186 MIB.addReg(VAddr);
4187 },
4188 [=](MachineInstrBuilder &MIB) { // soffset
4189 if (SOffset)
4190 MIB.addReg(SOffset);
4191 else
4192 MIB.addImm(0);
4193 },
4194 [=](MachineInstrBuilder &MIB) { // offset
4195 MIB.addImm(Offset);
4196 },
4197 addZeroImm // slc
4198 }};
4199 }
4200
4201 InstructionSelector::ComplexRendererFns
4202 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4203 Register RSrcReg;
4204 Register SOffset;
4205 int64_t Offset = 0;
4206
4207 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4208 return {};
4209
4210 return {{
4211 [=](MachineInstrBuilder &MIB) { // rsrc
4212 MIB.addReg(RSrcReg);
4213 },
4214 [=](MachineInstrBuilder &MIB) { // soffset
4215 if (SOffset)
4216 MIB.addReg(SOffset);
4217 else
4218 MIB.addImm(0);
4219 },
4220 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4221 addZeroImm // slc
4222 }};
4223 }
4224
4225 /// Get an immediate that must be 32-bits, and treated as zero extended.
4226 static Optional<uint64_t> getConstantZext32Val(Register Reg,
4227 const MachineRegisterInfo &MRI) {
4228   // getConstantVRegSExtVal sign extends the value, so verify it still fits in
4228   // 32 bits before returning the low 32 bits.
4229 Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI);
4230 if (!OffsetVal || !isInt<32>(*OffsetVal))
4231 return None;
4232 return Lo_32(*OffsetVal);
4233 }
4234
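// S_BUFFER_LOAD offsets: the constant must fit in 32 bits and be encodable
// for the subtarget; the Imm32 literal form below is only used on Sea Islands.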
4235 InstructionSelector::ComplexRendererFns
4236 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4237 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4238 if (!OffsetVal)
4239 return {};
4240
4241 Optional<int64_t> EncodedImm =
4242 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4243 if (!EncodedImm)
4244 return {};
4245
4246 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4247 }
4248
4249 InstructionSelector::ComplexRendererFns
4250 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4251 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4252
4253 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4254 if (!OffsetVal)
4255 return {};
4256
4257 Optional<int64_t> EncodedImm
4258 = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4259 if (!EncodedImm)
4260 return {};
4261
4262 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4263 }
4264
4265 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4266 const MachineInstr &MI,
4267 int OpIdx) const {
4268 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4269 "Expected G_CONSTANT");
4270 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4271 }
4272
4273 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4274 const MachineInstr &MI,
4275 int OpIdx) const {
4276 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4277 "Expected G_CONSTANT");
4278 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4279 }
4280
4281 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4282 const MachineInstr &MI,
4283 int OpIdx) const {
4284 assert(OpIdx == -1);
4285
4286 const MachineOperand &Op = MI.getOperand(1);
4287 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4288 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4289 else {
4290 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4291 MIB.addImm(Op.getCImm()->getSExtValue());
4292 }
4293 }
4294
4295 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4296 const MachineInstr &MI,
4297 int OpIdx) const {
4298 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4299 "Expected G_CONSTANT");
4300 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4301 }
4302
4303 /// This only really exists to satisfy DAG type checking machinery, so is a
4304 /// no-op here.
4305 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4306 const MachineInstr &MI,
4307 int OpIdx) const {
4308 MIB.addImm(MI.getOperand(OpIdx).getImm());
4309 }
4310
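// The renderExtract* helpers below unpack individual cache-policy bits from a
// single immediate operand: bit 0 -> glc, bit 1 -> slc, bit 2 -> dlc,
// bit 3 -> swz.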
4311 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
4312 const MachineInstr &MI,
4313 int OpIdx) const {
4314 assert(OpIdx >= 0 && "expected to match an immediate operand");
4315 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
4316 }
4317
4318 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
4319 const MachineInstr &MI,
4320 int OpIdx) const {
4321 assert(OpIdx >= 0 && "expected to match an immediate operand");
4322 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
4323 }
4324
4325 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
4326 const MachineInstr &MI,
4327 int OpIdx) const {
4328 assert(OpIdx >= 0 && "expected to match an immediate operand");
4329 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
4330 }
4331
4332 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4333 const MachineInstr &MI,
4334 int OpIdx) const {
4335 assert(OpIdx >= 0 && "expected to match an immediate operand");
4336 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4337 }
4338
4339 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4340 const MachineInstr &MI,
4341 int OpIdx) const {
4342 MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4343 }
4344
4345 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4346 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4347 }
4348
4349 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4350 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4351 }
4352
4353 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4354 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4355 }
4356
4357 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4358 return TII.isInlineConstant(Imm);
4359 }
4360