//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {

static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure that only the low 16 bits of the same
// register are being used.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (Idx->isZero() && In.getValueSizeInBits() <= 32)
        return In.getOperand(0);
    }
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

}  // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
                                        CodeGenOpt::Level OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(
    TargetMachine *TM /*= nullptr*/,
    CodeGenOpt::Level OptLevel /*= CodeGenOpt::Default*/)
    : SelectionDAGISel(*TM, OptLevel) {
  EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}

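// Cache the subtarget and the function's FP mode before selection runs; under
// EXPENSIVE_CHECKS, also verify that every loop is in LCSSA form.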
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<LegacyDivergenceAnalysis>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISel::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

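  // Walk the DAG bottom-up. Replacement nodes are appended to the end of the
  // node list, so this reverse iteration never revisits them.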
  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
  if (TM.Options.NoNaNsFPMath)
    return true;

  // TODO: Move into isKnownNeverNaN
  if (N->getFlags().hasNoNaNs())
    return true;

  return CurDAG->isKnownNeverNaN(N);
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
                                           bool Negated) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (Negated) {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(-C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());

  } else {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
  }

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                          unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.OpInfo[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                              SubRegIdx);
  }
  }
}

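// Rebuild N's operand list with NewChain replacing the original chain operand
// and Glue appended, keeping the node's opcode and result types.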
SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
    *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

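// LDS (local) accesses on subtargets that require it get M0 initialized to an
// all-ones bound; GDS (region) accesses get M0 set to the GDS size.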
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

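// Materialize a 64-bit immediate as two S_MOV_B32s combined into a 64-bit
// SGPR pair with a REG_SEQUENCE.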
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
       Opc == ISD::ATOMIC_LOAD_FADD ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

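  // Split 64-bit constants that cannot be encoded as inline immediates into a
  // pair of 32-bit scalar moves.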
  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
    else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
                            SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FMAD:
  case ISD::FMA:
    SelectFMAD_FMA(N);
    return;
  case AMDGPUISD::ATOMIC_CMP_SWAP:
    SelectATOMIC_CMP_SWAP(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` operations earlier, this is a complicated
    // pattern to match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that the split base halves (Lo and Hi) are extracted from the
      // same vector.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
    AMDGPU::S_MOV_B32, DL, MVT::i32,
    CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

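  // Opcode table indexed by [consumes carry-in][divergent (VALU)][is add].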
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

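// For 32-bit ADDCARRY/SUBCARRY, divergent nodes select to the VALU
// V_ADDC/V_SUBB forms; uniform nodes use scalar carry pseudos that are
// expanded later.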
void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
                                                   : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                   : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

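  // If the carry-out result feeds anything other than a matching carry chain,
  // the scalar pseudo cannot be used, so fall back to the VALU form.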
  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  //    src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

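// Both offsets must be multiples of the access size and fit the 8-bit
// offset0/offset1 fields, which are encoded in units of Size.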
bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}

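// Decompose Addr into the MUBUF addressing fields: the base pointer used to
// build the resource (Ptr), an optional VGPR address (VAddr), an SGPR offset
// (SOffset), an immediate offset, and the offen/idxen/addr64 control bits.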
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instructions
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // The addr64 bit was removed for Volcanic Islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
        AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow.  If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto RC =
      TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg());
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
  return true;
}

SelectMUBUFOffset(SDValue Addr,SDValue & SRsrc,SDValue & SOffset,SDValue & Offset) const1441 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1442                                            SDValue &SOffset, SDValue &Offset
1443                                            ) const {
1444   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1445   const SIInstrInfo *TII =
1446     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1447 
1448   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1449     return false;
1450 
1451   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1452       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1453       !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1454     uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1455                     APInt::getAllOnes(32).getZExtValue(); // Size
1456     SDLoc DL(Addr);
1457 
1458     const SITargetLowering& Lowering =
1459       *static_cast<const SITargetLowering*>(getTargetLowering());
1460 
1461     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1462     return true;
1463   }
1464   return false;
1465 }

// Find a load or store from the corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
          dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1)) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field
        // and add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // the remainder to vaddr it still points into the same underlying
        // object. The easiest way to do that is to make sure that we split
        // the offset into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base
          // address is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
  return true;
}
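// Illustrative example of the split above (the field width is an assumption
// for the example; the real bounds come from isLegalFLATOffset and
// splitFlatOffset): with an unsigned 12-bit immediate field (max 4095),
// COffsetVal = 0x1234 does not fit, so it could be split into
// OffsetVal = 0x234 kept in the immediate and RemainderOffset = 0x1000 added
// to vaddr; both pieces are >= 0, so the sum stays within the same
// underlying object.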

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                           SDValue Addr,
                                           SDValue &SAddr,
                                           SDValue &VOffset,
                                           SDValue &Offset) const {
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                               SIInstrFlags::FlatGlobal)) {
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset, RemainderOffset;
        std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
            COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);

        if (isUInt<32>(RemainderOffset)) {
          SDNode *VMov = CurDAG->getMachineNode(
              AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
              CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If the constant bus limit
      // is 1, we would need 1 or 2 extra moves for each half of the constant,
      // so it is better to do a scalar add and then issue a single VALU
      // instruction to materialize zero. Otherwise it takes fewer
      // instructions to perform VALU adds with immediates or inline literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
          !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
      if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr.getOpcode() == ISD::ADD) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (zero_extend (i32 vgpr))
      if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
        SAddr = LHS;
        VOffset = ZextRHS;
      }
    }

    if (!SAddr && !RHS->isDivergent()) {
      // add (zero_extend (i32 vgpr)), (i64 sgpr)
      if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
        SAddr = RHS;
        VOffset = ZextLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
      return true;
    }
  }

  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                             CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
  return true;
}
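// In summary, the address forms matched above are (sketch):
//   (add (add sgpr64, (zext vgpr32)), simm) -> saddr, voffset, offset
//   (add sgpr64, (zext vgpr32))             -> saddr, voffset, offset = 0
//   sgpr64                                  -> saddr, voffset = v_mov_b32 0,
//                                              offset = 0
// with the zext operand accepted on either side of the add when the SGPR
// side is uniform.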

static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
  } else if (SAddr.getOpcode() == ISD::ADD &&
             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
    // Materialize this into a scalar move for the scalar address to avoid
    // readfirstlane.
    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
                                           MVT::i32, TFI, SAddr.getOperand(1)),
                    0);
  }

  return SAddr;
}

// Match (32-bit SGPR base) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
    SAddr = Addr.getOperand(0);
  } else {
    SAddr = Addr;
  }

  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
                              SIInstrFlags::FlatScratch)) {
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
        COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);

    COffsetVal = SplitImmOffset;

    SDValue AddOffset =
        SAddr.getOpcode() == ISD::TargetFrameIndex
            ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
            : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
                                           SAddr, AddOffset),
                    0);
  }

  Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);

  return true;
}
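// Example of the out-of-range path above (illustrative): if SAddr is a frame
// index and COffsetVal is too large for the scratch immediate field, the
// remainder is folded into an s_add_i32 on the base (materialized through a
// scalar move when the base is a frame index), and only the legal
// SplitImmOffset piece is left in the instruction's offset field.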

bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue &Offset, bool &Imm) const {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C) {
    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      Offset = ByteOffsetNode;
      Imm = false;
      return true;
    }
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
        Offset = ByteOffsetNode.getOperand(0);
        Imm = false;
        return true;
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);
  // GFX9 and GFX10 have signed byte immediate offsets.
  int64_t ByteOffset = C->getSExtValue();
  Optional<int64_t> EncodedOffset =
      AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
  if (EncodedOffset) {
    Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    Imm = true;
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
  if (EncodedOffset) {
    Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
  Offset = SDValue(
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);

  return true;
}
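// Note on the helpers above: the encoded immediate is generation-dependent
// (for instance, older subtargets encode the SMRD offset in dwords while
// newer ones use bytes, with differing ranges and signedness), which is why
// getSMRDEncodedOffset is consulted rather than emitting the raw byte offset
// directly.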

SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  const SDValue Ops[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
    Addr,
    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
            0),
    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}

bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                    SDValue &Offset, bool &Imm) const {
  SDLoc SL(Addr);

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() != MVT::i32 ||
      Addr->getFlags().hasNoUnsignedWrap()) {
    SDValue N0, N1;
    // Extract the base and offset if possible.
    if (CurDAG->isBaseWithConstantOffset(Addr) ||
        Addr.getOpcode() == ISD::ADD) {
      N0 = Addr.getOperand(0);
      N1 = Addr.getOperand(1);
    } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
      assert(N0 && N1 && isa<ConstantSDNode>(N1));
    }
    if (N0 && N1) {
      if (SelectSMRDOffset(N1, Offset, Imm)) {
        SBase = Expand32BitAddress(N0);
        return true;
      }
    }
  }
  SBase = Expand32BitAddress(Addr);
  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
  Imm = true;
  return true;
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  bool Imm = false;
  return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  bool Imm = false;
  if (!SelectSMRD(Addr, SBase, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
                                        SDValue &Offset) const {
  bool Imm = false;
  return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
         !isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
                                             SDValue &Offset) const {
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
    // The immediate offset for S_BUFFER instructions is unsigned.
    if (auto Imm =
            AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
      Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
      return true;
    }
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
                                               SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
    if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
                                                         C->getZExtValue())) {
      Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
      return true;
    }
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly cause the
    // base (n0) to become negative.
    // (or n0, |c0|) can never change the sign given isBaseWithConstantOffset.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
        (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}
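// Illustrative cases for the peel above: Index = (add n0, 16) with n0's sign
// bit known zero peels to Base = n0, Offset = 16; if n0 might be negative and
// the constant is positive, the peel is skipped, since the split base could
// then go negative even though the original sum was in range.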

SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  // Transformation function: pack the offset and width of a BFE into the
  // format expected by the S_BFE_I32 / S_BFE_U32 instructions. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}
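// Packing example (plain arithmetic): Offset = 8, Width = 16 yields
// PackedVal = 8 | (16 << 16) = 0x00100008, i.e. the offset in bits [5:0] and
// the width in bits [22:16] as described above.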

void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "(a << b) srl c" ---> "BFE_U32 a, (c-b), (32-c)"
  // "(a << b) sra c" ---> "BFE_I32 a, (c-b), (32-c)"
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      bool Signed = N->getOpcode() == ISD::SRA;
      unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

      ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
                              32 - CVal));
      return;
    }
  }
  SelectCode(N);
}
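// Worked example: for (a << 8) srl 16 we have b = 8 and c = 16, so this
// selects S_BFE_U32 a with offset = c - b = 8 and width = 32 - c = 16,
// extracting bits [23:8] of a.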

void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  Srl.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  And.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}
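// Example of the AND form above: (x srl 4) & 0xff passes isMask_32(0xff),
// popcount(0xff) = 8, so it becomes S_BFE_U32 x with offset 4 and width 8.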

bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
  assert(N->getOpcode() == ISD::BRCOND);
  if (!N->hasOneUse())
    return false;

  SDValue Cond = N->getOperand(1);
  if (Cond.getOpcode() == ISD::CopyToReg)
    Cond = Cond.getOperand(2);

  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
    return false;

  MVT VT = Cond.getOperand(0).getSimpleValueType();
  if (VT == MVT::i32)
    return true;

  if (VT == MVT::i64) {
    auto ST = static_cast<const GCNSubtarget *>(Subtarget);

    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
  }

  return false;
}

void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (!UseSCCBr) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0.  Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets changed to
    // S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU, which inserts the S_AND.
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND
    // removal, so it catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
            MVT::i1,
            CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
                                               : AMDGPU::EXEC,
                                MVT::i1),
            Cond),
        0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}

void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
  MVT VT = N->getSimpleValueType(0);
  bool IsFMA = N->getOpcode() == ISD::FMA;
  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
                         !Subtarget->hasFmaMixInsts()) ||
      ((IsFMA && Subtarget->hasMadMixInsts()) ||
       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
    SelectCode(N);
    return;
  }

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);
  unsigned Src0Mods, Src1Mods, Src2Mods;

  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an
  // operand using the conversion from f16.
  bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
  bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
  bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);

  assert((IsFMA || !Mode.allFP32Denormals()) &&
         "fmad selected with denormals enabled");
  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).

  if (Sel0 || Sel1 || Sel2) {
    // For dummy operands.
    SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
    SDValue Ops[] = {
      CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
      CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
      CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
      CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
      Zero, Zero
    };

    CurDAG->SelectNodeTo(N,
                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
                         MVT::f32, Ops);
  } else {
    SelectCode(N);
  }
}

// This is here because there isn't a way to use the generated sub0_sub1 as the
// subreg index to EXTRACT_SUBREG in tablegen.
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
  MemSDNode *Mem = cast<MemSDNode>(N);
  unsigned AS = Mem->getAddressSpace();
  if (AS == AMDGPUAS::FLAT_ADDRESS) {
    SelectCode(N);
    return;
  }

  MVT VT = N->getSimpleValueType(0);
  bool Is32 = (VT == MVT::i32);
  SDLoc SL(N);

  MachineSDNode *CmpSwap = nullptr;
  if (Subtarget->hasAddr64()) {
    SDValue SRsrc, VAddr, SOffset, Offset;

    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
      SDValue CmpVal = Mem->getOperand(2);
      SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);

      // XXX - Do we care about glue operands?

      SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol,
                       Mem->getChain()};

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SDValue SRsrc, SOffset, Offset;
    if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;

      SDValue CmpVal = Mem->getOperand(2);
      SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
      SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()};

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SelectCode(N);
    return;
  }

  MachineMemOperand *MMO = Mem->getMemOperand();
  CurDAG->setNodeMemRefs(CmpSwap, {MMO});

  unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  SDValue Extract
    = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));

  ReplaceUses(SDValue(N, 0), Extract);
  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
  CurDAG->RemoveDeadNode(N);
}

void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !Subtarget->hasGWSSemaReleaseAll()) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane has any
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only sets the low 16 bits, we could leave it as-is and add 1
    // to the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  SmallVector<SDValue, 5> Ops;
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
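// Note on the variable-offset path above: the s_lshl_b32 by 16 places the
// readfirstlane'd offset into bits [31:16] of the value copied to m0, so the
// hardware's M0[21:16] field picks up the low 6 bits of the offset, matching
// the resource-id formula in the comment above.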

void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                             (i32 timm:$attrchan), (i32 timm:$attr),
  //                             (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    if (N->getValueType(0) != MVT::i32)
      break;
    SelectDSAppendConsume(N, IntrID);
    return;
  }
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  unsigned Opcode;
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  default:
    SelectCode(N);
    return;
  }

  SDValue Src = N->getOperand(1);
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    SelectDS_GWS(N, IntrID);
    return;
  default:
    break;
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods,
                                            bool AllowAbs) const {
  Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(0);
  }

  if (AllowAbs && Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(0);
  }

  return true;
}
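// Folding example: In = (fneg (fabs x)) yields Src = x with Mods = NEG | ABS,
// while In = (fabs (fneg x)) only strips the fabs, leaving Src = (fneg x)
// with Mods = ABS, since fneg is peeled before fabs and only at the top
// level.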

bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                        SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  SelectVOP3Mods(In, Src, SrcMods);
  return isNoNanSrc(Src);
}

bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
  if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
    return false;

  Src = In;
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, SDValue &Clamp,
                                         SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods, SDValue &Clamp,
                                          SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3BMods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
                                         SDValue &Clamp, SDValue &Omod) const {
  Src = In;

  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    unsigned VecSize = Src.getValueSizeInBits();
    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo.getValueSizeInBits() > VecSize) {
      Lo = CurDAG->getTargetExtractSubreg(
        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
        MVT::getIntegerVT(VecSize), Lo);
    }

    if (Hi.getValueSizeInBits() > VecSize) {
      Hi = CurDAG->getTargetExtractSubreg(
        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
        MVT::getIntegerVT(VecSize), Hi);
    }

    assert(Lo.getValueSizeInBits() <= VecSize &&
           Hi.getValueSizeInBits() <= VecSize);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register
      // to avoid packing.

      if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
        Src = Lo;
      } else {
        assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);

        SDLoc SL(In);
        SDValue Undef = SDValue(
          CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
                                 Lo.getValueType()), 0);
        auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
                                    : AMDGPU::SReg_64RegClassID;
        const SDValue Ops[] = {
          CurDAG->getTargetConstant(RC, SL, MVT::i32),
          Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
          Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };

        Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
                                             Src.getValueType(), Ops), 0);
      }
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
      uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
                      .bitcastToAPInt().getZExtValue();
      if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
        Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
        SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
        return true;
      }
    }

    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates a conversion from fp16.
    // If the source's op_sel is set, it picks the high half of the source
    // register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}
2760 
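// Decide whether an immediate is better materialized directly in a VGPR:
// return true if some use cannot take an SGPR operand, even after trying to
// commute the using instruction.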
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
    static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *SII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
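  // Bound the scan to the first 10 uses to keep compile time in check; if the
  // budget is exhausted, conservatively treat the immediate as not VGPR-only.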
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be one that needs to be an
    // SGPR, e.g. an inline asm constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If AllUsesAcceptSReg is still false, we have not succeeded in
      // commuting the current user, which means at least one use strictly
      // requires a VGPR. There is no point in trying to commute other users.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
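  // Only report a VGPR-only use if one was actually found and the use scan
  // did not run out of its budget.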
  return !AllUsesAcceptSReg && (Limit < 10);
}

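// A load is treated as uniform if it is non-divergent and either reads the
// constant address space or is a simple, unclobbered global load on
// subtargets that scalarize global loads.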
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto *Ld = cast<LoadSDNode>(N);

  return Ld->getAlignment() >= 4 &&
         (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
            Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
           !N->isDivergent()) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() && !N->isDivergent() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}

void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering &Lowering =
      *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
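  // Repeat the folding loop until it makes no further changes to the DAG.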
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more.
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}