1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Defines an instruction selector for the AMDGPU target.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUISelDAGToDAG.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "MCTargetDesc/R600MCTargetDesc.h"
20 #include "R600RegisterInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/CodeGen/FunctionLoweringInfo.h"
25 #include "llvm/CodeGen/SelectionDAG.h"
26 #include "llvm/CodeGen/SelectionDAGISel.h"
27 #include "llvm/CodeGen/SelectionDAGNodes.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/InitializePasses.h"
30 
31 #ifdef EXPENSIVE_CHECKS
32 #include "llvm/Analysis/LoopInfo.h"
33 #include "llvm/IR/Dominators.h"
34 #endif
35 
36 #define DEBUG_TYPE "amdgpu-isel"
37 
38 using namespace llvm;
39 
40 //===----------------------------------------------------------------------===//
41 // Instruction Selector Implementation
42 //===----------------------------------------------------------------------===//
43 
44 namespace {
45 
46 static SDValue stripBitcast(SDValue Val) {
47   return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
48 }
49 
50 // Figure out if this is really an extract of the high 16-bits of a dword.
51 static bool isExtractHiElt(SDValue In, SDValue &Out) {
52   In = stripBitcast(In);
53 
54   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
55     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
56       if (!Idx->isOne())
57         return false;
58       Out = In.getOperand(0);
59       return true;
60     }
61   }
62 
63   if (In.getOpcode() != ISD::TRUNCATE)
64     return false;
65 
66   SDValue Srl = In.getOperand(0);
67   if (Srl.getOpcode() == ISD::SRL) {
68     if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
69       if (ShiftAmt->getZExtValue() == 16) {
70         Out = stripBitcast(Srl.getOperand(0));
71         return true;
72       }
73     }
74   }
75 
76   return false;
77 }
78 
79 // Look through operations that obscure just looking at the low 16-bits of the
80 // same register.
81 static SDValue stripExtractLoElt(SDValue In) {
82   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
83     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
84       if (Idx->isZero() && In.getValueSizeInBits() <= 32)
85         return In.getOperand(0);
86     }
87   }
88 
89   if (In.getOpcode() == ISD::TRUNCATE) {
90     SDValue Src = In.getOperand(0);
91     if (Src.getValueType().getSizeInBits() == 32)
92       return stripBitcast(Src);
93   }
94 
95   return In;
96 }
97 
98 }  // end anonymous namespace
99 
100 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
101                       "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
102 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
103 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
104 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
105 #ifdef EXPENSIVE_CHECKS
106 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
107 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
108 #endif
109 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
110                     "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
111 
112 /// This pass converts a legalized DAG into a AMDGPU-specific
113 // DAG, ready for instruction scheduling.
114 FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
115                                         CodeGenOpt::Level OptLevel) {
116   return new AMDGPUDAGToDAGISel(TM, OptLevel);
117 }
118 
119 AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(
120     TargetMachine *TM /*= nullptr*/,
121     CodeGenOpt::Level OptLevel /*= CodeGenOpt::Default*/)
122     : SelectionDAGISel(*TM, OptLevel) {
123   EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
124 }
125 
126 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
127 #ifdef EXPENSIVE_CHECKS
128   DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
129   LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
130   for (auto &L : LI->getLoopsInPreorder()) {
131     assert(L->isLCSSAForm(DT));
132   }
133 #endif
134   Subtarget = &MF.getSubtarget<GCNSubtarget>();
135   Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
136   return SelectionDAGISel::runOnMachineFunction(MF);
137 }
138 
139 bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
140   // XXX - only need to list legal operations.
141   switch (Opc) {
142   case ISD::FADD:
143   case ISD::FSUB:
144   case ISD::FMUL:
145   case ISD::FDIV:
146   case ISD::FREM:
147   case ISD::FCANONICALIZE:
148   case ISD::UINT_TO_FP:
149   case ISD::SINT_TO_FP:
150   case ISD::FABS:
151     // Fabs is lowered to a bit operation, but it's an and which will clear the
152     // high bits anyway.
153   case ISD::FSQRT:
154   case ISD::FSIN:
155   case ISD::FCOS:
156   case ISD::FPOWI:
157   case ISD::FPOW:
158   case ISD::FLOG:
159   case ISD::FLOG2:
160   case ISD::FLOG10:
161   case ISD::FEXP:
162   case ISD::FEXP2:
163   case ISD::FCEIL:
164   case ISD::FTRUNC:
165   case ISD::FRINT:
166   case ISD::FNEARBYINT:
167   case ISD::FROUND:
168   case ISD::FFLOOR:
169   case ISD::FMINNUM:
170   case ISD::FMAXNUM:
171   case AMDGPUISD::FRACT:
172   case AMDGPUISD::CLAMP:
173   case AMDGPUISD::COS_HW:
174   case AMDGPUISD::SIN_HW:
175   case AMDGPUISD::FMIN3:
176   case AMDGPUISD::FMAX3:
177   case AMDGPUISD::FMED3:
178   case AMDGPUISD::FMAD_FTZ:
179   case AMDGPUISD::RCP:
180   case AMDGPUISD::RSQ:
181   case AMDGPUISD::RCP_IFLAG:
182   case AMDGPUISD::LDEXP:
183     // On gfx10, all 16-bit instructions preserve the high bits.
184     return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
185   case ISD::FP_ROUND:
186     // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
187     // high bits on gfx9.
188     // TODO: If we had the source node we could see if the source was fma/mad
189     return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
190   case ISD::FMA:
191   case ISD::FMAD:
192   case AMDGPUISD::DIV_FIXUP:
193     return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
194   default:
195     // fcopysign, select and others may be lowered to 32-bit bit operations
196     // which don't zero the high bits.
197     return false;
198   }
199 }
200 
201 void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
202   AU.addRequired<AMDGPUArgumentUsageInfo>();
203   AU.addRequired<LegacyDivergenceAnalysis>();
204 #ifdef EXPENSIVE_CHECKS
205   AU.addRequired<DominatorTreeWrapperPass>();
206   AU.addRequired<LoopInfoWrapperPass>();
207 #endif
208   SelectionDAGISel::getAnalysisUsage(AU);
209 }
210 
211 bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
212   assert(Subtarget->d16PreservesUnusedBits());
213   MVT VT = N->getValueType(0).getSimpleVT();
214   if (VT != MVT::v2i16 && VT != MVT::v2f16)
215     return false;
216 
217   SDValue Lo = N->getOperand(0);
218   SDValue Hi = N->getOperand(1);
219 
220   LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
221 
222   // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
223   // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
224   // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
225 
226   // Need to check for possible indirect dependencies on the other half of the
227   // vector to avoid introducing a cycle.
228   if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
229     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
230 
231     SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
232     SDValue Ops[] = {
233       LdHi->getChain(), LdHi->getBasePtr(), TiedIn
234     };
235 
236     unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
237     if (LdHi->getMemoryVT() == MVT::i8) {
238       LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
239         AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
240     } else {
241       assert(LdHi->getMemoryVT() == MVT::i16);
242     }
243 
244     SDValue NewLoadHi =
245       CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
246                                   Ops, LdHi->getMemoryVT(),
247                                   LdHi->getMemOperand());
248 
249     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
250     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
251     return true;
252   }
253 
254   // build_vector (load ptr), hi -> load_d16_lo ptr, hi
255   // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
256   // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
257   LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
258   if (LdLo && Lo.hasOneUse()) {
259     SDValue TiedIn = getHi16Elt(Hi);
260     if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
261       return false;
262 
263     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
264     unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
265     if (LdLo->getMemoryVT() == MVT::i8) {
266       LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
267         AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
268     } else {
269       assert(LdLo->getMemoryVT() == MVT::i16);
270     }
271 
272     TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
273 
274     SDValue Ops[] = {
275       LdLo->getChain(), LdLo->getBasePtr(), TiedIn
276     };
277 
278     SDValue NewLoadLo =
279       CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
280                                   Ops, LdLo->getMemoryVT(),
281                                   LdLo->getMemOperand());
282 
283     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
284     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
285     return true;
286   }
287 
288   return false;
289 }
290 
291 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
292   if (!Subtarget->d16PreservesUnusedBits())
293     return;
294 
295   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
296 
297   bool MadeChange = false;
298   while (Position != CurDAG->allnodes_begin()) {
299     SDNode *N = &*--Position;
300     if (N->use_empty())
301       continue;
302 
303     switch (N->getOpcode()) {
304     case ISD::BUILD_VECTOR:
305       MadeChange |= matchLoadD16FromBuildVector(N);
306       break;
307     default:
308       break;
309     }
310   }
311 
312   if (MadeChange) {
313     CurDAG->RemoveDeadNodes();
314     LLVM_DEBUG(dbgs() << "After PreProcess:\n";
315                CurDAG->dump(););
316   }
317 }
318 
319 bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
320   if (TM.Options.NoNaNsFPMath)
321     return true;
322 
323   // TODO: Move into isKnownNeverNaN
324   if (N->getFlags().hasNoNaNs())
325     return true;
326 
327   return CurDAG->isKnownNeverNaN(N);
328 }
329 
330 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
331                                            bool Negated) const {
332   if (N->isUndef())
333     return true;
334 
335   const SIInstrInfo *TII = Subtarget->getInstrInfo();
336   if (Negated) {
337     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
338       return TII->isInlineConstant(-C->getAPIntValue());
339 
340     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
341       return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
342 
343   } else {
344     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
345       return TII->isInlineConstant(C->getAPIntValue());
346 
347     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
348       return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
349   }
350 
351   return false;
352 }
353 
354 /// Determine the register class for \p OpNo
355 /// \returns The register class of the virtual register that will be used for
356 /// the given operand number \OpNo or NULL if the register class cannot be
357 /// determined.
358 const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
359                                                           unsigned OpNo) const {
360   if (!N->isMachineOpcode()) {
361     if (N->getOpcode() == ISD::CopyToReg) {
362       Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
363       if (Reg.isVirtual()) {
364         MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
365         return MRI.getRegClass(Reg);
366       }
367 
368       const SIRegisterInfo *TRI
369         = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
370       return TRI->getPhysRegClass(Reg);
371     }
372 
373     return nullptr;
374   }
375 
376   switch (N->getMachineOpcode()) {
377   default: {
378     const MCInstrDesc &Desc =
379         Subtarget->getInstrInfo()->get(N->getMachineOpcode());
380     unsigned OpIdx = Desc.getNumDefs() + OpNo;
381     if (OpIdx >= Desc.getNumOperands())
382       return nullptr;
383     int RegClass = Desc.OpInfo[OpIdx].RegClass;
384     if (RegClass == -1)
385       return nullptr;
386 
387     return Subtarget->getRegisterInfo()->getRegClass(RegClass);
388   }
389   case AMDGPU::REG_SEQUENCE: {
390     unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
391     const TargetRegisterClass *SuperRC =
392         Subtarget->getRegisterInfo()->getRegClass(RCID);
393 
394     SDValue SubRegOp = N->getOperand(OpNo + 1);
395     unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
396     return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
397                                                               SubRegIdx);
398   }
399   }
400 }
401 
402 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
403                                          SDValue Glue) const {
404   SmallVector <SDValue, 8> Ops;
405   Ops.push_back(NewChain); // Replace the chain.
406   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
407     Ops.push_back(N->getOperand(i));
408 
409   Ops.push_back(Glue);
410   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
411 }
412 
413 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
414   const SITargetLowering& Lowering =
415     *static_cast<const SITargetLowering*>(getTargetLowering());
416 
417   assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
418 
419   SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
420   return glueCopyToOp(N, M0, M0.getValue(1));
421 }
422 
423 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
424   unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
425   if (AS == AMDGPUAS::LOCAL_ADDRESS) {
426     if (Subtarget->ldsRequiresM0Init())
427       return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
428   } else if (AS == AMDGPUAS::REGION_ADDRESS) {
429     MachineFunction &MF = CurDAG->getMachineFunction();
430     unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
431     return
432         glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
433   }
434   return N;
435 }
436 
437 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
438                                                   EVT VT) const {
439   SDNode *Lo = CurDAG->getMachineNode(
440       AMDGPU::S_MOV_B32, DL, MVT::i32,
441       CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
442   SDNode *Hi =
443       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
444                              CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
445   const SDValue Ops[] = {
446       CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
447       SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
448       SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
449 
450   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
451 }
452 
453 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
454   EVT VT = N->getValueType(0);
455   unsigned NumVectorElts = VT.getVectorNumElements();
456   EVT EltVT = VT.getVectorElementType();
457   SDLoc DL(N);
458   SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
459 
460   if (NumVectorElts == 1) {
461     CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
462                          RegClass);
463     return;
464   }
465 
466   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
467                                   "supported yet");
468   // 32 = Max Num Vector Elements
469   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
470   // 1 = Vector Register Class
471   SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
472 
473   bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
474                Triple::amdgcn;
475   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
476   bool IsRegSeq = true;
477   unsigned NOps = N->getNumOperands();
478   for (unsigned i = 0; i < NOps; i++) {
479     // XXX: Why is this here?
480     if (isa<RegisterSDNode>(N->getOperand(i))) {
481       IsRegSeq = false;
482       break;
483     }
484     unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
485                          : R600RegisterInfo::getSubRegFromChannel(i);
486     RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
487     RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
488   }
489   if (NOps != NumVectorElts) {
490     // Fill in the missing undef elements if this was a scalar_to_vector.
491     assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
492     MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
493                                                    DL, EltVT);
494     for (unsigned i = NOps; i < NumVectorElts; ++i) {
495       unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
496                            : R600RegisterInfo::getSubRegFromChannel(i);
497       RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
498       RegSeqArgs[1 + (2 * i) + 1] =
499           CurDAG->getTargetConstant(Sub, DL, MVT::i32);
500     }
501   }
502 
503   if (!IsRegSeq)
504     SelectCode(N);
505   CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
506 }
507 
508 void AMDGPUDAGToDAGISel::Select(SDNode *N) {
509   unsigned int Opc = N->getOpcode();
510   if (N->isMachineOpcode()) {
511     N->setNodeId(-1);
512     return;   // Already selected.
513   }
514 
515   // isa<MemSDNode> almost works but is slightly too permissive for some DS
516   // intrinsics.
517   if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
518       (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
519        Opc == ISD::ATOMIC_LOAD_FADD ||
520        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
521        Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
522     N = glueCopyToM0LDSInit(N);
523     SelectCode(N);
524     return;
525   }
526 
527   switch (Opc) {
528   default:
529     break;
530   // We are selecting i64 ADD here instead of custom lower it during
531   // DAG legalization, so we can fold some i64 ADDs used for address
532   // calculation into the LOAD and STORE instructions.
533   case ISD::ADDC:
534   case ISD::ADDE:
535   case ISD::SUBC:
536   case ISD::SUBE: {
537     if (N->getValueType(0) != MVT::i64)
538       break;
539 
540     SelectADD_SUB_I64(N);
541     return;
542   }
543   case ISD::ADDCARRY:
544   case ISD::SUBCARRY:
545     if (N->getValueType(0) != MVT::i32)
546       break;
547 
548     SelectAddcSubb(N);
549     return;
550   case ISD::UADDO:
551   case ISD::USUBO: {
552     SelectUADDO_USUBO(N);
553     return;
554   }
555   case AMDGPUISD::FMUL_W_CHAIN: {
556     SelectFMUL_W_CHAIN(N);
557     return;
558   }
559   case AMDGPUISD::FMA_W_CHAIN: {
560     SelectFMA_W_CHAIN(N);
561     return;
562   }
563 
564   case ISD::SCALAR_TO_VECTOR:
565   case ISD::BUILD_VECTOR: {
566     EVT VT = N->getValueType(0);
567     unsigned NumVectorElts = VT.getVectorNumElements();
568     if (VT.getScalarSizeInBits() == 16) {
569       if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
570         if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
571           ReplaceNode(N, Packed);
572           return;
573         }
574       }
575 
576       break;
577     }
578 
579     assert(VT.getVectorElementType().bitsEq(MVT::i32));
580     unsigned RegClassID =
581         SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
582     SelectBuildVector(N, RegClassID);
583     return;
584   }
585   case ISD::BUILD_PAIR: {
586     SDValue RC, SubReg0, SubReg1;
587     SDLoc DL(N);
588     if (N->getValueType(0) == MVT::i128) {
589       RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
590       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
591       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
592     } else if (N->getValueType(0) == MVT::i64) {
593       RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
594       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
595       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
596     } else {
597       llvm_unreachable("Unhandled value type for BUILD_PAIR");
598     }
599     const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
600                             N->getOperand(1), SubReg1 };
601     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
602                                           N->getValueType(0), Ops));
603     return;
604   }
605 
606   case ISD::Constant:
607   case ISD::ConstantFP: {
608     if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
609       break;
610 
611     uint64_t Imm;
612     if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
613       Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
614     else {
615       ConstantSDNode *C = cast<ConstantSDNode>(N);
616       Imm = C->getZExtValue();
617     }
618 
619     SDLoc DL(N);
620     ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
621     return;
622   }
623   case AMDGPUISD::BFE_I32:
624   case AMDGPUISD::BFE_U32: {
625     // There is a scalar version available, but unlike the vector version which
626     // has a separate operand for the offset and width, the scalar version packs
627     // the width and offset into a single operand. Try to move to the scalar
628     // version if the offsets are constant, so that we can try to keep extended
629     // loads of kernel arguments in SGPRs.
630 
631     // TODO: Technically we could try to pattern match scalar bitshifts of
632     // dynamic values, but it's probably not useful.
633     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
634     if (!Offset)
635       break;
636 
637     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
638     if (!Width)
639       break;
640 
641     bool Signed = Opc == AMDGPUISD::BFE_I32;
642 
643     uint32_t OffsetVal = Offset->getZExtValue();
644     uint32_t WidthVal = Width->getZExtValue();
645 
646     ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
647                             WidthVal));
648     return;
649   }
650   case AMDGPUISD::DIV_SCALE: {
651     SelectDIV_SCALE(N);
652     return;
653   }
654   case AMDGPUISD::MAD_I64_I32:
655   case AMDGPUISD::MAD_U64_U32: {
656     SelectMAD_64_32(N);
657     return;
658   }
659   case ISD::SMUL_LOHI:
660   case ISD::UMUL_LOHI:
661     return SelectMUL_LOHI(N);
662   case ISD::CopyToReg: {
663     const SITargetLowering& Lowering =
664       *static_cast<const SITargetLowering*>(getTargetLowering());
665     N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
666     break;
667   }
668   case ISD::AND:
669   case ISD::SRL:
670   case ISD::SRA:
671   case ISD::SIGN_EXTEND_INREG:
672     if (N->getValueType(0) != MVT::i32)
673       break;
674 
675     SelectS_BFE(N);
676     return;
677   case ISD::BRCOND:
678     SelectBRCOND(N);
679     return;
680   case ISD::FMAD:
681   case ISD::FMA:
682     SelectFMAD_FMA(N);
683     return;
684   case AMDGPUISD::CVT_PKRTZ_F16_F32:
685   case AMDGPUISD::CVT_PKNORM_I16_F32:
686   case AMDGPUISD::CVT_PKNORM_U16_F32:
687   case AMDGPUISD::CVT_PK_U16_U32:
688   case AMDGPUISD::CVT_PK_I16_I32: {
689     // Hack around using a legal type if f16 is illegal.
690     if (N->getValueType(0) == MVT::i32) {
691       MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
692       N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
693                               { N->getOperand(0), N->getOperand(1) });
694       SelectCode(N);
695       return;
696     }
697 
698     break;
699   }
700   case ISD::INTRINSIC_W_CHAIN: {
701     SelectINTRINSIC_W_CHAIN(N);
702     return;
703   }
704   case ISD::INTRINSIC_WO_CHAIN: {
705     SelectINTRINSIC_WO_CHAIN(N);
706     return;
707   }
708   case ISD::INTRINSIC_VOID: {
709     SelectINTRINSIC_VOID(N);
710     return;
711   }
712   }
713 
714   SelectCode(N);
715 }
716 
717 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
718   const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
719   const Instruction *Term = BB->getTerminator();
720   return Term->getMetadata("amdgpu.uniform") ||
721          Term->getMetadata("structurizecfg.uniform");
722 }
723 
724 bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
725                                              unsigned ShAmtBits) const {
726   assert(N->getOpcode() == ISD::AND);
727 
728   const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
729   if (RHS.countTrailingOnes() >= ShAmtBits)
730     return true;
731 
732   const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
733   return (LHSKnownZeros | RHS).countTrailingOnes() >= ShAmtBits;
734 }
735 
736 static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
737                                           SDValue &N0, SDValue &N1) {
738   if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
739       Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
740     // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
741     // (i64 (bitcast (v2i32 (build_vector
742     //                        (or (extract_vector_elt V, 0), OFFSET),
743     //                        (extract_vector_elt V, 1)))))
744     SDValue Lo = Addr.getOperand(0).getOperand(0);
745     if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
746       SDValue BaseLo = Lo.getOperand(0);
747       SDValue BaseHi = Addr.getOperand(0).getOperand(1);
748       // Check that split base (Lo and Hi) are extracted from the same one.
749       if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
750           BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
751           BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
752           // Lo is statically extracted from index 0.
753           isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
754           BaseLo.getConstantOperandVal(1) == 0 &&
755           // Hi is statically extracted from index 0.
756           isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
757           BaseHi.getConstantOperandVal(1) == 1) {
758         N0 = BaseLo.getOperand(0).getOperand(0);
759         N1 = Lo.getOperand(1);
760         return true;
761       }
762     }
763   }
764   return false;
765 }
766 
767 bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
768                                                     SDValue &RHS) const {
769   if (CurDAG->isBaseWithConstantOffset(Addr)) {
770     LHS = Addr.getOperand(0);
771     RHS = Addr.getOperand(1);
772     return true;
773   }
774 
775   if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
776     assert(LHS && RHS && isa<ConstantSDNode>(RHS));
777     return true;
778   }
779 
780   return false;
781 }
782 
783 StringRef AMDGPUDAGToDAGISel::getPassName() const {
784   return "AMDGPU DAG->DAG Pattern Instruction Selection";
785 }
786 
787 //===----------------------------------------------------------------------===//
788 // Complex Patterns
789 //===----------------------------------------------------------------------===//
790 
791 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
792                                             SDValue &Offset) {
793   return false;
794 }
795 
796 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
797                                             SDValue &Offset) {
798   ConstantSDNode *C;
799   SDLoc DL(Addr);
800 
801   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
802     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
803     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
804   } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
805              (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
806     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
807     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
808   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
809             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
810     Base = Addr.getOperand(0);
811     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
812   } else {
813     Base = Addr;
814     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
815   }
816 
817   return true;
818 }
819 
820 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
821                                                        const SDLoc &DL) const {
822   SDNode *Mov = CurDAG->getMachineNode(
823     AMDGPU::S_MOV_B32, DL, MVT::i32,
824     CurDAG->getTargetConstant(Val, DL, MVT::i32));
825   return SDValue(Mov, 0);
826 }
827 
828 // FIXME: Should only handle addcarry/subcarry
829 void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
830   SDLoc DL(N);
831   SDValue LHS = N->getOperand(0);
832   SDValue RHS = N->getOperand(1);
833 
834   unsigned Opcode = N->getOpcode();
835   bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
836   bool ProduceCarry =
837       ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
838   bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
839 
840   SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
841   SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
842 
843   SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
844                                        DL, MVT::i32, LHS, Sub0);
845   SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
846                                        DL, MVT::i32, LHS, Sub1);
847 
848   SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
849                                        DL, MVT::i32, RHS, Sub0);
850   SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
851                                        DL, MVT::i32, RHS, Sub1);
852 
853   SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
854 
855   static const unsigned OpcMap[2][2][2] = {
856       {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
857        {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
858       {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
859        {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
860 
861   unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
862   unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
863 
864   SDNode *AddLo;
865   if (!ConsumeCarry) {
866     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
867     AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
868   } else {
869     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
870     AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
871   }
872   SDValue AddHiArgs[] = {
873     SDValue(Hi0, 0),
874     SDValue(Hi1, 0),
875     SDValue(AddLo, 1)
876   };
877   SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
878 
879   SDValue RegSequenceArgs[] = {
880     CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
881     SDValue(AddLo,0),
882     Sub0,
883     SDValue(AddHi,0),
884     Sub1,
885   };
886   SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
887                                                MVT::i64, RegSequenceArgs);
888 
889   if (ProduceCarry) {
890     // Replace the carry-use
891     ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
892   }
893 
894   // Replace the remaining uses.
895   ReplaceNode(N, RegSequence);
896 }
897 
898 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
899   SDLoc DL(N);
900   SDValue LHS = N->getOperand(0);
901   SDValue RHS = N->getOperand(1);
902   SDValue CI = N->getOperand(2);
903 
904   if (N->isDivergent()) {
905     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
906                                                    : AMDGPU::V_SUBB_U32_e64;
907     CurDAG->SelectNodeTo(
908         N, Opc, N->getVTList(),
909         {LHS, RHS, CI,
910          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
911   } else {
912     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
913                                                    : AMDGPU::S_SUB_CO_PSEUDO;
914     CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
915   }
916 }
917 
918 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
919   // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
920   // carry out despite the _i32 name. These were renamed in VI to _U32.
921   // FIXME: We should probably rename the opcodes here.
922   bool IsAdd = N->getOpcode() == ISD::UADDO;
923   bool IsVALU = N->isDivergent();
924 
925   for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
926        ++UI)
927     if (UI.getUse().getResNo() == 1) {
928       if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
929           (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
930         IsVALU = true;
931         break;
932       }
933     }
934 
935   if (IsVALU) {
936     unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
937 
938     CurDAG->SelectNodeTo(
939         N, Opc, N->getVTList(),
940         {N->getOperand(0), N->getOperand(1),
941          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
942   } else {
943     unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
944                                                 : AMDGPU::S_USUBO_PSEUDO;
945 
946     CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
947                          {N->getOperand(0), N->getOperand(1)});
948   }
949 }
950 
951 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
952   SDLoc SL(N);
953   //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
954   SDValue Ops[10];
955 
956   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
957   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
958   SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
959   Ops[8] = N->getOperand(0);
960   Ops[9] = N->getOperand(4);
961 
962   // If there are no source modifiers, prefer fmac over fma because it can use
963   // the smaller VOP2 encoding.
964   bool UseFMAC = Subtarget->hasDLInsts() &&
965                  cast<ConstantSDNode>(Ops[0])->isZero() &&
966                  cast<ConstantSDNode>(Ops[2])->isZero() &&
967                  cast<ConstantSDNode>(Ops[4])->isZero();
968   unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
969   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
970 }
971 
972 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
973   SDLoc SL(N);
974   //    src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
975   SDValue Ops[8];
976 
977   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
978   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
979   Ops[6] = N->getOperand(0);
980   Ops[7] = N->getOperand(3);
981 
982   CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
983 }
984 
985 // We need to handle this here because tablegen doesn't support matching
986 // instructions with multiple outputs.
987 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
988   SDLoc SL(N);
989   EVT VT = N->getValueType(0);
990 
991   assert(VT == MVT::f32 || VT == MVT::f64);
992 
993   unsigned Opc
994     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
995 
996   // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
997   // omod
998   SDValue Ops[8];
999   SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1000   SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1001   SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1002   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1003 }
1004 
1005 // We need to handle this here because tablegen doesn't support matching
1006 // instructions with multiple outputs.
1007 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1008   SDLoc SL(N);
1009   bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1010   unsigned Opc;
1011   if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
1012     Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1013                  : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1014   else
1015     Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1016 
1017   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1018   SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1019                     Clamp };
1020   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1021 }
1022 
1023 // We need to handle this here because tablegen doesn't support matching
1024 // instructions with multiple outputs.
1025 void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1026   SDLoc SL(N);
1027   bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1028   unsigned Opc;
1029   if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
1030     Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1031                  : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1032   else
1033     Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1034 
1035   SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1036   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1037   SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1038   SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
1039   if (!SDValue(N, 0).use_empty()) {
1040     SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1041     SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1042                                         MVT::i32, SDValue(Mad, 0), Sub0);
1043     ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1044   }
1045   if (!SDValue(N, 1).use_empty()) {
1046     SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1047     SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1048                                         MVT::i32, SDValue(Mad, 0), Sub1);
1049     ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1050   }
1051   CurDAG->RemoveDeadNode(N);
1052 }
1053 
1054 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1055   if (!isUInt<16>(Offset))
1056     return false;
1057 
1058   if (!Base || Subtarget->hasUsableDSOffset() ||
1059       Subtarget->unsafeDSOffsetFoldingEnabled())
1060     return true;
1061 
1062   // On Southern Islands instruction with a negative base value and an offset
1063   // don't seem to work.
1064   return CurDAG->SignBitIsZero(Base);
1065 }
1066 
1067 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1068                                               SDValue &Offset) const {
1069   SDLoc DL(Addr);
1070   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1071     SDValue N0 = Addr.getOperand(0);
1072     SDValue N1 = Addr.getOperand(1);
1073     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1074     if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1075       // (add n0, c0)
1076       Base = N0;
1077       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1078       return true;
1079     }
1080   } else if (Addr.getOpcode() == ISD::SUB) {
1081     // sub C, x -> add (sub 0, x), C
1082     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1083       int64_t ByteOffset = C->getSExtValue();
1084       if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1085         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1086 
1087         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1088         // the known bits in isDSOffsetLegal. We need to emit the selected node
1089         // here, so this is thrown away.
1090         SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1091                                       Zero, Addr.getOperand(1));
1092 
1093         if (isDSOffsetLegal(Sub, ByteOffset)) {
1094           SmallVector<SDValue, 3> Opnds;
1095           Opnds.push_back(Zero);
1096           Opnds.push_back(Addr.getOperand(1));
1097 
1098           // FIXME: Select to VOP3 version for with-carry.
1099           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1100           if (Subtarget->hasAddNoCarry()) {
1101             SubOp = AMDGPU::V_SUB_U32_e64;
1102             Opnds.push_back(
1103                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1104           }
1105 
1106           MachineSDNode *MachineSub =
1107               CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1108 
1109           Base = SDValue(MachineSub, 0);
1110           Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1111           return true;
1112         }
1113       }
1114     }
1115   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1116     // If we have a constant address, prefer to put the constant into the
1117     // offset. This can save moves to load the constant address since multiple
1118     // operations can share the zero base address register, and enables merging
1119     // into read2 / write2 instructions.
1120 
1121     SDLoc DL(Addr);
1122 
1123     if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1124       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1125       MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1126                                  DL, MVT::i32, Zero);
1127       Base = SDValue(MovZero, 0);
1128       Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1129       return true;
1130     }
1131   }
1132 
1133   // default case
1134   Base = Addr;
1135   Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1136   return true;
1137 }
1138 
1139 bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1140                                           unsigned Offset1,
1141                                           unsigned Size) const {
1142   if (Offset0 % Size != 0 || Offset1 % Size != 0)
1143     return false;
1144   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1145     return false;
1146 
1147   if (!Base || Subtarget->hasUsableDSOffset() ||
1148       Subtarget->unsafeDSOffsetFoldingEnabled())
1149     return true;
1150 
1151   // On Southern Islands instruction with a negative base value and an offset
1152   // don't seem to work.
1153   return CurDAG->SignBitIsZero(Base);
1154 }
1155 
1156 // TODO: If offset is too big, put low 16-bit into offset.
1157 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1158                                                    SDValue &Offset0,
1159                                                    SDValue &Offset1) const {
1160   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1161 }
1162 
1163 bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1164                                                     SDValue &Offset0,
1165                                                     SDValue &Offset1) const {
1166   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1167 }
1168 
1169 bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1170                                             SDValue &Offset0, SDValue &Offset1,
1171                                             unsigned Size) const {
1172   SDLoc DL(Addr);
1173 
1174   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1175     SDValue N0 = Addr.getOperand(0);
1176     SDValue N1 = Addr.getOperand(1);
1177     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1178     unsigned OffsetValue0 = C1->getZExtValue();
1179     unsigned OffsetValue1 = OffsetValue0 + Size;
1180 
1181     // (add n0, c0)
1182     if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1183       Base = N0;
1184       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1185       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1186       return true;
1187     }
1188   } else if (Addr.getOpcode() == ISD::SUB) {
1189     // sub C, x -> add (sub 0, x), C
1190     if (const ConstantSDNode *C =
1191             dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1192       unsigned OffsetValue0 = C->getZExtValue();
1193       unsigned OffsetValue1 = OffsetValue0 + Size;
1194 
1195       if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1196         SDLoc DL(Addr);
1197         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1198 
1199         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1200         // the known bits in isDSOffsetLegal. We need to emit the selected node
1201         // here, so this is thrown away.
1202         SDValue Sub =
1203             CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1204 
1205         if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1206           SmallVector<SDValue, 3> Opnds;
1207           Opnds.push_back(Zero);
1208           Opnds.push_back(Addr.getOperand(1));
1209           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1210           if (Subtarget->hasAddNoCarry()) {
1211             SubOp = AMDGPU::V_SUB_U32_e64;
1212             Opnds.push_back(
1213                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1214           }
1215 
1216           MachineSDNode *MachineSub = CurDAG->getMachineNode(
1217               SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1218 
1219           Base = SDValue(MachineSub, 0);
1220           Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1221           Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1222           return true;
1223         }
1224       }
1225     }
1226   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1227     unsigned OffsetValue0 = CAddr->getZExtValue();
1228     unsigned OffsetValue1 = OffsetValue0 + Size;
1229 
1230     if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1231       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1232       MachineSDNode *MovZero =
1233           CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1234       Base = SDValue(MovZero, 0);
1235       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1236       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1237       return true;
1238     }
1239   }
1240 
1241   // default case
1242 
1243   Base = Addr;
1244   Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1245   Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1246   return true;
1247 }
1248 
1249 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1250                                      SDValue &SOffset, SDValue &Offset,
1251                                      SDValue &Offen, SDValue &Idxen,
1252                                      SDValue &Addr64) const {
1253   // Subtarget prefers to use flat instruction
1254   // FIXME: This should be a pattern predicate and not reach here
1255   if (Subtarget->useFlatForGlobal())
1256     return false;
1257 
1258   SDLoc DL(Addr);
1259 
1260   Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1261   Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1262   Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1263   SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1264 
1265   ConstantSDNode *C1 = nullptr;
1266   SDValue N0 = Addr;
1267   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1268     C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1269     if (isUInt<32>(C1->getZExtValue()))
1270       N0 = Addr.getOperand(0);
1271     else
1272       C1 = nullptr;
1273   }
1274 
1275   if (N0.getOpcode() == ISD::ADD) {
1276     // (add N2, N3) -> addr64, or
1277     // (add (add N2, N3), C1) -> addr64
1278     SDValue N2 = N0.getOperand(0);
1279     SDValue N3 = N0.getOperand(1);
1280     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1281 
1282     if (N2->isDivergent()) {
1283       if (N3->isDivergent()) {
1284         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1285         // addr64, and construct the resource from a 0 address.
1286         Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1287         VAddr = N0;
1288       } else {
1289         // N2 is divergent, N3 is not.
1290         Ptr = N3;
1291         VAddr = N2;
1292       }
1293     } else {
1294       // N2 is not divergent.
1295       Ptr = N2;
1296       VAddr = N3;
1297     }
1298     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1299   } else if (N0->isDivergent()) {
1300     // N0 is divergent. Use it as the addr64, and construct the resource from a
1301     // 0 address.
1302     Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1303     VAddr = N0;
1304     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1305   } else {
1306     // N0 -> offset, or
1307     // (N0 + C1) -> offset
1308     VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1309     Ptr = N0;
1310   }
1311 
1312   if (!C1) {
1313     // No offset.
1314     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1315     return true;
1316   }
1317 
1318   if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
1319     // Legal offset for instruction.
1320     Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1321     return true;
1322   }
1323 
1324   // Illegal offset, store it in soffset.
1325   Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1326   SOffset =
1327       SDValue(CurDAG->getMachineNode(
1328                   AMDGPU::S_MOV_B32, DL, MVT::i32,
1329                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1330               0);
1331   return true;
1332 }
1333 
1334 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1335                                            SDValue &VAddr, SDValue &SOffset,
1336                                            SDValue &Offset) const {
1337   SDValue Ptr, Offen, Idxen, Addr64;
1338 
1339   // addr64 bit was removed for volcanic islands.
1340   // FIXME: This should be a pattern predicate and not reach here
1341   if (!Subtarget->hasAddr64())
1342     return false;
1343 
1344   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1345     return false;
1346 
1347   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1348   if (C->getSExtValue()) {
1349     SDLoc DL(Addr);
1350 
1351     const SITargetLowering& Lowering =
1352       *static_cast<const SITargetLowering*>(getTargetLowering());
1353 
1354     SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1355     return true;
1356   }
1357 
1358   return false;
1359 }
1360 
1361 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1362   SDLoc DL(N);
1363 
1364   auto *FI = dyn_cast<FrameIndexSDNode>(N);
1365   SDValue TFI =
1366       FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1367 
1368   // We rebase the base address into an absolute stack address and hence
1369   // use constant 0 for soffset. This value must be retained until
1370   // frame elimination and eliminateFrameIndex will choose the appropriate
1371   // frame register if need be.
1372   return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1373 }
1374 
1375 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1376                                                  SDValue Addr, SDValue &Rsrc,
1377                                                  SDValue &VAddr, SDValue &SOffset,
1378                                                  SDValue &ImmOffset) const {
1379 
1380   SDLoc DL(Addr);
1381   MachineFunction &MF = CurDAG->getMachineFunction();
1382   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1383 
1384   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1385 
1386   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1387     int64_t Imm = CAddr->getSExtValue();
1388     const int64_t NullPtr =
1389         AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1390     // Don't fold null pointer.
1391     if (Imm != NullPtr) {
1392       SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
1393       MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1394         AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1395       VAddr = SDValue(MovHighBits, 0);
1396 
1397       SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1398       ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
1399       return true;
1400     }
1401   }
1402 
1403   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1404     // (add n0, c1)
1405 
1406     SDValue N0 = Addr.getOperand(0);
1407     SDValue N1 = Addr.getOperand(1);
1408 
1409     // Offsets in vaddr must be positive if range checking is enabled.
1410     //
1411     // The total computation of vaddr + soffset + offset must not overflow.  If
1412     // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1413     // overflowing.
1414     //
1415     // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1416     // always perform a range check. If a negative vaddr base index was used,
1417     // this would fail the range check. The overall address computation would
1418     // compute a valid address, but this doesn't happen due to the range
1419     // check. For out-of-bounds MUBUF loads, a 0 is returned.
1420     //
1421     // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1422     // MUBUF vaddr, but not on older subtargets which can only do this if the
1423     // sign bit is known 0.
1424     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1425     if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
1426         (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1427          CurDAG->SignBitIsZero(N0))) {
1428       std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1429       ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1430       return true;
1431     }
1432   }
1433 
1434   // (node)
1435   std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1436   ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1437   return true;
1438 }
1439 
1440 static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1441   if (Val.getOpcode() != ISD::CopyFromReg)
1442     return false;
1443   auto RC =
1444       TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg());
1445   return RC && TRI.isSGPRClass(RC);
1446 }
1447 
1448 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1449                                                   SDValue Addr,
1450                                                   SDValue &SRsrc,
1451                                                   SDValue &SOffset,
1452                                                   SDValue &Offset) const {
1453   const SIRegisterInfo *TRI =
1454       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1455   MachineFunction &MF = CurDAG->getMachineFunction();
1456   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1457   SDLoc DL(Addr);
1458 
1459   // CopyFromReg <sgpr>
1460   if (IsCopyFromSGPR(*TRI, Addr)) {
1461     SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1462     SOffset = Addr;
1463     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1464     return true;
1465   }
1466 
1467   ConstantSDNode *CAddr;
1468   if (Addr.getOpcode() == ISD::ADD) {
1469     // Add (CopyFromReg <sgpr>) <constant>
1470     CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1471     if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1472       return false;
1473     if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1474       return false;
1475 
1476     SOffset = Addr.getOperand(0);
1477   } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1478              SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1479     // <constant>
1480     SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1481   } else {
1482     return false;
1483   }
1484 
1485   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1486 
1487   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1488   return true;
1489 }
1490 
1491 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1492                                            SDValue &SOffset, SDValue &Offset
1493                                            ) const {
1494   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1495   const SIInstrInfo *TII =
1496     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1497 
1498   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1499     return false;
1500 
1501   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1502       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1503       !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1504     uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1505                     APInt::getAllOnes(32).getZExtValue(); // Size
1506     SDLoc DL(Addr);
1507 
1508     const SITargetLowering& Lowering =
1509       *static_cast<const SITargetLowering*>(getTargetLowering());
1510 
1511     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1512     return true;
1513   }
1514   return false;
1515 }
1516 
1517 // Find a load or store from corresponding pattern root.
1518 // Roots may be build_vector, bitconvert or their combinations.
1519 static MemSDNode* findMemSDNode(SDNode *N) {
1520   N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1521   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1522     return MN;
1523   assert(isa<BuildVectorSDNode>(N));
1524   for (SDValue V : N->op_values())
1525     if (MemSDNode *MN =
1526           dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1527       return MN;
1528   llvm_unreachable("cannot find MemSDNode in the pattern!");
1529 }
1530 
1531 bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1532                                               SDValue &VAddr, SDValue &Offset,
1533                                               uint64_t FlatVariant) const {
1534   int64_t OffsetVal = 0;
1535 
1536   unsigned AS = findMemSDNode(N)->getAddressSpace();
1537 
1538   bool CanHaveFlatSegmentOffsetBug =
1539       Subtarget->hasFlatSegmentOffsetBug() &&
1540       FlatVariant == SIInstrFlags::FLAT &&
1541       (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1542 
1543   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1544     SDValue N0, N1;
1545     if (isBaseWithConstantOffset64(Addr, N0, N1)) {
1546       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1547 
1548       const SIInstrInfo *TII = Subtarget->getInstrInfo();
1549       if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1550         Addr = N0;
1551         OffsetVal = COffsetVal;
1552       } else {
1553         // If the offset doesn't fit, put the low bits into the offset field and
1554         // add the rest.
1555         //
1556         // For a FLAT instruction the hardware decides whether to access
1557         // global/scratch/shared memory based on the high bits of vaddr,
1558         // ignoring the offset field, so we have to ensure that when we add
1559         // remainder to vaddr it still points into the same underlying object.
1560         // The easiest way to do that is to make sure that we split the offset
1561         // into two pieces that are both >= 0 or both <= 0.
1562 
1563         SDLoc DL(N);
1564         uint64_t RemainderOffset;
1565 
1566         std::tie(OffsetVal, RemainderOffset) =
1567             TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1568 
1569         SDValue AddOffsetLo =
1570             getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1571         SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1572 
1573         if (Addr.getValueType().getSizeInBits() == 32) {
1574           SmallVector<SDValue, 3> Opnds;
1575           Opnds.push_back(N0);
1576           Opnds.push_back(AddOffsetLo);
1577           unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1578           if (Subtarget->hasAddNoCarry()) {
1579             AddOp = AMDGPU::V_ADD_U32_e64;
1580             Opnds.push_back(Clamp);
1581           }
1582           Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1583         } else {
1584           // TODO: Should this try to use a scalar add pseudo if the base address
1585           // is uniform and saddr is usable?
1586           SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1587           SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1588 
1589           SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1590                                                 DL, MVT::i32, N0, Sub0);
1591           SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1592                                                 DL, MVT::i32, N0, Sub1);
1593 
1594           SDValue AddOffsetHi =
1595               getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1596 
1597           SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1598 
1599           SDNode *Add =
1600               CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1601                                      {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1602 
1603           SDNode *Addc = CurDAG->getMachineNode(
1604               AMDGPU::V_ADDC_U32_e64, DL, VTs,
1605               {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1606 
1607           SDValue RegSequenceArgs[] = {
1608               CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1609               SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1610 
1611           Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1612                                                 MVT::i64, RegSequenceArgs),
1613                          0);
1614         }
1615       }
1616     }
1617   }
1618 
1619   VAddr = Addr;
1620   Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
1621   return true;
1622 }
1623 
1624 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1625                                           SDValue &VAddr,
1626                                           SDValue &Offset) const {
1627   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1628 }
1629 
1630 bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1631                                             SDValue &VAddr,
1632                                             SDValue &Offset) const {
1633   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1634 }
1635 
1636 bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1637                                              SDValue &VAddr,
1638                                              SDValue &Offset) const {
1639   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1640                               SIInstrFlags::FlatScratch);
1641 }
1642 
1643 // If this matches zero_extend i32:x, return x
1644 static SDValue matchZExtFromI32(SDValue Op) {
1645   if (Op.getOpcode() != ISD::ZERO_EXTEND)
1646     return SDValue();
1647 
1648   SDValue ExtSrc = Op.getOperand(0);
1649   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1650 }
1651 
1652 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1653 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1654                                            SDValue Addr,
1655                                            SDValue &SAddr,
1656                                            SDValue &VOffset,
1657                                            SDValue &Offset) const {
1658   int64_t ImmOffset = 0;
1659 
1660   // Match the immediate offset first, which canonically is moved as low as
1661   // possible.
1662 
1663   SDValue LHS, RHS;
1664   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1665     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1666     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1667 
1668     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1669                                SIInstrFlags::FlatGlobal)) {
1670       Addr = LHS;
1671       ImmOffset = COffsetVal;
1672     } else if (!LHS->isDivergent()) {
1673       if (COffsetVal > 0) {
1674         SDLoc SL(N);
1675         // saddr + large_offset -> saddr +
1676         //                         (voffset = large_offset & ~MaxOffset) +
1677         //                         (large_offset & MaxOffset);
1678         int64_t SplitImmOffset, RemainderOffset;
1679         std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1680             COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1681 
1682         if (isUInt<32>(RemainderOffset)) {
1683           SDNode *VMov = CurDAG->getMachineNode(
1684               AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1685               CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1686           VOffset = SDValue(VMov, 0);
1687           SAddr = LHS;
1688           Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1689           return true;
1690         }
1691       }
1692 
1693       // We are adding a 64 bit SGPR and a constant. If constant bus limit
1694       // is 1 we would need to perform 1 or 2 extra moves for each half of
1695       // the constant and it is better to do a scalar add and then issue a
1696       // single VALU instruction to materialize zero. Otherwise it is less
1697       // instructions to perform VALU adds with immediates or inline literals.
1698       unsigned NumLiterals =
1699           !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1700           !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1701       if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1702         return false;
1703     }
1704   }
1705 
1706   // Match the variable offset.
1707   if (Addr.getOpcode() == ISD::ADD) {
1708     LHS = Addr.getOperand(0);
1709     RHS = Addr.getOperand(1);
1710 
1711     if (!LHS->isDivergent()) {
1712       // add (i64 sgpr), (zero_extend (i32 vgpr))
1713       if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1714         SAddr = LHS;
1715         VOffset = ZextRHS;
1716       }
1717     }
1718 
1719     if (!SAddr && !RHS->isDivergent()) {
1720       // add (zero_extend (i32 vgpr)), (i64 sgpr)
1721       if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1722         SAddr = RHS;
1723         VOffset = ZextLHS;
1724       }
1725     }
1726 
1727     if (SAddr) {
1728       Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1729       return true;
1730     }
1731   }
1732 
1733   if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1734       isa<ConstantSDNode>(Addr))
1735     return false;
1736 
1737   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1738   // moves required to copy a 64-bit SGPR to VGPR.
1739   SAddr = Addr;
1740   SDNode *VMov =
1741       CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1742                              CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1743   VOffset = SDValue(VMov, 0);
1744   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1745   return true;
1746 }
1747 
1748 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1749   if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1750     SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1751   } else if (SAddr.getOpcode() == ISD::ADD &&
1752              isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1753     // Materialize this into a scalar move for scalar address to avoid
1754     // readfirstlane.
1755     auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1756     SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1757                                               FI->getValueType(0));
1758     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1759                                            MVT::i32, TFI, SAddr.getOperand(1)),
1760                     0);
1761   }
1762 
1763   return SAddr;
1764 }
1765 
1766 // Match (32-bit SGPR base) + sext(imm offset)
1767 bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1768                                             SDValue &SAddr,
1769                                             SDValue &Offset) const {
1770   if (Addr->isDivergent())
1771     return false;
1772 
1773   SDLoc DL(Addr);
1774 
1775   int64_t COffsetVal = 0;
1776 
1777   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1778     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1779     SAddr = Addr.getOperand(0);
1780   } else {
1781     SAddr = Addr;
1782   }
1783 
1784   SAddr = SelectSAddrFI(CurDAG, SAddr);
1785 
1786   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1787 
1788   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1789                               SIInstrFlags::FlatScratch)) {
1790     int64_t SplitImmOffset, RemainderOffset;
1791     std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1792         COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1793 
1794     COffsetVal = SplitImmOffset;
1795 
1796     SDValue AddOffset =
1797         SAddr.getOpcode() == ISD::TargetFrameIndex
1798             ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1799             : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1800     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1801                                            SAddr, AddOffset),
1802                     0);
1803   }
1804 
1805   Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1806 
1807   return true;
1808 }
1809 
1810 // Check whether the flat scratch SVS swizzle bug affects this access.
1811 bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1812     SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1813   if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1814     return false;
1815 
1816   // The bug affects the swizzling of SVS accesses if there is any carry out
1817   // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1818   // voffset to (soffset + inst_offset).
1819   KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1820   KnownBits SKnown = KnownBits::computeForAddSub(
1821       true, false, CurDAG->computeKnownBits(SAddr),
1822       KnownBits::makeConstant(APInt(32, ImmOffset)));
1823   uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1824   uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1825   return (VMax & 3) + (SMax & 3) >= 4;
1826 }
1827 
1828 bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1829                                              SDValue &VAddr, SDValue &SAddr,
1830                                              SDValue &Offset) const  {
1831   int64_t ImmOffset = 0;
1832 
1833   SDValue LHS, RHS;
1834   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1835     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1836     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1837 
1838     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1839       Addr = LHS;
1840       ImmOffset = COffsetVal;
1841     } else if (!LHS->isDivergent() && COffsetVal > 0) {
1842       SDLoc SL(N);
1843       // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1844       //                         (large_offset & MaxOffset);
1845       int64_t SplitImmOffset, RemainderOffset;
1846       std::tie(SplitImmOffset, RemainderOffset)
1847         = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1848 
1849       if (isUInt<32>(RemainderOffset)) {
1850         SDNode *VMov = CurDAG->getMachineNode(
1851           AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1852           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1853         VAddr = SDValue(VMov, 0);
1854         SAddr = LHS;
1855         if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1856           return false;
1857         Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1858         return true;
1859       }
1860     }
1861   }
1862 
1863   if (Addr.getOpcode() != ISD::ADD)
1864     return false;
1865 
1866   LHS = Addr.getOperand(0);
1867   RHS = Addr.getOperand(1);
1868 
1869   if (!LHS->isDivergent() && RHS->isDivergent()) {
1870     SAddr = LHS;
1871     VAddr = RHS;
1872   } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1873     SAddr = RHS;
1874     VAddr = LHS;
1875   } else {
1876     return false;
1877   }
1878 
1879   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
1880     return false;
1881   SAddr = SelectSAddrFI(CurDAG, SAddr);
1882   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1883   return true;
1884 }
1885 
1886 // Match an immediate (if Imm is true) or an SGPR (if Imm is false)
1887 // offset. If Imm32Only is true, match only 32-bit immediate offsets
1888 // available on CI.
1889 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue Addr, SDValue ByteOffsetNode,
1890                                           SDValue *SOffset, SDValue *Offset,
1891                                           bool Imm32Only) const {
1892   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1893   if (!C) {
1894     if (!SOffset)
1895       return false;
1896     if (ByteOffsetNode.getValueType().isScalarInteger() &&
1897         ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1898       *SOffset = ByteOffsetNode;
1899       return true;
1900     }
1901     if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
1902       if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
1903         *SOffset = ByteOffsetNode.getOperand(0);
1904         return true;
1905       }
1906     }
1907     return false;
1908   }
1909 
1910   SDLoc SL(ByteOffsetNode);
1911   // GFX9 and GFX10 have signed byte immediate offsets.
1912   int64_t ByteOffset = C->getSExtValue();
1913   Optional<int64_t> EncodedOffset =
1914       AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
1915   if (EncodedOffset && Offset && !Imm32Only) {
1916     *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1917     return true;
1918   }
1919 
1920   // SGPR and literal offsets are unsigned.
1921   if (ByteOffset < 0)
1922     return false;
1923 
1924   EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
1925   if (EncodedOffset && Offset && Imm32Only) {
1926     *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1927     return true;
1928   }
1929 
1930   if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
1931     return false;
1932 
1933   if (SOffset) {
1934     SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
1935     *SOffset = SDValue(
1936         CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
1937     return true;
1938   }
1939 
1940   return false;
1941 }
1942 
1943 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
1944   if (Addr.getValueType() != MVT::i32)
1945     return Addr;
1946 
1947   // Zero-extend a 32-bit address.
1948   SDLoc SL(Addr);
1949 
1950   const MachineFunction &MF = CurDAG->getMachineFunction();
1951   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1952   unsigned AddrHiVal = Info->get32BitAddressHighBits();
1953   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
1954 
1955   const SDValue Ops[] = {
1956     CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
1957     Addr,
1958     CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
1959     SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
1960             0),
1961     CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
1962   };
1963 
1964   return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
1965                                         Ops), 0);
1966 }
1967 
1968 // Match a base and an immediate (if Imm is true) or an SGPR
1969 // (if Imm is false) offset. If Imm32Only is true, match only 32-bit
1970 // immediate offsets available on CI.
1971 bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
1972                                               SDValue *SOffset, SDValue *Offset,
1973                                               bool Imm32Only) const {
1974   SDLoc SL(Addr);
1975 
1976   if (SOffset && Offset) {
1977     assert(!Imm32Only);
1978     SDValue B;
1979     return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
1980            SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
1981   }
1982 
1983   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
1984   // wraparound, because s_load instructions perform the addition in 64 bits.
1985   if ((Addr.getValueType() != MVT::i32 ||
1986        Addr->getFlags().hasNoUnsignedWrap())) {
1987     SDValue N0, N1;
1988     // Extract the base and offset if possible.
1989     if (CurDAG->isBaseWithConstantOffset(Addr) ||
1990         Addr.getOpcode() == ISD::ADD) {
1991       N0 = Addr.getOperand(0);
1992       N1 = Addr.getOperand(1);
1993     } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
1994       assert(N0 && N1 && isa<ConstantSDNode>(N1));
1995     }
1996     if (N0 && N1) {
1997       if (SelectSMRDOffset(N0, N1, SOffset, Offset, Imm32Only)) {
1998         SBase = N0;
1999         return true;
2000       }
2001       if (SelectSMRDOffset(N1, N0, SOffset, Offset, Imm32Only)) {
2002         SBase = N1;
2003         return true;
2004       }
2005     }
2006     return false;
2007   }
2008   if (Offset && !SOffset) {
2009     SBase = Addr;
2010     *Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
2011     return true;
2012   }
2013   return false;
2014 }
2015 
2016 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2017                                     SDValue *SOffset, SDValue *Offset,
2018                                     bool Imm32Only) const {
2019   if (!SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only))
2020     return false;
2021   SBase = Expand32BitAddress(SBase);
2022   return true;
2023 }
2024 
2025 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2026                                        SDValue &Offset) const {
2027   return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2028 }
2029 
2030 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2031                                          SDValue &Offset) const {
2032   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2033   return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2034                     /* Imm32Only */ true);
2035 }
2036 
2037 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2038                                         SDValue &SOffset) const {
2039   return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2040 }
2041 
2042 bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2043                                            SDValue &SOffset,
2044                                            SDValue &Offset) const {
2045   return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2046 }
2047 
2048 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
2049                                              SDValue &Offset) const {
2050   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2051     // The immediate offset for S_BUFFER instructions is unsigned.
2052     if (auto Imm =
2053             AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
2054       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2055       return true;
2056     }
2057   }
2058 
2059   return false;
2060 }
2061 
2062 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
2063                                                SDValue &Offset) const {
2064   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2065 
2066   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2067     if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
2068                                                          C->getZExtValue())) {
2069       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2070       return true;
2071     }
2072   }
2073 
2074   return false;
2075 }
2076 
2077 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2078                                             SDValue &Base,
2079                                             SDValue &Offset) const {
2080   SDLoc DL(Index);
2081 
2082   if (CurDAG->isBaseWithConstantOffset(Index)) {
2083     SDValue N0 = Index.getOperand(0);
2084     SDValue N1 = Index.getOperand(1);
2085     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2086 
2087     // (add n0, c0)
2088     // Don't peel off the offset (c0) if doing so could possibly lead
2089     // the base (n0) to be negative.
2090     // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2091     if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2092         (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2093       Base = N0;
2094       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2095       return true;
2096     }
2097   }
2098 
2099   if (isa<ConstantSDNode>(Index))
2100     return false;
2101 
2102   Base = Index;
2103   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2104   return true;
2105 }
2106 
2107 SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2108                                      SDValue Val, uint32_t Offset,
2109                                      uint32_t Width) {
2110   if (Val->isDivergent()) {
2111     unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2112     SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2113     SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2114 
2115     return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2116   }
2117   unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2118   // Transformation function, pack the offset and width of a BFE into
2119   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2120   // source, bits [5:0] contain the offset and bits [22:16] the width.
2121   uint32_t PackedVal = Offset | (Width << 16);
2122   SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2123 
2124   return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2125 }
2126 
2127 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2128   // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2129   // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2130   // Predicate: 0 < b <= c < 32
2131 
2132   const SDValue &Shl = N->getOperand(0);
2133   ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2134   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2135 
2136   if (B && C) {
2137     uint32_t BVal = B->getZExtValue();
2138     uint32_t CVal = C->getZExtValue();
2139 
2140     if (0 < BVal && BVal <= CVal && CVal < 32) {
2141       bool Signed = N->getOpcode() == ISD::SRA;
2142       ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2143                   32 - CVal));
2144       return;
2145     }
2146   }
2147   SelectCode(N);
2148 }
2149 
2150 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2151   switch (N->getOpcode()) {
2152   case ISD::AND:
2153     if (N->getOperand(0).getOpcode() == ISD::SRL) {
2154       // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2155       // Predicate: isMask(mask)
2156       const SDValue &Srl = N->getOperand(0);
2157       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2158       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2159 
2160       if (Shift && Mask) {
2161         uint32_t ShiftVal = Shift->getZExtValue();
2162         uint32_t MaskVal = Mask->getZExtValue();
2163 
2164         if (isMask_32(MaskVal)) {
2165           uint32_t WidthVal = countPopulation(MaskVal);
2166           ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2167                                   WidthVal));
2168           return;
2169         }
2170       }
2171     }
2172     break;
2173   case ISD::SRL:
2174     if (N->getOperand(0).getOpcode() == ISD::AND) {
2175       // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2176       // Predicate: isMask(mask >> b)
2177       const SDValue &And = N->getOperand(0);
2178       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2179       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2180 
2181       if (Shift && Mask) {
2182         uint32_t ShiftVal = Shift->getZExtValue();
2183         uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2184 
2185         if (isMask_32(MaskVal)) {
2186           uint32_t WidthVal = countPopulation(MaskVal);
2187           ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2188                       WidthVal));
2189           return;
2190         }
2191       }
2192     } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2193       SelectS_BFEFromShifts(N);
2194       return;
2195     }
2196     break;
2197   case ISD::SRA:
2198     if (N->getOperand(0).getOpcode() == ISD::SHL) {
2199       SelectS_BFEFromShifts(N);
2200       return;
2201     }
2202     break;
2203 
2204   case ISD::SIGN_EXTEND_INREG: {
2205     // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2206     SDValue Src = N->getOperand(0);
2207     if (Src.getOpcode() != ISD::SRL)
2208       break;
2209 
2210     const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2211     if (!Amt)
2212       break;
2213 
2214     unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2215     ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2216                             Amt->getZExtValue(), Width));
2217     return;
2218   }
2219   }
2220 
2221   SelectCode(N);
2222 }
2223 
2224 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2225   assert(N->getOpcode() == ISD::BRCOND);
2226   if (!N->hasOneUse())
2227     return false;
2228 
2229   SDValue Cond = N->getOperand(1);
2230   if (Cond.getOpcode() == ISD::CopyToReg)
2231     Cond = Cond.getOperand(2);
2232 
2233   if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2234     return false;
2235 
2236   MVT VT = Cond.getOperand(0).getSimpleValueType();
2237   if (VT == MVT::i32)
2238     return true;
2239 
2240   if (VT == MVT::i64) {
2241     auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2242 
2243     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2244     return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2245   }
2246 
2247   return false;
2248 }
2249 
2250 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2251   SDValue Cond = N->getOperand(1);
2252 
2253   if (Cond.isUndef()) {
2254     CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2255                          N->getOperand(2), N->getOperand(0));
2256     return;
2257   }
2258 
2259   const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2260   const SIRegisterInfo *TRI = ST->getRegisterInfo();
2261 
2262   bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2263   unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2264   Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2265   SDLoc SL(N);
2266 
2267   if (!UseSCCBr) {
2268     // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
2269     // analyzed what generates the vcc value, so we do not know whether vcc
2270     // bits for disabled lanes are 0.  Thus we need to mask out bits for
2271     // disabled lanes.
2272     //
2273     // For the case that we select S_CBRANCH_SCC1 and it gets
2274     // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2275     // SIInstrInfo::moveToVALU which inserts the S_AND).
2276     //
2277     // We could add an analysis of what generates the vcc value here and omit
2278     // the S_AND when is unnecessary. But it would be better to add a separate
2279     // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2280     // catches both cases.
2281     Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2282                                                          : AMDGPU::S_AND_B64,
2283                      SL, MVT::i1,
2284                      CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2285                                                         : AMDGPU::EXEC,
2286                                          MVT::i1),
2287                     Cond),
2288                    0);
2289   }
2290 
2291   SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2292   CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2293                        N->getOperand(2), // Basic Block
2294                        VCC.getValue(0));
2295 }
2296 
2297 void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
2298   MVT VT = N->getSimpleValueType(0);
2299   bool IsFMA = N->getOpcode() == ISD::FMA;
2300   if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
2301                          !Subtarget->hasFmaMixInsts()) ||
2302       ((IsFMA && Subtarget->hasMadMixInsts()) ||
2303        (!IsFMA && Subtarget->hasFmaMixInsts()))) {
2304     SelectCode(N);
2305     return;
2306   }
2307 
2308   SDValue Src0 = N->getOperand(0);
2309   SDValue Src1 = N->getOperand(1);
2310   SDValue Src2 = N->getOperand(2);
2311   unsigned Src0Mods, Src1Mods, Src2Mods;
2312 
2313   // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
2314   // using the conversion from f16.
2315   bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
2316   bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
2317   bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
2318 
2319   assert((IsFMA || !Mode.allFP32Denormals()) &&
2320          "fmad selected with denormals enabled");
2321   // TODO: We can select this with f32 denormals enabled if all the sources are
2322   // converted from f16 (in which case fmad isn't legal).
2323 
2324   if (Sel0 || Sel1 || Sel2) {
2325     // For dummy operands.
2326     SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2327     SDValue Ops[] = {
2328       CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
2329       CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
2330       CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
2331       CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
2332       Zero, Zero
2333     };
2334 
2335     CurDAG->SelectNodeTo(N,
2336                          IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
2337                          MVT::f32, Ops);
2338   } else {
2339     SelectCode(N);
2340   }
2341 }
2342 
2343 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2344   // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2345   // be copied to an SGPR with readfirstlane.
2346   unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2347     AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2348 
2349   SDValue Chain = N->getOperand(0);
2350   SDValue Ptr = N->getOperand(2);
2351   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2352   MachineMemOperand *MMO = M->getMemOperand();
2353   bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2354 
2355   SDValue Offset;
2356   if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2357     SDValue PtrBase = Ptr.getOperand(0);
2358     SDValue PtrOffset = Ptr.getOperand(1);
2359 
2360     const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
2361     if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2362       N = glueCopyToM0(N, PtrBase);
2363       Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2364     }
2365   }
2366 
2367   if (!Offset) {
2368     N = glueCopyToM0(N, Ptr);
2369     Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2370   }
2371 
2372   SDValue Ops[] = {
2373     Offset,
2374     CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2375     Chain,
2376     N->getOperand(N->getNumOperands() - 1) // New glue
2377   };
2378 
2379   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2380   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2381 }
2382 
2383 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2384   switch (IntrID) {
2385   case Intrinsic::amdgcn_ds_gws_init:
2386     return AMDGPU::DS_GWS_INIT;
2387   case Intrinsic::amdgcn_ds_gws_barrier:
2388     return AMDGPU::DS_GWS_BARRIER;
2389   case Intrinsic::amdgcn_ds_gws_sema_v:
2390     return AMDGPU::DS_GWS_SEMA_V;
2391   case Intrinsic::amdgcn_ds_gws_sema_br:
2392     return AMDGPU::DS_GWS_SEMA_BR;
2393   case Intrinsic::amdgcn_ds_gws_sema_p:
2394     return AMDGPU::DS_GWS_SEMA_P;
2395   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2396     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2397   default:
2398     llvm_unreachable("not a gws intrinsic");
2399   }
2400 }
2401 
2402 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2403   if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2404       !Subtarget->hasGWSSemaReleaseAll()) {
2405     // Let this error.
2406     SelectCode(N);
2407     return;
2408   }
2409 
2410   // Chain, intrinsic ID, vsrc, offset
2411   const bool HasVSrc = N->getNumOperands() == 4;
2412   assert(HasVSrc || N->getNumOperands() == 3);
2413 
2414   SDLoc SL(N);
2415   SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2416   int ImmOffset = 0;
2417   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2418   MachineMemOperand *MMO = M->getMemOperand();
2419 
2420   // Don't worry if the offset ends up in a VGPR. Only one lane will have
2421   // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2422 
2423   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2424   // offset field) % 64. Some versions of the programming guide omit the m0
2425   // part, or claim it's from offset 0.
2426   if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2427     // If we have a constant offset, try to use the 0 in m0 as the base.
2428     // TODO: Look into changing the default m0 initialization value. If the
2429     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2430     // the immediate offset.
2431     glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2432     ImmOffset = ConstOffset->getZExtValue();
2433   } else {
2434     if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2435       ImmOffset = BaseOffset.getConstantOperandVal(1);
2436       BaseOffset = BaseOffset.getOperand(0);
2437     }
2438 
2439     // Prefer to do the shift in an SGPR since it should be possible to use m0
2440     // as the result directly. If it's already an SGPR, it will be eliminated
2441     // later.
2442     SDNode *SGPROffset
2443       = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2444                                BaseOffset);
2445     // Shift to offset in m0
2446     SDNode *M0Base
2447       = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2448                                SDValue(SGPROffset, 0),
2449                                CurDAG->getTargetConstant(16, SL, MVT::i32));
2450     glueCopyToM0(N, SDValue(M0Base, 0));
2451   }
2452 
2453   SDValue Chain = N->getOperand(0);
2454   SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2455 
2456   const unsigned Opc = gwsIntrinToOpcode(IntrID);
2457   SmallVector<SDValue, 5> Ops;
2458   if (HasVSrc)
2459     Ops.push_back(N->getOperand(2));
2460   Ops.push_back(OffsetField);
2461   Ops.push_back(Chain);
2462 
2463   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2464   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2465 }
2466 
2467 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2468   if (Subtarget->getLDSBankCount() != 16) {
2469     // This is a single instruction with a pattern.
2470     SelectCode(N);
2471     return;
2472   }
2473 
2474   SDLoc DL(N);
2475 
2476   // This requires 2 instructions. It is possible to write a pattern to support
2477   // this, but the generated isel emitter doesn't correctly deal with multiple
2478   // output instructions using the same physical register input. The copy to m0
2479   // is incorrectly placed before the second instruction.
2480   //
2481   // TODO: Match source modifiers.
2482   //
2483   // def : Pat <
2484   //   (int_amdgcn_interp_p1_f16
2485   //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
2486   //                             (i32 timm:$attrchan), (i32 timm:$attr),
2487   //                             (i1 timm:$high), M0),
2488   //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2489   //       timm:$attrchan, 0,
2490   //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2491   //   let Predicates = [has16BankLDS];
2492   // }
2493 
2494   // 16 bank LDS
2495   SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2496                                       N->getOperand(5), SDValue());
2497 
2498   SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2499 
2500   SDNode *InterpMov =
2501     CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2502         CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2503         N->getOperand(3),  // Attr
2504         N->getOperand(2),  // Attrchan
2505         ToM0.getValue(1) // In glue
2506   });
2507 
2508   SDNode *InterpP1LV =
2509     CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2510         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2511         N->getOperand(1), // Src0
2512         N->getOperand(3), // Attr
2513         N->getOperand(2), // Attrchan
2514         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2515         SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2516         N->getOperand(4), // high
2517         CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2518         CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2519         SDValue(InterpMov, 1)
2520   });
2521 
2522   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2523 }
2524 
2525 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2526   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2527   switch (IntrID) {
2528   case Intrinsic::amdgcn_ds_append:
2529   case Intrinsic::amdgcn_ds_consume: {
2530     if (N->getValueType(0) != MVT::i32)
2531       break;
2532     SelectDSAppendConsume(N, IntrID);
2533     return;
2534   }
2535   }
2536 
2537   SelectCode(N);
2538 }
2539 
2540 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2541   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2542   unsigned Opcode;
2543   switch (IntrID) {
2544   case Intrinsic::amdgcn_wqm:
2545     Opcode = AMDGPU::WQM;
2546     break;
2547   case Intrinsic::amdgcn_softwqm:
2548     Opcode = AMDGPU::SOFT_WQM;
2549     break;
2550   case Intrinsic::amdgcn_wwm:
2551   case Intrinsic::amdgcn_strict_wwm:
2552     Opcode = AMDGPU::STRICT_WWM;
2553     break;
2554   case Intrinsic::amdgcn_strict_wqm:
2555     Opcode = AMDGPU::STRICT_WQM;
2556     break;
2557   case Intrinsic::amdgcn_interp_p1_f16:
2558     SelectInterpP1F16(N);
2559     return;
2560   default:
2561     SelectCode(N);
2562     return;
2563   }
2564 
2565   SDValue Src = N->getOperand(1);
2566   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2567 }
2568 
2569 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2570   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2571   switch (IntrID) {
2572   case Intrinsic::amdgcn_ds_gws_init:
2573   case Intrinsic::amdgcn_ds_gws_barrier:
2574   case Intrinsic::amdgcn_ds_gws_sema_v:
2575   case Intrinsic::amdgcn_ds_gws_sema_br:
2576   case Intrinsic::amdgcn_ds_gws_sema_p:
2577   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2578     SelectDS_GWS(N, IntrID);
2579     return;
2580   default:
2581     break;
2582   }
2583 
2584   SelectCode(N);
2585 }
2586 
2587 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2588                                             unsigned &Mods,
2589                                             bool AllowAbs) const {
2590   Mods = 0;
2591   Src = In;
2592 
2593   if (Src.getOpcode() == ISD::FNEG) {
2594     Mods |= SISrcMods::NEG;
2595     Src = Src.getOperand(0);
2596   }
2597 
2598   if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2599     Mods |= SISrcMods::ABS;
2600     Src = Src.getOperand(0);
2601   }
2602 
2603   return true;
2604 }
2605 
2606 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2607                                         SDValue &SrcMods) const {
2608   unsigned Mods;
2609   if (SelectVOP3ModsImpl(In, Src, Mods)) {
2610     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2611     return true;
2612   }
2613 
2614   return false;
2615 }
2616 
2617 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2618                                          SDValue &SrcMods) const {
2619   unsigned Mods;
2620   if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
2621     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2622     return true;
2623   }
2624 
2625   return false;
2626 }
2627 
2628 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
2629                                              SDValue &SrcMods) const {
2630   SelectVOP3Mods(In, Src, SrcMods);
2631   return isNoNanSrc(Src);
2632 }
2633 
2634 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2635   if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2636     return false;
2637 
2638   Src = In;
2639   return true;
2640 }
2641 
2642 bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2643                                                SDValue &SrcMods,
2644                                                bool OpSel) const {
2645   unsigned Mods;
2646   if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
2647     if (OpSel)
2648       Mods |= SISrcMods::OP_SEL_0;
2649     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2650     return true;
2651   }
2652 
2653   return false;
2654 }
2655 
2656 bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2657                                            SDValue &SrcMods) const {
2658   return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2659 }
2660 
2661 bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2662                                              SDValue &SrcMods) const {
2663   return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2664 }
2665 
2666 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2667                                          SDValue &SrcMods, SDValue &Clamp,
2668                                          SDValue &Omod) const {
2669   SDLoc DL(In);
2670   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2671   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2672 
2673   return SelectVOP3Mods(In, Src, SrcMods);
2674 }
2675 
2676 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2677                                           SDValue &SrcMods, SDValue &Clamp,
2678                                           SDValue &Omod) const {
2679   SDLoc DL(In);
2680   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2681   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2682 
2683   return SelectVOP3BMods(In, Src, SrcMods);
2684 }
2685 
2686 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2687                                          SDValue &Clamp, SDValue &Omod) const {
2688   Src = In;
2689 
2690   SDLoc DL(In);
2691   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2692   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2693 
2694   return true;
2695 }
2696 
2697 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2698                                          SDValue &SrcMods, bool IsDOT) const {
2699   unsigned Mods = 0;
2700   Src = In;
2701 
2702   if (Src.getOpcode() == ISD::FNEG) {
2703     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2704     Src = Src.getOperand(0);
2705   }
2706 
2707   if (Src.getOpcode() == ISD::BUILD_VECTOR &&
2708       (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2709     unsigned VecMods = Mods;
2710 
2711     SDValue Lo = stripBitcast(Src.getOperand(0));
2712     SDValue Hi = stripBitcast(Src.getOperand(1));
2713 
2714     if (Lo.getOpcode() == ISD::FNEG) {
2715       Lo = stripBitcast(Lo.getOperand(0));
2716       Mods ^= SISrcMods::NEG;
2717     }
2718 
2719     if (Hi.getOpcode() == ISD::FNEG) {
2720       Hi = stripBitcast(Hi.getOperand(0));
2721       Mods ^= SISrcMods::NEG_HI;
2722     }
2723 
2724     if (isExtractHiElt(Lo, Lo))
2725       Mods |= SISrcMods::OP_SEL_0;
2726 
2727     if (isExtractHiElt(Hi, Hi))
2728       Mods |= SISrcMods::OP_SEL_1;
2729 
2730     unsigned VecSize = Src.getValueSizeInBits();
2731     Lo = stripExtractLoElt(Lo);
2732     Hi = stripExtractLoElt(Hi);
2733 
2734     if (Lo.getValueSizeInBits() > VecSize) {
2735       Lo = CurDAG->getTargetExtractSubreg(
2736         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2737         MVT::getIntegerVT(VecSize), Lo);
2738     }
2739 
2740     if (Hi.getValueSizeInBits() > VecSize) {
2741       Hi = CurDAG->getTargetExtractSubreg(
2742         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2743         MVT::getIntegerVT(VecSize), Hi);
2744     }
2745 
2746     assert(Lo.getValueSizeInBits() <= VecSize &&
2747            Hi.getValueSizeInBits() <= VecSize);
2748 
2749     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2750       // Really a scalar input. Just select from the low half of the register to
2751       // avoid packing.
2752 
2753       if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2754         Src = Lo;
2755       } else {
2756         assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2757 
2758         SDLoc SL(In);
2759         SDValue Undef = SDValue(
2760           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2761                                  Lo.getValueType()), 0);
2762         auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2763                                     : AMDGPU::SReg_64RegClassID;
2764         const SDValue Ops[] = {
2765           CurDAG->getTargetConstant(RC, SL, MVT::i32),
2766           Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2767           Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2768 
2769         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2770                                              Src.getValueType(), Ops), 0);
2771       }
2772       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2773       return true;
2774     }
2775 
2776     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2777       uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2778                       .bitcastToAPInt().getZExtValue();
2779       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
2780         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);;
2781         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2782         return true;
2783       }
2784     }
2785 
2786     Mods = VecMods;
2787   }
2788 
2789   // Packed instructions do not have abs modifiers.
2790   Mods |= SISrcMods::OP_SEL_1;
2791 
2792   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2793   return true;
2794 }
2795 
2796 bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
2797                                             SDValue &SrcMods) const {
2798   return SelectVOP3PMods(In, Src, SrcMods, true);
2799 }
2800 
2801 bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
2802   const ConstantSDNode *C = cast<ConstantSDNode>(In);
2803   // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
2804   // 1 promotes packed values to signed, 0 treats them as unsigned.
2805   assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
2806 
2807   unsigned Mods = SISrcMods::OP_SEL_1;
2808   unsigned SrcSign = C->getAPIntValue().getZExtValue();
2809   if (SrcSign == 1)
2810     Mods ^= SISrcMods::NEG;
2811 
2812   Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2813   return true;
2814 }
2815 
2816 bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
2817                                                   SDValue &Src) const {
2818   const ConstantSDNode *C = cast<ConstantSDNode>(In);
2819   assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
2820 
2821   unsigned Mods = SISrcMods::OP_SEL_1;
2822   unsigned SrcVal = C->getAPIntValue().getZExtValue();
2823   if (SrcVal == 1)
2824     Mods |= SISrcMods::OP_SEL_0;
2825 
2826   Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2827   return true;
2828 }
2829 
2830 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2831                                          SDValue &SrcMods) const {
2832   Src = In;
2833   // FIXME: Handle op_sel
2834   SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2835   return true;
2836 }
2837 
2838 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2839                                              SDValue &SrcMods) const {
2840   // FIXME: Handle op_sel
2841   return SelectVOP3Mods(In, Src, SrcMods);
2842 }
2843 
2844 // The return value is not whether the match is possible (which it always is),
2845 // but whether or not it a conversion is really used.
2846 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
2847                                                    unsigned &Mods) const {
2848   Mods = 0;
2849   SelectVOP3ModsImpl(In, Src, Mods);
2850 
2851   if (Src.getOpcode() == ISD::FP_EXTEND) {
2852     Src = Src.getOperand(0);
2853     assert(Src.getValueType() == MVT::f16);
2854     Src = stripBitcast(Src);
2855 
2856     // Be careful about folding modifiers if we already have an abs. fneg is
2857     // applied last, so we don't want to apply an earlier fneg.
2858     if ((Mods & SISrcMods::ABS) == 0) {
2859       unsigned ModsTmp;
2860       SelectVOP3ModsImpl(Src, Src, ModsTmp);
2861 
2862       if ((ModsTmp & SISrcMods::NEG) != 0)
2863         Mods ^= SISrcMods::NEG;
2864 
2865       if ((ModsTmp & SISrcMods::ABS) != 0)
2866         Mods |= SISrcMods::ABS;
2867     }
2868 
2869     // op_sel/op_sel_hi decide the source type and source.
2870     // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
2871     // If the sources's op_sel is set, it picks the high half of the source
2872     // register.
2873 
2874     Mods |= SISrcMods::OP_SEL_1;
2875     if (isExtractHiElt(Src, Src)) {
2876       Mods |= SISrcMods::OP_SEL_0;
2877 
2878       // TODO: Should we try to look for neg/abs here?
2879     }
2880 
2881     return true;
2882   }
2883 
2884   return false;
2885 }
2886 
2887 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2888                                                SDValue &SrcMods) const {
2889   unsigned Mods = 0;
2890   SelectVOP3PMadMixModsImpl(In, Src, Mods);
2891   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2892   return true;
2893 }
2894 
2895 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2896   if (In.isUndef())
2897     return CurDAG->getUNDEF(MVT::i32);
2898 
2899   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2900     SDLoc SL(In);
2901     return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2902   }
2903 
2904   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2905     SDLoc SL(In);
2906     return CurDAG->getConstant(
2907       C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2908   }
2909 
2910   SDValue Src;
2911   if (isExtractHiElt(In, Src))
2912     return Src;
2913 
2914   return SDValue();
2915 }
2916 
2917 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
2918   assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
2919 
2920   const SIRegisterInfo *SIRI =
2921     static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
2922   const SIInstrInfo * SII =
2923     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2924 
2925   unsigned Limit = 0;
2926   bool AllUsesAcceptSReg = true;
2927   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
2928     Limit < 10 && U != E; ++U, ++Limit) {
2929     const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
2930 
2931     // If the register class is unknown, it could be an unknown
2932     // register class that needs to be an SGPR, e.g. an inline asm
2933     // constraint
2934     if (!RC || SIRI->isSGPRClass(RC))
2935       return false;
2936 
2937     if (RC != &AMDGPU::VS_32RegClass) {
2938       AllUsesAcceptSReg = false;
2939       SDNode * User = *U;
2940       if (User->isMachineOpcode()) {
2941         unsigned Opc = User->getMachineOpcode();
2942         MCInstrDesc Desc = SII->get(Opc);
2943         if (Desc.isCommutable()) {
2944           unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
2945           unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
2946           if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
2947             unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
2948             const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
2949             if (CommutedRC == &AMDGPU::VS_32RegClass)
2950               AllUsesAcceptSReg = true;
2951           }
2952         }
2953       }
2954       // If "AllUsesAcceptSReg == false" so far we haven't succeeded
2955       // commuting current user. This means have at least one use
2956       // that strictly require VGPR. Thus, we will not attempt to commute
2957       // other user instructions.
2958       if (!AllUsesAcceptSReg)
2959         break;
2960     }
2961   }
2962   return !AllUsesAcceptSReg && (Limit < 10);
2963 }
2964 
2965 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
2966   auto Ld = cast<LoadSDNode>(N);
2967 
2968   return Ld->getAlign() >= Align(4) &&
2969          (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
2970             Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
2971            !N->isDivergent()) ||
2972           (Subtarget->getScalarizeGlobalBehavior() &&
2973            Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
2974            Ld->isSimple() && !N->isDivergent() &&
2975            static_cast<const SITargetLowering *>(getTargetLowering())
2976                ->isMemOpHasNoClobberedMemOperand(N)));
2977 }
2978 
2979 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
2980   const AMDGPUTargetLowering& Lowering =
2981     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
2982   bool IsModified = false;
2983   do {
2984     IsModified = false;
2985 
2986     // Go over all selected nodes and try to fold them a bit more
2987     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
2988     while (Position != CurDAG->allnodes_end()) {
2989       SDNode *Node = &*Position++;
2990       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
2991       if (!MachineNode)
2992         continue;
2993 
2994       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
2995       if (ResNode != Node) {
2996         if (ResNode)
2997           ReplaceUses(Node, ResNode);
2998         IsModified = true;
2999       }
3000     }
3001     CurDAG->RemoveDeadNodes();
3002   } while (IsModified);
3003 }
3004