1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Defines an instruction selector for the AMDGPU target.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPU.h"
15 #include "AMDGPUTargetMachine.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
18 #include "llvm/Analysis/ValueTracking.h"
19 #include "llvm/CodeGen/FunctionLoweringInfo.h"
20 #include "llvm/CodeGen/SelectionDAG.h"
21 #include "llvm/CodeGen/SelectionDAGISel.h"
22 #include "llvm/CodeGen/SelectionDAGNodes.h"
23 #include "llvm/IR/IntrinsicsAMDGPU.h"
24 #include "llvm/InitializePasses.h"
25
26 #ifdef EXPENSIVE_CHECKS
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/IR/Dominators.h"
29 #endif
30
31 #define DEBUG_TYPE "isel"
32
33 using namespace llvm;
34
35 namespace llvm {
36
37 class R600InstrInfo;
38
39 } // end namespace llvm
40
41 //===----------------------------------------------------------------------===//
42 // Instruction Selector Implementation
43 //===----------------------------------------------------------------------===//
44
45 namespace {
46
static bool isNullConstantOrUndef(SDValue V) {
48 if (V.isUndef())
49 return true;
50
51 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
52 return Const != nullptr && Const->isNullValue();
53 }
54
static bool getConstantValue(SDValue N, uint32_t &Out) {
  // This is only used for packed vectors, where using 0 for undef should
  // always be good.
58 if (N.isUndef()) {
59 Out = 0;
60 return true;
61 }
62
63 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
64 Out = C->getAPIntValue().getSExtValue();
65 return true;
66 }
67
68 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
69 Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
70 return true;
71 }
72
73 return false;
74 }
75
76 // TODO: Handle undef as zero
static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
78 bool Negate = false) {
79 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
80 uint32_t LHSVal, RHSVal;
81 if (getConstantValue(N->getOperand(0), LHSVal) &&
82 getConstantValue(N->getOperand(1), RHSVal)) {
83 SDLoc SL(N);
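    // Pack the two 16-bit lanes into one 32-bit immediate: operand 0 occupies
    // bits [15:0] and operand 1 bits [31:16]; with Negate set, each lane is
    // negated before being packed.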
84 uint32_t K = Negate ?
85 (-LHSVal & 0xffff) | (-RHSVal << 16) :
86 (LHSVal & 0xffff) | (RHSVal << 16);
87 return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
88 DAG.getTargetConstant(K, SL, MVT::i32));
89 }
90
91 return nullptr;
92 }
93
static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
95 return packConstantV2I16(N, DAG, true);
96 }
97
98 /// AMDGPU specific code to select AMDGPU machine instructions for
99 /// SelectionDAG operations.
100 class AMDGPUDAGToDAGISel : public SelectionDAGISel {
101 // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
102 // make the right decision when generating code for different targets.
103 const GCNSubtarget *Subtarget;
104
105 // Default FP mode for the current function.
106 AMDGPU::SIModeRegisterDefaults Mode;
107
108 bool EnableLateStructurizeCFG;
109
110 public:
  explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
112 CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
113 : SelectionDAGISel(*TM, OptLevel) {
114 EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
115 }
116 ~AMDGPUDAGToDAGISel() override = default;
117
  void getAnalysisUsage(AnalysisUsage &AU) const override {
119 AU.addRequired<AMDGPUArgumentUsageInfo>();
120 AU.addRequired<LegacyDivergenceAnalysis>();
121 #ifdef EXPENSIVE_CHECKS
122 AU.addRequired<DominatorTreeWrapperPass>();
123 AU.addRequired<LoopInfoWrapperPass>();
124 #endif
125 SelectionDAGISel::getAnalysisUsage(AU);
126 }
127
128 bool matchLoadD16FromBuildVector(SDNode *N) const;
129
130 bool runOnMachineFunction(MachineFunction &MF) override;
131 void PreprocessISelDAG() override;
132 void Select(SDNode *N) override;
133 StringRef getPassName() const override;
134 void PostprocessISelDAG() override;
135
136 protected:
137 void SelectBuildVector(SDNode *N, unsigned RegClassID);
138
139 private:
140 std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
141 bool isNoNanSrc(SDValue N) const;
142 bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
  bool isNegInlineImmediate(const SDNode *N) const {
144 return isInlineImmediate(N, true);
145 }
146
  bool isInlineImmediate16(int64_t Imm) const {
148 return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
149 }
150
  bool isInlineImmediate32(int64_t Imm) const {
152 return AMDGPU::isInlinableLiteral32(Imm, Subtarget->hasInv2PiInlineImm());
153 }
154
  bool isInlineImmediate64(int64_t Imm) const {
156 return AMDGPU::isInlinableLiteral64(Imm, Subtarget->hasInv2PiInlineImm());
157 }
158
  bool isInlineImmediate(const APFloat &Imm) const {
160 return Subtarget->getInstrInfo()->isInlineConstant(Imm);
161 }
162
163 bool isVGPRImm(const SDNode *N) const;
164 bool isUniformLoad(const SDNode *N) const;
165 bool isUniformBr(const SDNode *N) const;
166
167 bool isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
168 SDValue &RHS) const;
169
170 MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
171
172 SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
173 SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
174 SDNode *glueCopyToM0LDSInit(SDNode *N) const;
175
176 const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
177 virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
178 virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
179 bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
180 bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
181 unsigned Size) const;
182 bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
183 bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
184 SDValue &Offset1) const;
185 bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
186 SDValue &Offset1) const;
187 bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
188 SDValue &Offset1, unsigned Size) const;
189 bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
190 SDValue &SOffset, SDValue &Offset, SDValue &Offen,
191 SDValue &Idxen, SDValue &Addr64) const;
192 bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
193 SDValue &SOffset, SDValue &Offset) const;
194 bool SelectMUBUFScratchOffen(SDNode *Parent,
195 SDValue Addr, SDValue &RSrc, SDValue &VAddr,
196 SDValue &SOffset, SDValue &ImmOffset) const;
197 bool SelectMUBUFScratchOffset(SDNode *Parent,
198 SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
199 SDValue &Offset) const;
200
201 bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
202 SDValue &Offset) const;
203
204 bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr,
205 SDValue &Offset, uint64_t FlatVariant) const;
206 bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
207 SDValue &Offset) const;
208 bool SelectGlobalOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
209 SDValue &Offset) const;
210 bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
211 SDValue &Offset) const;
212 bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
213 SDValue &VOffset, SDValue &Offset) const;
214 bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
215 SDValue &Offset) const;
216
217 bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
218 bool &Imm) const;
219 SDValue Expand32BitAddress(SDValue Addr) const;
220 bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
221 bool &Imm) const;
222 bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
223 bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
224 bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
225 bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
226 bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
227 bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
228
229 bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
230 bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
231 bool AllowAbs = true) const;
232 bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
233 bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
234 bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
235 bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
236 SDValue &Clamp, SDValue &Omod) const;
237 bool SelectVOP3BMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
238 SDValue &Clamp, SDValue &Omod) const;
239 bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
240 SDValue &Clamp, SDValue &Omod) const;
241
242 bool SelectVOP3OMods(SDValue In, SDValue &Src,
243 SDValue &Clamp, SDValue &Omod) const;
244
245 bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
246
247 bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
248
249 bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
250 bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
251 bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
252
253 SDValue getHi16Elt(SDValue In) const;
254
255 SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
256
257 void SelectADD_SUB_I64(SDNode *N);
258 void SelectAddcSubb(SDNode *N);
259 void SelectUADDO_USUBO(SDNode *N);
260 void SelectDIV_SCALE(SDNode *N);
261 void SelectMAD_64_32(SDNode *N);
262 void SelectFMA_W_CHAIN(SDNode *N);
263 void SelectFMUL_W_CHAIN(SDNode *N);
264
265 SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
266 uint32_t Offset, uint32_t Width);
267 void SelectS_BFEFromShifts(SDNode *N);
268 void SelectS_BFE(SDNode *N);
269 bool isCBranchSCC(const SDNode *N) const;
270 void SelectBRCOND(SDNode *N);
271 void SelectFMAD_FMA(SDNode *N);
272 void SelectATOMIC_CMP_SWAP(SDNode *N);
273 void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
274 void SelectDS_GWS(SDNode *N, unsigned IntrID);
275 void SelectInterpP1F16(SDNode *N);
276 void SelectINTRINSIC_W_CHAIN(SDNode *N);
277 void SelectINTRINSIC_WO_CHAIN(SDNode *N);
278 void SelectINTRINSIC_VOID(SDNode *N);
279
280 protected:
281 // Include the pieces autogenerated from the target description.
282 #include "AMDGPUGenDAGISel.inc"
283 };
284
285 class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
286 const R600Subtarget *Subtarget;
287
288 bool isConstantLoad(const MemSDNode *N, int cbID) const;
289 bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
290 bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
291 SDValue& Offset);
292 public:
  explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
294 AMDGPUDAGToDAGISel(TM, OptLevel) {}
295
296 void Select(SDNode *N) override;
297
298 bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
299 SDValue &Offset) override;
300 bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
301 SDValue &Offset) override;
302
303 bool runOnMachineFunction(MachineFunction &MF) override;
304
  void PreprocessISelDAG() override {}
306
307 protected:
308 // Include the pieces autogenerated from the target description.
309 #include "R600GenDAGISel.inc"
310 };
311
static SDValue stripBitcast(SDValue Val) {
313 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
314 }
315
316 // Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
318 In = stripBitcast(In);
319
320 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
321 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
322 if (!Idx->isOne())
323 return false;
324 Out = In.getOperand(0);
325 return true;
326 }
327 }
328
329 if (In.getOpcode() != ISD::TRUNCATE)
330 return false;
331
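  // Also recognize (trunc (srl X, 16)), which likewise yields the high 16 bits
  // of a 32-bit value.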
332 SDValue Srl = In.getOperand(0);
333 if (Srl.getOpcode() == ISD::SRL) {
334 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
335 if (ShiftAmt->getZExtValue() == 16) {
336 Out = stripBitcast(Srl.getOperand(0));
337 return true;
338 }
339 }
340 }
341
342 return false;
343 }
344
345 // Look through operations that obscure just looking at the low 16-bits of the
346 // same register.
static SDValue stripExtractLoElt(SDValue In) {
348 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
349 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
350 if (Idx->isNullValue() && In.getValueSizeInBits() <= 32)
351 return In.getOperand(0);
352 }
353 }
354
355 if (In.getOpcode() == ISD::TRUNCATE) {
356 SDValue Src = In.getOperand(0);
357 if (Src.getValueType().getSizeInBits() == 32)
358 return stripBitcast(Src);
359 }
360
361 return In;
362 }
363
364 } // end anonymous namespace
365
366 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
367 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
369 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
370 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
371 #ifdef EXPENSIVE_CHECKS
372 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
373 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
374 #endif
375 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
376 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
377
/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
380 FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
381 CodeGenOpt::Level OptLevel) {
382 return new AMDGPUDAGToDAGISel(TM, OptLevel);
383 }
384
/// This pass converts a legalized DAG into an R600-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
388 CodeGenOpt::Level OptLevel) {
389 return new R600DAGToDAGISel(TM, OptLevel);
390 }
391
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
393 #ifdef EXPENSIVE_CHECKS
394 DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
395 LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
396 for (auto &L : LI->getLoopsInPreorder()) {
397 assert(L->isLCSSAForm(DT));
398 }
399 #endif
400 Subtarget = &MF.getSubtarget<GCNSubtarget>();
401 Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
402 return SelectionDAGISel::runOnMachineFunction(MF);
403 }
404
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
406 assert(Subtarget->d16PreservesUnusedBits());
407 MVT VT = N->getValueType(0).getSimpleVT();
408 if (VT != MVT::v2i16 && VT != MVT::v2f16)
409 return false;
410
411 SDValue Lo = N->getOperand(0);
412 SDValue Hi = N->getOperand(1);
413
414 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
415
416 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
417 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
418 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
419
420 // Need to check for possible indirect dependencies on the other half of the
421 // vector to avoid introducing a cycle.
422 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
423 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
424
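    // The d16-hi load only writes the high half of the result; feed the
    // existing low half in as the tied input so it is preserved.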
425 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
426 SDValue Ops[] = {
427 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
428 };
429
430 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
431 if (LdHi->getMemoryVT() == MVT::i8) {
432 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
433 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
434 } else {
435 assert(LdHi->getMemoryVT() == MVT::i16);
436 }
437
438 SDValue NewLoadHi =
439 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
440 Ops, LdHi->getMemoryVT(),
441 LdHi->getMemOperand());
442
443 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
444 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
445 return true;
446 }
447
448 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
449 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
450 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
451 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
452 if (LdLo && Lo.hasOneUse()) {
453 SDValue TiedIn = getHi16Elt(Hi);
454 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
455 return false;
456
457 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
458 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
459 if (LdLo->getMemoryVT() == MVT::i8) {
460 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
461 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
462 } else {
463 assert(LdLo->getMemoryVT() == MVT::i16);
464 }
465
466 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
467
468 SDValue Ops[] = {
469 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
470 };
471
472 SDValue NewLoadLo =
473 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
474 Ops, LdLo->getMemoryVT(),
475 LdLo->getMemOperand());
476
477 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
478 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
479 return true;
480 }
481
482 return false;
483 }
484
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
486 if (!Subtarget->d16PreservesUnusedBits())
487 return;
488
489 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
490
491 bool MadeChange = false;
492 while (Position != CurDAG->allnodes_begin()) {
493 SDNode *N = &*--Position;
494 if (N->use_empty())
495 continue;
496
497 switch (N->getOpcode()) {
498 case ISD::BUILD_VECTOR:
499 MadeChange |= matchLoadD16FromBuildVector(N);
500 break;
501 default:
502 break;
503 }
504 }
505
506 if (MadeChange) {
507 CurDAG->RemoveDeadNodes();
508 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
509 CurDAG->dump(););
510 }
511 }
512
bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
514 if (TM.Options.NoNaNsFPMath)
515 return true;
516
517 // TODO: Move into isKnownNeverNaN
518 if (N->getFlags().hasNoNaNs())
519 return true;
520
521 return CurDAG->isKnownNeverNaN(N);
522 }
523
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
525 bool Negated) const {
526 if (N->isUndef())
527 return true;
528
529 const SIInstrInfo *TII = Subtarget->getInstrInfo();
530 if (Negated) {
531 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
532 return TII->isInlineConstant(-C->getAPIntValue());
533
534 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
535 return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
536
537 } else {
538 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
539 return TII->isInlineConstant(C->getAPIntValue());
540
541 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
542 return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
543 }
544
545 return false;
546 }
547
548 /// Determine the register class for \p OpNo
549 /// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
551 /// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
553 unsigned OpNo) const {
554 if (!N->isMachineOpcode()) {
555 if (N->getOpcode() == ISD::CopyToReg) {
556 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
557 if (Reg.isVirtual()) {
558 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
559 return MRI.getRegClass(Reg);
560 }
561
562 const SIRegisterInfo *TRI
563 = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
564 return TRI->getPhysRegClass(Reg);
565 }
566
567 return nullptr;
568 }
569
570 switch (N->getMachineOpcode()) {
571 default: {
572 const MCInstrDesc &Desc =
573 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
574 unsigned OpIdx = Desc.getNumDefs() + OpNo;
575 if (OpIdx >= Desc.getNumOperands())
576 return nullptr;
577 int RegClass = Desc.OpInfo[OpIdx].RegClass;
578 if (RegClass == -1)
579 return nullptr;
580
581 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
582 }
583 case AMDGPU::REG_SEQUENCE: {
584 unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
585 const TargetRegisterClass *SuperRC =
586 Subtarget->getRegisterInfo()->getRegClass(RCID);
587
588 SDValue SubRegOp = N->getOperand(OpNo + 1);
589 unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
590 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
591 SubRegIdx);
592 }
593 }
594 }
595
SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
597 SDValue Glue) const {
598 SmallVector <SDValue, 8> Ops;
599 Ops.push_back(NewChain); // Replace the chain.
600 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
601 Ops.push_back(N->getOperand(i));
602
603 Ops.push_back(Glue);
604 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
605 }
606
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
608 const SITargetLowering& Lowering =
609 *static_cast<const SITargetLowering*>(getTargetLowering());
610
611 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
612
613 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
614 return glueCopyToOp(N, M0, M0.getValue(1));
615 }
616
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
618 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
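  // When the subtarget still requires M0 to be initialized for LDS accesses,
  // set it to -1 (all bits set); region (GDS) accesses instead need M0 set to
  // the GDS size recorded for this function.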
619 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
620 if (Subtarget->ldsRequiresM0Init())
621 return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
622 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
623 MachineFunction &MF = CurDAG->getMachineFunction();
624 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
625 return
626 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
627 }
628 return N;
629 }
630
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
632 EVT VT) const {
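  // Materialize a 64-bit scalar immediate as two S_MOV_B32s of the low and
  // high halves, combined into a 64-bit SGPR pair with REG_SEQUENCE.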
633 SDNode *Lo = CurDAG->getMachineNode(
634 AMDGPU::S_MOV_B32, DL, MVT::i32,
635 CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
636 SDNode *Hi =
637 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
638 CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
639 const SDValue Ops[] = {
640 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
641 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
642 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
643
644 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
645 }
646
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
648 EVT VT = N->getValueType(0);
649 unsigned NumVectorElts = VT.getVectorNumElements();
650 EVT EltVT = VT.getVectorElementType();
651 SDLoc DL(N);
652 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
653
654 if (NumVectorElts == 1) {
655 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
656 RegClass);
657 return;
658 }
659
660 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
661 "supported yet");
662 // 32 = Max Num Vector Elements
663 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
664 // 1 = Vector Register Class
665 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
666
667 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
668 Triple::amdgcn;
669 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
670 bool IsRegSeq = true;
671 unsigned NOps = N->getNumOperands();
672 for (unsigned i = 0; i < NOps; i++) {
673 // XXX: Why is this here?
674 if (isa<RegisterSDNode>(N->getOperand(i))) {
675 IsRegSeq = false;
676 break;
677 }
678 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
679 : R600RegisterInfo::getSubRegFromChannel(i);
680 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
681 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
682 }
683 if (NOps != NumVectorElts) {
684 // Fill in the missing undef elements if this was a scalar_to_vector.
685 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
686 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
687 DL, EltVT);
688 for (unsigned i = NOps; i < NumVectorElts; ++i) {
689 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
690 : R600RegisterInfo::getSubRegFromChannel(i);
691 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
692 RegSeqArgs[1 + (2 * i) + 1] =
693 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
694 }
695 }
696
697 if (!IsRegSeq)
698 SelectCode(N);
699 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
700 }
701
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
703 unsigned int Opc = N->getOpcode();
704 if (N->isMachineOpcode()) {
705 N->setNodeId(-1);
706 return; // Already selected.
707 }
708
709 // isa<MemSDNode> almost works but is slightly too permissive for some DS
710 // intrinsics.
711 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
712 (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
713 Opc == ISD::ATOMIC_LOAD_FADD ||
714 Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
715 Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
716 N = glueCopyToM0LDSInit(N);
717 SelectCode(N);
718 return;
719 }
720
721 switch (Opc) {
722 default:
723 break;
  // We are selecting i64 ADD here instead of custom lowering it during
725 // DAG legalization, so we can fold some i64 ADDs used for address
726 // calculation into the LOAD and STORE instructions.
727 case ISD::ADDC:
728 case ISD::ADDE:
729 case ISD::SUBC:
730 case ISD::SUBE: {
731 if (N->getValueType(0) != MVT::i64)
732 break;
733
734 SelectADD_SUB_I64(N);
735 return;
736 }
737 case ISD::ADDCARRY:
738 case ISD::SUBCARRY:
739 if (N->getValueType(0) != MVT::i32)
740 break;
741
742 SelectAddcSubb(N);
743 return;
744 case ISD::UADDO:
745 case ISD::USUBO: {
746 SelectUADDO_USUBO(N);
747 return;
748 }
749 case AMDGPUISD::FMUL_W_CHAIN: {
750 SelectFMUL_W_CHAIN(N);
751 return;
752 }
753 case AMDGPUISD::FMA_W_CHAIN: {
754 SelectFMA_W_CHAIN(N);
755 return;
756 }
757
758 case ISD::SCALAR_TO_VECTOR:
759 case ISD::BUILD_VECTOR: {
760 EVT VT = N->getValueType(0);
761 unsigned NumVectorElts = VT.getVectorNumElements();
762 if (VT.getScalarSizeInBits() == 16) {
763 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
764 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
765 ReplaceNode(N, Packed);
766 return;
767 }
768 }
769
770 break;
771 }
772
773 assert(VT.getVectorElementType().bitsEq(MVT::i32));
774 unsigned RegClassID =
775 SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
776 SelectBuildVector(N, RegClassID);
777 return;
778 }
779 case ISD::BUILD_PAIR: {
780 SDValue RC, SubReg0, SubReg1;
781 SDLoc DL(N);
782 if (N->getValueType(0) == MVT::i128) {
783 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
784 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
785 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
786 } else if (N->getValueType(0) == MVT::i64) {
787 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
788 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
789 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
790 } else {
791 llvm_unreachable("Unhandled value type for BUILD_PAIR");
792 }
793 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
794 N->getOperand(1), SubReg1 };
795 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
796 N->getValueType(0), Ops));
797 return;
798 }
799
800 case ISD::Constant:
801 case ISD::ConstantFP: {
802 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
803 break;
804
805 uint64_t Imm;
806 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
807 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
808 else {
809 ConstantSDNode *C = cast<ConstantSDNode>(N);
810 Imm = C->getZExtValue();
811 }
812
813 SDLoc DL(N);
814 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
815 return;
816 }
817 case AMDGPUISD::BFE_I32:
818 case AMDGPUISD::BFE_U32: {
819 // There is a scalar version available, but unlike the vector version which
    // has separate operands for the offset and width, the scalar version packs
821 // the width and offset into a single operand. Try to move to the scalar
822 // version if the offsets are constant, so that we can try to keep extended
823 // loads of kernel arguments in SGPRs.
824
825 // TODO: Technically we could try to pattern match scalar bitshifts of
826 // dynamic values, but it's probably not useful.
827 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
828 if (!Offset)
829 break;
830
831 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
832 if (!Width)
833 break;
834
835 bool Signed = Opc == AMDGPUISD::BFE_I32;
836
837 uint32_t OffsetVal = Offset->getZExtValue();
838 uint32_t WidthVal = Width->getZExtValue();
839
840 ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
841 SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
842 return;
843 }
844 case AMDGPUISD::DIV_SCALE: {
845 SelectDIV_SCALE(N);
846 return;
847 }
848 case AMDGPUISD::MAD_I64_I32:
849 case AMDGPUISD::MAD_U64_U32: {
850 SelectMAD_64_32(N);
851 return;
852 }
853 case ISD::CopyToReg: {
854 const SITargetLowering& Lowering =
855 *static_cast<const SITargetLowering*>(getTargetLowering());
856 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
857 break;
858 }
859 case ISD::AND:
860 case ISD::SRL:
861 case ISD::SRA:
862 case ISD::SIGN_EXTEND_INREG:
863 if (N->getValueType(0) != MVT::i32)
864 break;
865
866 SelectS_BFE(N);
867 return;
868 case ISD::BRCOND:
869 SelectBRCOND(N);
870 return;
871 case ISD::FMAD:
872 case ISD::FMA:
873 SelectFMAD_FMA(N);
874 return;
875 case AMDGPUISD::ATOMIC_CMP_SWAP:
876 SelectATOMIC_CMP_SWAP(N);
877 return;
878 case AMDGPUISD::CVT_PKRTZ_F16_F32:
879 case AMDGPUISD::CVT_PKNORM_I16_F32:
880 case AMDGPUISD::CVT_PKNORM_U16_F32:
881 case AMDGPUISD::CVT_PK_U16_U32:
882 case AMDGPUISD::CVT_PK_I16_I32: {
883 // Hack around using a legal type if f16 is illegal.
884 if (N->getValueType(0) == MVT::i32) {
885 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
886 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
887 { N->getOperand(0), N->getOperand(1) });
888 SelectCode(N);
889 return;
890 }
891
892 break;
893 }
894 case ISD::INTRINSIC_W_CHAIN: {
895 SelectINTRINSIC_W_CHAIN(N);
896 return;
897 }
898 case ISD::INTRINSIC_WO_CHAIN: {
899 SelectINTRINSIC_WO_CHAIN(N);
900 return;
901 }
902 case ISD::INTRINSIC_VOID: {
903 SelectINTRINSIC_VOID(N);
904 return;
905 }
906 }
907
908 SelectCode(N);
909 }
910
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
912 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
913 const Instruction *Term = BB->getTerminator();
914 return Term->getMetadata("amdgpu.uniform") ||
915 Term->getMetadata("structurizecfg.uniform");
916 }
917
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
919 SDValue &N0, SDValue &N1) {
920 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
921 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
923 // (i64 (bitcast (v2i32 (build_vector
924 // (or (extract_vector_elt V, 0), OFFSET),
925 // (extract_vector_elt V, 1)))))
926 SDValue Lo = Addr.getOperand(0).getOperand(0);
927 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
928 SDValue BaseLo = Lo.getOperand(0);
929 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that the split halves (Lo and Hi) are extracted from the same base.
931 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
932 BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
933 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
934 // Lo is statically extracted from index 0.
935 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
936 BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
938 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
939 BaseHi.getConstantOperandVal(1) == 1) {
940 N0 = BaseLo.getOperand(0).getOperand(0);
941 N1 = Lo.getOperand(1);
942 return true;
943 }
944 }
945 }
946 return false;
947 }
948
bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
950 SDValue &RHS) const {
951 if (CurDAG->isBaseWithConstantOffset(Addr)) {
952 LHS = Addr.getOperand(0);
953 RHS = Addr.getOperand(1);
954 return true;
955 }
956
957 if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
958 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
959 return true;
960 }
961
962 return false;
963 }
964
StringRef AMDGPUDAGToDAGISel::getPassName() const {
966 return "AMDGPU DAG->DAG Pattern Instruction Selection";
967 }
968
969 //===----------------------------------------------------------------------===//
970 // Complex Patterns
971 //===----------------------------------------------------------------------===//
972
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
974 SDValue &Offset) {
975 return false;
976 }
977
bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
979 SDValue &Offset) {
980 ConstantSDNode *C;
981 SDLoc DL(Addr);
982
983 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
984 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
985 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
986 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
987 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
988 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
989 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
990 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
991 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
992 Base = Addr.getOperand(0);
993 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
994 } else {
995 Base = Addr;
996 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
997 }
998
999 return true;
1000 }
1001
SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1003 const SDLoc &DL) const {
1004 SDNode *Mov = CurDAG->getMachineNode(
1005 AMDGPU::S_MOV_B32, DL, MVT::i32,
1006 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1007 return SDValue(Mov, 0);
1008 }
1009
1010 // FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
1012 SDLoc DL(N);
1013 SDValue LHS = N->getOperand(0);
1014 SDValue RHS = N->getOperand(1);
1015
1016 unsigned Opcode = N->getOpcode();
1017 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
1018 bool ProduceCarry =
1019 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
1020 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
1021
1022 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1023 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1024
1025 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1026 DL, MVT::i32, LHS, Sub0);
1027 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1028 DL, MVT::i32, LHS, Sub1);
1029
1030 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1031 DL, MVT::i32, RHS, Sub0);
1032 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1033 DL, MVT::i32, RHS, Sub1);
1034
1035 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
1036
1037 static const unsigned OpcMap[2][2][2] = {
1038 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
1039 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
1040 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
1041 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
1042
1043 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
1044 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
1045
1046 SDNode *AddLo;
1047 if (!ConsumeCarry) {
1048 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
1049 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
1050 } else {
1051 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1052 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1053 }
1054 SDValue AddHiArgs[] = {
1055 SDValue(Hi0, 0),
1056 SDValue(Hi1, 0),
1057 SDValue(AddLo, 1)
1058 };
1059 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1060
1061 SDValue RegSequenceArgs[] = {
1062 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1063 SDValue(AddLo,0),
1064 Sub0,
1065 SDValue(AddHi,0),
1066 Sub1,
1067 };
1068 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1069 MVT::i64, RegSequenceArgs);
1070
1071 if (ProduceCarry) {
1072 // Replace the carry-use
1073 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1074 }
1075
1076 // Replace the remaining uses.
1077 ReplaceNode(N, RegSequence);
1078 }
1079
void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1081 SDLoc DL(N);
1082 SDValue LHS = N->getOperand(0);
1083 SDValue RHS = N->getOperand(1);
1084 SDValue CI = N->getOperand(2);
1085
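  // Divergent carry chains go to the VALU add/sub-with-carry instructions;
  // uniform ones use the scalar carry pseudos.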
1086 if (N->isDivergent()) {
1087 unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
1088 : AMDGPU::V_SUBB_U32_e64;
1089 CurDAG->SelectNodeTo(
1090 N, Opc, N->getVTList(),
1091 {LHS, RHS, CI,
1092 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1093 } else {
1094 unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
1095 : AMDGPU::S_SUB_CO_PSEUDO;
1096 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1097 }
1098 }
1099
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1102 // carry out despite the _i32 name. These were renamed in VI to _U32.
1103 // FIXME: We should probably rename the opcodes here.
1104 bool IsAdd = N->getOpcode() == ISD::UADDO;
1105 bool IsVALU = N->isDivergent();
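  // If the carry output is used by anything other than ADDCARRY/SUBCARRY,
  // select the VALU form even when the value is uniform.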
1106
1107 for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
1108 ++UI)
1109 if (UI.getUse().getResNo() == 1) {
1110 if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
1111 (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
1112 IsVALU = true;
1113 break;
1114 }
1115 }
1116
1117 if (IsVALU) {
1118 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1119
1120 CurDAG->SelectNodeTo(
1121 N, Opc, N->getVTList(),
1122 {N->getOperand(0), N->getOperand(1),
1123 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1124 } else {
1125 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1126 : AMDGPU::S_USUBO_PSEUDO;
1127
1128 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1129 {N->getOperand(0), N->getOperand(1)});
1130 }
1131 }
1132
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1134 SDLoc SL(N);
1135 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1136 SDValue Ops[10];
1137
1138 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1139 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1140 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1141 Ops[8] = N->getOperand(0);
1142 Ops[9] = N->getOperand(4);
1143
1144 CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32_e64, N->getVTList(), Ops);
1145 }
1146
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1148 SDLoc SL(N);
1149 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1150 SDValue Ops[8];
1151
1152 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1153 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1154 Ops[6] = N->getOperand(0);
1155 Ops[7] = N->getOperand(3);
1156
1157 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1158 }
1159
1160 // We need to handle this here because tablegen doesn't support matching
1161 // instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1163 SDLoc SL(N);
1164 EVT VT = N->getValueType(0);
1165
1166 assert(VT == MVT::f32 || VT == MVT::f64);
1167
1168 unsigned Opc
1169 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1170
1171 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1172 // omod
1173 SDValue Ops[8];
1174 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1175 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1176 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1177 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1178 }
1179
1180 // We need to handle this here because tablegen doesn't support matching
1181 // instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1183 SDLoc SL(N);
1184 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1185 unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1186
1187 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1188 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1189 Clamp };
1190 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1191 }
1192
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1194 if (!isUInt<16>(Offset))
1195 return false;
1196
1197 if (!Base || Subtarget->hasUsableDSOffset() ||
1198 Subtarget->unsafeDSOffsetFoldingEnabled())
1199 return true;
1200
  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
1203 return CurDAG->SignBitIsZero(Base);
1204 }
1205
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1207 SDValue &Offset) const {
1208 SDLoc DL(Addr);
1209 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1210 SDValue N0 = Addr.getOperand(0);
1211 SDValue N1 = Addr.getOperand(1);
1212 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1213 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1214 // (add n0, c0)
1215 Base = N0;
1216 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1217 return true;
1218 }
1219 } else if (Addr.getOpcode() == ISD::SUB) {
1220 // sub C, x -> add (sub 0, x), C
1221 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1222 int64_t ByteOffset = C->getSExtValue();
1223 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1224 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1225
1226 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1227 // the known bits in isDSOffsetLegal. We need to emit the selected node
1228 // here, so this is thrown away.
1229 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1230 Zero, Addr.getOperand(1));
1231
1232 if (isDSOffsetLegal(Sub, ByteOffset)) {
1233 SmallVector<SDValue, 3> Opnds;
1234 Opnds.push_back(Zero);
1235 Opnds.push_back(Addr.getOperand(1));
1236
1237 // FIXME: Select to VOP3 version for with-carry.
1238 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1239 if (Subtarget->hasAddNoCarry()) {
1240 SubOp = AMDGPU::V_SUB_U32_e64;
1241 Opnds.push_back(
1242 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1243 }
1244
1245 MachineSDNode *MachineSub =
1246 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1247
1248 Base = SDValue(MachineSub, 0);
1249 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1250 return true;
1251 }
1252 }
1253 }
1254 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1255 // If we have a constant address, prefer to put the constant into the
1256 // offset. This can save moves to load the constant address since multiple
1257 // operations can share the zero base address register, and enables merging
1258 // into read2 / write2 instructions.
1259
1260 SDLoc DL(Addr);
1261
1262 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1263 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1264 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1265 DL, MVT::i32, Zero);
1266 Base = SDValue(MovZero, 0);
1267 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1268 return true;
1269 }
1270 }
1271
1272 // default case
1273 Base = Addr;
1274 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1275 return true;
1276 }
1277
bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1279 unsigned Offset1,
1280 unsigned Size) const {
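  // The two offsets are encoded as 8-bit fields in units of the access size,
  // so each byte offset must be a multiple of Size and fit in 8 bits once
  // scaled.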
1281 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1282 return false;
1283 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1284 return false;
1285
1286 if (!Base || Subtarget->hasUsableDSOffset() ||
1287 Subtarget->unsafeDSOffsetFoldingEnabled())
1288 return true;
1289
  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
1292 return CurDAG->SignBitIsZero(Base);
1293 }
1294
1295 // TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1297 SDValue &Offset0,
1298 SDValue &Offset1) const {
1299 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1300 }
1301
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1303 SDValue &Offset0,
1304 SDValue &Offset1) const {
1305 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1306 }
1307
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1309 SDValue &Offset0, SDValue &Offset1,
1310 unsigned Size) const {
1311 SDLoc DL(Addr);
1312
1313 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1314 SDValue N0 = Addr.getOperand(0);
1315 SDValue N1 = Addr.getOperand(1);
1316 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1317 unsigned OffsetValue0 = C1->getZExtValue();
1318 unsigned OffsetValue1 = OffsetValue0 + Size;
1319
1320 // (add n0, c0)
1321 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1322 Base = N0;
1323 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1324 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1325 return true;
1326 }
1327 } else if (Addr.getOpcode() == ISD::SUB) {
1328 // sub C, x -> add (sub 0, x), C
1329 if (const ConstantSDNode *C =
1330 dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1331 unsigned OffsetValue0 = C->getZExtValue();
1332 unsigned OffsetValue1 = OffsetValue0 + Size;
1333
1334 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1335 SDLoc DL(Addr);
1336 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1337
1338 // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffset2Legal. We need to emit the selected node
1340 // here, so this is thrown away.
1341 SDValue Sub =
1342 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1343
1344 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1345 SmallVector<SDValue, 3> Opnds;
1346 Opnds.push_back(Zero);
1347 Opnds.push_back(Addr.getOperand(1));
1348 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1349 if (Subtarget->hasAddNoCarry()) {
1350 SubOp = AMDGPU::V_SUB_U32_e64;
1351 Opnds.push_back(
1352 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1353 }
1354
1355 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1356 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1357
1358 Base = SDValue(MachineSub, 0);
1359 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1360 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1361 return true;
1362 }
1363 }
1364 }
1365 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1366 unsigned OffsetValue0 = CAddr->getZExtValue();
1367 unsigned OffsetValue1 = OffsetValue0 + Size;
1368
1369 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1370 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1371 MachineSDNode *MovZero =
1372 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1373 Base = SDValue(MovZero, 0);
1374 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1375 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1376 return true;
1377 }
1378 }
1379
1380 // default case
1381
1382 Base = Addr;
1383 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1384 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1385 return true;
1386 }
1387
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1389 SDValue &SOffset, SDValue &Offset,
1390 SDValue &Offen, SDValue &Idxen,
1391 SDValue &Addr64) const {
  // Subtarget prefers to use flat instructions
1393 // FIXME: This should be a pattern predicate and not reach here
1394 if (Subtarget->useFlatForGlobal())
1395 return false;
1396
1397 SDLoc DL(Addr);
1398
1399 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1400 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1401 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1402 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1403
1404 ConstantSDNode *C1 = nullptr;
1405 SDValue N0 = Addr;
1406 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1407 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1408 if (isUInt<32>(C1->getZExtValue()))
1409 N0 = Addr.getOperand(0);
1410 else
1411 C1 = nullptr;
1412 }
1413
1414 if (N0.getOpcode() == ISD::ADD) {
1415 // (add N2, N3) -> addr64, or
1416 // (add (add N2, N3), C1) -> addr64
1417 SDValue N2 = N0.getOperand(0);
1418 SDValue N3 = N0.getOperand(1);
1419 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1420
1421 if (N2->isDivergent()) {
1422 if (N3->isDivergent()) {
1423 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1424 // addr64, and construct the resource from a 0 address.
1425 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1426 VAddr = N0;
1427 } else {
1428 // N2 is divergent, N3 is not.
1429 Ptr = N3;
1430 VAddr = N2;
1431 }
1432 } else {
1433 // N2 is not divergent.
1434 Ptr = N2;
1435 VAddr = N3;
1436 }
1437 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1438 } else if (N0->isDivergent()) {
1439 // N0 is divergent. Use it as the addr64, and construct the resource from a
1440 // 0 address.
1441 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1442 VAddr = N0;
1443 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1444 } else {
1445 // N0 -> offset, or
1446 // (N0 + C1) -> offset
1447 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1448 Ptr = N0;
1449 }
1450
1451 if (!C1) {
1452 // No offset.
1453 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1454 return true;
1455 }
1456
1457 if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
1458 // Legal offset for instruction.
1459 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1460 return true;
1461 }
1462
1463 // Illegal offset, store it in soffset.
1464 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1465 SOffset =
1466 SDValue(CurDAG->getMachineNode(
1467 AMDGPU::S_MOV_B32, DL, MVT::i32,
1468 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1469 0);
1470 return true;
1471 }
1472
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1474 SDValue &VAddr, SDValue &SOffset,
1475 SDValue &Offset) const {
1476 SDValue Ptr, Offen, Idxen, Addr64;
1477
1478 // addr64 bit was removed for volcanic islands.
1479 // FIXME: This should be a pattern predicate and not reach here
1480 if (!Subtarget->hasAddr64())
1481 return false;
1482
1483 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1484 return false;
1485
1486 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1487 if (C->getSExtValue()) {
1488 SDLoc DL(Addr);
1489
1490 const SITargetLowering& Lowering =
1491 *static_cast<const SITargetLowering*>(getTargetLowering());
1492
1493 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1494 return true;
1495 }
1496
1497 return false;
1498 }
1499
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
1501 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
1502 return PSV && PSV->isStack();
1503 }
1504
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1506 SDLoc DL(N);
1507
1508 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1509 SDValue TFI =
1510 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1511
  // We rebase the base address into an absolute stack address, and hence
  // use constant 0 for soffset. This value must be retained until frame
  // elimination, when eliminateFrameIndex will choose the appropriate frame
  // register if need be.
1516 return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1517 }
1518
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1520 SDValue Addr, SDValue &Rsrc,
1521 SDValue &VAddr, SDValue &SOffset,
1522 SDValue &ImmOffset) const {
1523
1524 SDLoc DL(Addr);
1525 MachineFunction &MF = CurDAG->getMachineFunction();
1526 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1527
1528 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1529
1530 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1531 int64_t Imm = CAddr->getSExtValue();
1532 const int64_t NullPtr =
1533 AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1534 // Don't fold null pointer.
1535 if (Imm != NullPtr) {
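// Illustrative split, assuming the 12-bit MUBUF immediate field implied by
// the 4095 mask below: a constant address of 0x12345 would be selected as
// HighBits = 0x12000 materialized into VAddr plus ImmOffset = 0x345.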
1536 SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
1537 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1538 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1539 VAddr = SDValue(MovHighBits, 0);
1540
1541 // In a call sequence, stores to the argument stack area are relative to the
1542 // stack pointer.
1543 const MachinePointerInfo &PtrInfo
1544 = cast<MemSDNode>(Parent)->getPointerInfo();
1545 SOffset = isStackPtrRelative(PtrInfo)
1546 ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
1547 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1548 ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
1549 return true;
1550 }
1551 }
1552
1553 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1554 // (add n0, c1)
1555
1556 SDValue N0 = Addr.getOperand(0);
1557 SDValue N1 = Addr.getOperand(1);
1558
1559 // Offsets in vaddr must be positive if range checking is enabled.
1560 //
1561 // The total computation of vaddr + soffset + offset must not overflow. If
1562 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1563 // overflowing.
1564 //
1565 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1566 // always perform a range check. If a negative vaddr base index was used,
1567 // this would fail the range check. The overall address computation would
1568 // still produce a valid address, but the range check rejects it. For
1569 // out-of-bounds MUBUF loads, a 0 is returned.
1570 //
1571 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1572 // MUBUF vaddr, but not on older subtargets which can only do this if the
1573 // sign bit is known 0.
1574 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1575 if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
1576 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1577 CurDAG->SignBitIsZero(N0))) {
1578 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1579 ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1580 return true;
1581 }
1582 }
1583
1584 // (node)
1585 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1586 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1587 return true;
1588 }
1589
1590 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1591 SDValue Addr,
1592 SDValue &SRsrc,
1593 SDValue &SOffset,
1594 SDValue &Offset) const {
1595 ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
1596 if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1597 return false;
1598
1599 SDLoc DL(Addr);
1600 MachineFunction &MF = CurDAG->getMachineFunction();
1601 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1602
1603 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1604
1605 const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
1606
1607 // FIXME: Get from MachinePointerInfo? We should only be using the frame
1608 // offset if we know this is in a call sequence.
1609 SOffset = isStackPtrRelative(PtrInfo)
1610 ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
1611 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1612
1613 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1614 return true;
1615 }
1616
1617 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1618 SDValue &SOffset, SDValue &Offset
1619 ) const {
1620 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1621 const SIInstrInfo *TII =
1622 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1623
1624 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1625 return false;
1626
1627 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1628 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1629 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1630 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1631 APInt::getAllOnesValue(32).getZExtValue(); // Size
1632 SDLoc DL(Addr);
1633
1634 const SITargetLowering& Lowering =
1635 *static_cast<const SITargetLowering*>(getTargetLowering());
1636
1637 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1638 return true;
1639 }
1640 return false;
1641 }
1642
1643 // Find a load or store from the corresponding pattern root.
1644 // Roots may be build_vector, bitconvert, or combinations of them.
1645 static MemSDNode* findMemSDNode(SDNode *N) {
1646 N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1647 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1648 return MN;
1649 assert(isa<BuildVectorSDNode>(N));
1650 for (SDValue V : N->op_values())
1651 if (MemSDNode *MN =
1652 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1653 return MN;
1654 llvm_unreachable("cannot find MemSDNode in the pattern!");
1655 }
1656
1657 bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1658 SDValue &VAddr, SDValue &Offset,
1659 uint64_t FlatVariant) const {
1660 int64_t OffsetVal = 0;
1661
1662 unsigned AS = findMemSDNode(N)->getAddressSpace();
1663
1664 bool CanHaveFlatSegmentOffsetBug =
1665 Subtarget->hasFlatSegmentOffsetBug() &&
1666 FlatVariant == SIInstrFlags::FLAT &&
1667 (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1668
1669 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1670 SDValue N0, N1;
1671 if (isBaseWithConstantOffset64(Addr, N0, N1)) {
1672 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1673
1674 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1675 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1676 Addr = N0;
1677 OffsetVal = COffsetVal;
1678 } else {
1679 // If the offset doesn't fit, put the low bits into the offset field and
1680 // add the rest.
1681 //
1682 // For a FLAT instruction the hardware decides whether to access
1683 // global/scratch/shared memory based on the high bits of vaddr,
1684 // ignoring the offset field, so we have to ensure that when we add
1685 // remainder to vaddr it still points into the same underlying object.
1686 // The easiest way to do that is to make sure that we split the offset
1687 // into two pieces that are both >= 0 or both <= 0.
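// As a hedged illustration (assuming a 13-bit signed offset field, which
// splitFlatOffset may use depending on the subtarget): an offset of -0x1830
// could split into an immediate of -0x830 plus a remainder of -0x1000, so
// both pieces stay negative and base + remainder stays within the object.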
1688
1689 SDLoc DL(N);
1690 uint64_t RemainderOffset;
1691
1692 std::tie(OffsetVal, RemainderOffset) =
1693 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1694
1695 SDValue AddOffsetLo =
1696 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1697 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1698
1699 if (Addr.getValueType().getSizeInBits() == 32) {
1700 SmallVector<SDValue, 3> Opnds;
1701 Opnds.push_back(N0);
1702 Opnds.push_back(AddOffsetLo);
1703 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1704 if (Subtarget->hasAddNoCarry()) {
1705 AddOp = AMDGPU::V_ADD_U32_e64;
1706 Opnds.push_back(Clamp);
1707 }
1708 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1709 } else {
1710 // TODO: Should this try to use a scalar add pseudo if the base address
1711 // is uniform and saddr is usable?
1712 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1713 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1714
1715 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1716 DL, MVT::i32, N0, Sub0);
1717 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1718 DL, MVT::i32, N0, Sub1);
1719
1720 SDValue AddOffsetHi =
1721 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1722
1723 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1724
1725 SDNode *Add =
1726 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1727 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1728
1729 SDNode *Addc = CurDAG->getMachineNode(
1730 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1731 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1732
1733 SDValue RegSequenceArgs[] = {
1734 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1735 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1736
1737 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1738 MVT::i64, RegSequenceArgs),
1739 0);
1740 }
1741 }
1742 }
1743 }
1744
1745 VAddr = Addr;
1746 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
1747 return true;
1748 }
1749
1750 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1751 SDValue &VAddr,
1752 SDValue &Offset) const {
1753 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1754 }
1755
1756 bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1757 SDValue &VAddr,
1758 SDValue &Offset) const {
1759 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1760 }
1761
1762 bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1763 SDValue &VAddr,
1764 SDValue &Offset) const {
1765 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1766 SIInstrFlags::FlatScratch);
1767 }
1768
1769 // If this matches zero_extend i32:x, return x
1770 static SDValue matchZExtFromI32(SDValue Op) {
1771 if (Op.getOpcode() != ISD::ZERO_EXTEND)
1772 return SDValue();
1773
1774 SDValue ExtSrc = Op.getOperand(0);
1775 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1776 }
1777
1778 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1779 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1780 SDValue Addr,
1781 SDValue &SAddr,
1782 SDValue &VOffset,
1783 SDValue &Offset) const {
1784 int64_t ImmOffset = 0;
1785
1786 // Match the immediate offset first, which canonically is moved as low as
1787 // possible.
1788
1789 SDValue LHS, RHS;
1790 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1791 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1792 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1793
1794 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1795 SIInstrFlags::FlatGlobal)) {
1796 Addr = LHS;
1797 ImmOffset = COffsetVal;
1798 } else if (!LHS->isDivergent()) {
1799 if (COffsetVal > 0) {
1800 SDLoc SL(N);
1801 // saddr + large_offset -> saddr +
1802 // (voffset = large_offset & ~MaxOffset) +
1803 // (large_offset & MaxOffset);
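// For illustration only (the field width is target-dependent): with a
// 13-bit signed immediate field, large_offset = 0x1234 would split into
// SplitImmOffset = 0x234 and RemainderOffset = 0x1000 moved into VOffset.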
1804 int64_t SplitImmOffset, RemainderOffset;
1805 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1806 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1807
1808 if (isUInt<32>(RemainderOffset)) {
1809 SDNode *VMov = CurDAG->getMachineNode(
1810 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1811 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1812 VOffset = SDValue(VMov, 0);
1813 SAddr = LHS;
1814 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1815 return true;
1816 }
1817 }
1818
1819 // We are adding a 64 bit SGPR and a constant. If constant bus limit
1820 // is 1 we would need to perform 1 or 2 extra moves for each half of
1821 // the constant and it is better to do a scalar add and then issue a
1822 // single VALU instruction to materialize zero. Otherwise it is less
1823 // instructions to perform VALU adds with immediates or inline literals.
1824 unsigned NumLiterals =
1825 !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1826 !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1827 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1828 return false;
1829 }
1830 }
1831
1832 // Match the variable offset.
1833 if (Addr.getOpcode() == ISD::ADD) {
1834 LHS = Addr.getOperand(0);
1835 RHS = Addr.getOperand(1);
1836
1837 if (!LHS->isDivergent()) {
1838 // add (i64 sgpr), (zero_extend (i32 vgpr))
1839 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1840 SAddr = LHS;
1841 VOffset = ZextRHS;
1842 }
1843 }
1844
1845 if (!SAddr && !RHS->isDivergent()) {
1846 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1847 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1848 SAddr = RHS;
1849 VOffset = ZextLHS;
1850 }
1851 }
1852
1853 if (SAddr) {
1854 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1855 return true;
1856 }
1857 }
1858
1859 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1860 isa<ConstantSDNode>(Addr))
1861 return false;
1862
1863 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1864 // moves required to copy a 64-bit SGPR to VGPR.
1865 SAddr = Addr;
1866 SDNode *VMov =
1867 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1868 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1869 VOffset = SDValue(VMov, 0);
1870 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1871 return true;
1872 }
1873
1874 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1875 if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1876 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1877 } else if (SAddr.getOpcode() == ISD::ADD &&
1878 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1879 // Materialize this into a scalar move for scalar address to avoid
1880 // readfirstlane.
1881 auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1882 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1883 FI->getValueType(0));
1884 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
1885 MVT::i32, TFI, SAddr.getOperand(1)),
1886 0);
1887 }
1888
1889 return SAddr;
1890 }
1891
1892 // Match (32-bit SGPR base) + sext(imm offset)
1893 bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
1894 SDValue Addr,
1895 SDValue &SAddr,
1896 SDValue &Offset) const {
1897 if (Addr->isDivergent())
1898 return false;
1899
1900 SAddr = Addr;
1901 int64_t COffsetVal = 0;
1902
1903 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1904 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1905 SAddr = Addr.getOperand(0);
1906 }
1907
1908 SAddr = SelectSAddrFI(CurDAG, SAddr);
1909
1910 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1911
1912 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1913 SIInstrFlags::FlatScratch)) {
1914 int64_t SplitImmOffset, RemainderOffset;
1915 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1916 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1917
1918 COffsetVal = SplitImmOffset;
1919
1920 SDLoc DL(N);
1921 SDValue AddOffset =
1922 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1923 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
1924 SAddr, AddOffset), 0);
1925 }
1926
1927 Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
1928
1929 return true;
1930 }
1931
1932 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1933 SDValue &Offset, bool &Imm) const {
1934 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1935 if (!C) {
1936 if (ByteOffsetNode.getValueType().isScalarInteger() &&
1937 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1938 Offset = ByteOffsetNode;
1939 Imm = false;
1940 return true;
1941 }
1942 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
1943 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
1944 Offset = ByteOffsetNode.getOperand(0);
1945 Imm = false;
1946 return true;
1947 }
1948 }
1949 return false;
1950 }
1951
1952 SDLoc SL(ByteOffsetNode);
1953 // GFX9 and GFX10 have signed byte immediate offsets.
1954 int64_t ByteOffset = C->getSExtValue();
1955 Optional<int64_t> EncodedOffset =
1956 AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
1957 if (EncodedOffset) {
1958 Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1959 Imm = true;
1960 return true;
1961 }
1962
1963 // SGPR and literal offsets are unsigned.
1964 if (ByteOffset < 0)
1965 return false;
1966
1967 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
1968 if (EncodedOffset) {
1969 Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1970 return true;
1971 }
1972
1973 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
1974 return false;
1975
1976 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
1977 Offset = SDValue(
1978 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
1979
1980 return true;
1981 }
1982
1983 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
1984 if (Addr.getValueType() != MVT::i32)
1985 return Addr;
1986
1987 // Zero-extend a 32-bit address.
1988 SDLoc SL(Addr);
1989
1990 const MachineFunction &MF = CurDAG->getMachineFunction();
1991 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1992 unsigned AddrHiVal = Info->get32BitAddressHighBits();
1993 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
1994
1995 const SDValue Ops[] = {
1996 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
1997 Addr,
1998 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
1999 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2000 0),
2001 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2002 };
2003
2004 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2005 Ops), 0);
2006 }
2007
2008 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2009 SDValue &Offset, bool &Imm) const {
2010 SDLoc SL(Addr);
2011
2012 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2013 // wraparound, because s_load instructions perform the addition in 64 bits.
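// For example, a 32-bit base of 0xFFFFFFF0 plus an offset of 0x20 wraps to
// 0x10 in 32-bit arithmetic but yields 0x100000010 in the 64-bit s_load
// addition, which is why the fold below requires the no-unsigned-wrap flag
// for 32-bit addresses.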
2014 if ((Addr.getValueType() != MVT::i32 ||
2015 Addr->getFlags().hasNoUnsignedWrap())) {
2016 SDValue N0, N1;
2017 // Extract the base and offset if possible.
2018 if (CurDAG->isBaseWithConstantOffset(Addr) ||
2019 Addr.getOpcode() == ISD::ADD) {
2020 N0 = Addr.getOperand(0);
2021 N1 = Addr.getOperand(1);
2022 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2023 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2024 }
2025 if (N0 && N1) {
2026 if (SelectSMRDOffset(N1, Offset, Imm)) {
2027 SBase = Expand32BitAddress(N0);
2028 return true;
2029 }
2030 }
2031 }
2032 SBase = Expand32BitAddress(Addr);
2033 Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
2034 Imm = true;
2035 return true;
2036 }
2037
2038 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2039 SDValue &Offset) const {
2040 bool Imm = false;
2041 return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
2042 }
2043
2044 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2045 SDValue &Offset) const {
2046
2047 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2048
2049 bool Imm = false;
2050 if (!SelectSMRD(Addr, SBase, Offset, Imm))
2051 return false;
2052
2053 return !Imm && isa<ConstantSDNode>(Offset);
2054 }
2055
2056 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2057 SDValue &Offset) const {
2058 bool Imm = false;
2059 return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
2060 !isa<ConstantSDNode>(Offset);
2061 }
2062
2063 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
2064 SDValue &Offset) const {
2065 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2066 // The immediate offset for S_BUFFER instructions is unsigned.
2067 if (auto Imm =
2068 AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
2069 Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2070 return true;
2071 }
2072 }
2073
2074 return false;
2075 }
2076
2077 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
2078 SDValue &Offset) const {
2079 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2080
2081 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2082 if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
2083 C->getZExtValue())) {
2084 Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2085 return true;
2086 }
2087 }
2088
2089 return false;
2090 }
2091
2092 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2093 SDValue &Base,
2094 SDValue &Offset) const {
2095 SDLoc DL(Index);
2096
2097 if (CurDAG->isBaseWithConstantOffset(Index)) {
2098 SDValue N0 = Index.getOperand(0);
2099 SDValue N1 = Index.getOperand(1);
2100 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2101
2102 // (add n0, c0)
2103 // Don't peel off the offset (c0) if doing so could possibly lead
2104 // the base (n0) to be negative.
2105 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
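// For example (illustrative values): with n0 = -1 and c0 = 1 the combined
// index is 0, but peeling off c0 would leave a negative base index in n0.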
2106 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2107 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2108 Base = N0;
2109 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2110 return true;
2111 }
2112 }
2113
2114 if (isa<ConstantSDNode>(Index))
2115 return false;
2116
2117 Base = Index;
2118 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2119 return true;
2120 }
2121
2122 SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
2123 SDValue Val, uint32_t Offset,
2124 uint32_t Width) {
2125 // Transformation function, pack the offset and width of a BFE into
2126 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2127 // source, bits [5:0] contain the offset and bits [22:16] the width.
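// For example, Offset = 8 and Width = 5 pack to 0x00050008.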
2128 uint32_t PackedVal = Offset | (Width << 16);
2129 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2130
2131 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2132 }
2133
2134 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2135 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2136 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2137 // Predicate: 0 < b <= c < 32
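// For example, ((a << 4) srl 12) becomes BFE_U32 a, 8, 20, i.e. a 20-bit
// field starting at bit 8 of a.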
2138
2139 const SDValue &Shl = N->getOperand(0);
2140 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2141 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2142
2143 if (B && C) {
2144 uint32_t BVal = B->getZExtValue();
2145 uint32_t CVal = C->getZExtValue();
2146
2147 if (0 < BVal && BVal <= CVal && CVal < 32) {
2148 bool Signed = N->getOpcode() == ISD::SRA;
2149 unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2150
2151 ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2152 32 - CVal));
2153 return;
2154 }
2155 }
2156 SelectCode(N);
2157 }
2158
2159 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2160 switch (N->getOpcode()) {
2161 case ISD::AND:
2162 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2163 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2164 // Predicate: isMask(mask)
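// For example, (a srl 8) & 0xff becomes BFE_U32 a, 8, 8.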
2165 const SDValue &Srl = N->getOperand(0);
2166 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2167 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2168
2169 if (Shift && Mask) {
2170 uint32_t ShiftVal = Shift->getZExtValue();
2171 uint32_t MaskVal = Mask->getZExtValue();
2172
2173 if (isMask_32(MaskVal)) {
2174 uint32_t WidthVal = countPopulation(MaskVal);
2175
2176 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
2177 Srl.getOperand(0), ShiftVal, WidthVal));
2178 return;
2179 }
2180 }
2181 }
2182 break;
2183 case ISD::SRL:
2184 if (N->getOperand(0).getOpcode() == ISD::AND) {
2185 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2186 // Predicate: isMask(mask >> b)
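// For example, ((a & 0xff00) srl 8) becomes BFE_U32 a, 8, 8, since
// 0xff00 >> 8 = 0xff has a popcount of 8.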
2187 const SDValue &And = N->getOperand(0);
2188 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2189 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2190
2191 if (Shift && Mask) {
2192 uint32_t ShiftVal = Shift->getZExtValue();
2193 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2194
2195 if (isMask_32(MaskVal)) {
2196 uint32_t WidthVal = countPopulation(MaskVal);
2197
2198 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
2199 And.getOperand(0), ShiftVal, WidthVal));
2200 return;
2201 }
2202 }
2203 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2204 SelectS_BFEFromShifts(N);
2205 return;
2206 }
2207 break;
2208 case ISD::SRA:
2209 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2210 SelectS_BFEFromShifts(N);
2211 return;
2212 }
2213 break;
2214
2215 case ISD::SIGN_EXTEND_INREG: {
2216 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2217 SDValue Src = N->getOperand(0);
2218 if (Src.getOpcode() != ISD::SRL)
2219 break;
2220
2221 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2222 if (!Amt)
2223 break;
2224
2225 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2226 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
2227 Amt->getZExtValue(), Width));
2228 return;
2229 }
2230 }
2231
2232 SelectCode(N);
2233 }
2234
2235 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2236 assert(N->getOpcode() == ISD::BRCOND);
2237 if (!N->hasOneUse())
2238 return false;
2239
2240 SDValue Cond = N->getOperand(1);
2241 if (Cond.getOpcode() == ISD::CopyToReg)
2242 Cond = Cond.getOperand(2);
2243
2244 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2245 return false;
2246
2247 MVT VT = Cond.getOperand(0).getSimpleValueType();
2248 if (VT == MVT::i32)
2249 return true;
2250
2251 if (VT == MVT::i64) {
2252 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2253
2254 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2255 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2256 }
2257
2258 return false;
2259 }
2260
2261 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2262 SDValue Cond = N->getOperand(1);
2263
2264 if (Cond.isUndef()) {
2265 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2266 N->getOperand(2), N->getOperand(0));
2267 return;
2268 }
2269
2270 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2271 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2272
2273 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2274 unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2275 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2276 SDLoc SL(N);
2277
2278 if (!UseSCCBr) {
2279 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2280 // analyzed what generates the vcc value, so we do not know whether vcc
2281 // bits for disabled lanes are 0. Thus we need to mask out bits for
2282 // disabled lanes.
2283 //
2284 // In the case that we select S_CBRANCH_SCC1 and it later gets changed to
2285 // S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2286 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2287 //
2288 // We could add an analysis of what generates the vcc value here and omit
2289 // the S_AND when it is unnecessary. But it would be better to add a separate
2290 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2291 // catches both cases.
2292 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2293 : AMDGPU::S_AND_B64,
2294 SL, MVT::i1,
2295 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2296 : AMDGPU::EXEC,
2297 MVT::i1),
2298 Cond),
2299 0);
2300 }
2301
2302 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2303 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2304 N->getOperand(2), // Basic Block
2305 VCC.getValue(0));
2306 }
2307
2308 void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
2309 MVT VT = N->getSimpleValueType(0);
2310 bool IsFMA = N->getOpcode() == ISD::FMA;
2311 if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
2312 !Subtarget->hasFmaMixInsts()) ||
2313 ((IsFMA && Subtarget->hasMadMixInsts()) ||
2314 (!IsFMA && Subtarget->hasFmaMixInsts()))) {
2315 SelectCode(N);
2316 return;
2317 }
2318
2319 SDValue Src0 = N->getOperand(0);
2320 SDValue Src1 = N->getOperand(1);
2321 SDValue Src2 = N->getOperand(2);
2322 unsigned Src0Mods, Src1Mods, Src2Mods;
2323
2324 // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
2325 // using the conversion from f16.
2326 bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
2327 bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
2328 bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
2329
2330 assert((IsFMA || !Mode.allFP32Denormals()) &&
2331 "fmad selected with denormals enabled");
2332 // TODO: We can select this with f32 denormals enabled if all the sources are
2333 // converted from f16 (in which case fmad isn't legal).
2334
2335 if (Sel0 || Sel1 || Sel2) {
2336 // For dummy operands.
2337 SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2338 SDValue Ops[] = {
2339 CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
2340 CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
2341 CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
2342 CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
2343 Zero, Zero
2344 };
2345
2346 CurDAG->SelectNodeTo(N,
2347 IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
2348 MVT::f32, Ops);
2349 } else {
2350 SelectCode(N);
2351 }
2352 }
2353
2354 // This is here because there isn't a way to use the generated sub0_sub1 as the
2355 // subreg index to EXTRACT_SUBREG in tablegen.
2356 void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
2357 MemSDNode *Mem = cast<MemSDNode>(N);
2358 unsigned AS = Mem->getAddressSpace();
2359 if (AS == AMDGPUAS::FLAT_ADDRESS) {
2360 SelectCode(N);
2361 return;
2362 }
2363
2364 MVT VT = N->getSimpleValueType(0);
2365 bool Is32 = (VT == MVT::i32);
2366 SDLoc SL(N);
2367
2368 MachineSDNode *CmpSwap = nullptr;
2369 if (Subtarget->hasAddr64()) {
2370 SDValue SRsrc, VAddr, SOffset, Offset;
2371
2372 if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) {
2373 unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
2374 AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
2375 SDValue CmpVal = Mem->getOperand(2);
2376 SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
2377
2378 // XXX - Do we care about glue operands?
2379
2380 SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol,
2381 Mem->getChain()};
2382
2383 CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
2384 }
2385 }
2386
2387 if (!CmpSwap) {
2388 SDValue SRsrc, SOffset, Offset;
2389 if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) {
2390 unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
2391 AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
2392
2393 SDValue CmpVal = Mem->getOperand(2);
2394 SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
2395 SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()};
2396
2397 CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
2398 }
2399 }
2400
2401 if (!CmpSwap) {
2402 SelectCode(N);
2403 return;
2404 }
2405
2406 MachineMemOperand *MMO = Mem->getMemOperand();
2407 CurDAG->setNodeMemRefs(CmpSwap, {MMO});
2408
2409 unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
2410 SDValue Extract
2411 = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));
2412
2413 ReplaceUses(SDValue(N, 0), Extract);
2414 ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
2415 CurDAG->RemoveDeadNode(N);
2416 }
2417
2418 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2419 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2420 // be copied to an SGPR with readfirstlane.
2421 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2422 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2423
2424 SDValue Chain = N->getOperand(0);
2425 SDValue Ptr = N->getOperand(2);
2426 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2427 MachineMemOperand *MMO = M->getMemOperand();
2428 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2429
2430 SDValue Offset;
2431 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2432 SDValue PtrBase = Ptr.getOperand(0);
2433 SDValue PtrOffset = Ptr.getOperand(1);
2434
2435 const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
2436 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2437 N = glueCopyToM0(N, PtrBase);
2438 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2439 }
2440 }
2441
2442 if (!Offset) {
2443 N = glueCopyToM0(N, Ptr);
2444 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2445 }
2446
2447 SDValue Ops[] = {
2448 Offset,
2449 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2450 Chain,
2451 N->getOperand(N->getNumOperands() - 1) // New glue
2452 };
2453
2454 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2455 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2456 }
2457
2458 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2459 switch (IntrID) {
2460 case Intrinsic::amdgcn_ds_gws_init:
2461 return AMDGPU::DS_GWS_INIT;
2462 case Intrinsic::amdgcn_ds_gws_barrier:
2463 return AMDGPU::DS_GWS_BARRIER;
2464 case Intrinsic::amdgcn_ds_gws_sema_v:
2465 return AMDGPU::DS_GWS_SEMA_V;
2466 case Intrinsic::amdgcn_ds_gws_sema_br:
2467 return AMDGPU::DS_GWS_SEMA_BR;
2468 case Intrinsic::amdgcn_ds_gws_sema_p:
2469 return AMDGPU::DS_GWS_SEMA_P;
2470 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2471 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2472 default:
2473 llvm_unreachable("not a gws intrinsic");
2474 }
2475 }
2476
2477 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2478 if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2479 !Subtarget->hasGWSSemaReleaseAll()) {
2480 // Let this error.
2481 SelectCode(N);
2482 return;
2483 }
2484
2485 // Chain, intrinsic ID, vsrc, offset
2486 const bool HasVSrc = N->getNumOperands() == 4;
2487 assert(HasVSrc || N->getNumOperands() == 3);
2488
2489 SDLoc SL(N);
2490 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2491 int ImmOffset = 0;
2492 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2493 MachineMemOperand *MMO = M->getMemOperand();
2494
2495 // Don't worry if the offset ends up in a VGPR. Only one lane will have
2496 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2497
2498 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2499 // offset field) % 64. Some versions of the programming guide omit the m0
2500 // part, or claim it's from offset 0.
2501 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2502 // If we have a constant offset, try to use the 0 in m0 as the base.
2503 // TODO: Look into changing the default m0 initialization value. If the
2504 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2505 // the immediate offset.
2506 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2507 ImmOffset = ConstOffset->getZExtValue();
2508 } else {
2509 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2510 ImmOffset = BaseOffset.getConstantOperandVal(1);
2511 BaseOffset = BaseOffset.getOperand(0);
2512 }
2513
2514 // Prefer to do the shift in an SGPR since it should be possible to use m0
2515 // as the result directly. If it's already an SGPR, it will be eliminated
2516 // later.
2517 SDNode *SGPROffset
2518 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2519 BaseOffset);
2520 // Shift to offset in m0
2521 SDNode *M0Base
2522 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2523 SDValue(SGPROffset, 0),
2524 CurDAG->getTargetConstant(16, SL, MVT::i32));
2525 glueCopyToM0(N, SDValue(M0Base, 0));
2526 }
2527
2528 SDValue Chain = N->getOperand(0);
2529 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2530
2531 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2532 SmallVector<SDValue, 5> Ops;
2533 if (HasVSrc)
2534 Ops.push_back(N->getOperand(2));
2535 Ops.push_back(OffsetField);
2536 Ops.push_back(Chain);
2537
2538 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2539 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2540 }
2541
2542 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2543 if (Subtarget->getLDSBankCount() != 16) {
2544 // This is a single instruction with a pattern.
2545 SelectCode(N);
2546 return;
2547 }
2548
2549 SDLoc DL(N);
2550
2551 // This requires 2 instructions. It is possible to write a pattern to support
2552 // this, but the generated isel emitter doesn't correctly deal with multiple
2553 // output instructions using the same physical register input. The copy to m0
2554 // is incorrectly placed before the second instruction.
2555 //
2556 // TODO: Match source modifiers.
2557 //
2558 // def : Pat <
2559 // (int_amdgcn_interp_p1_f16
2560 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2561 // (i32 timm:$attrchan), (i32 timm:$attr),
2562 // (i1 timm:$high), M0),
2563 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2564 // timm:$attrchan, 0,
2565 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2566 // let Predicates = [has16BankLDS];
2567 // }
2568
2569 // 16 bank LDS
2570 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2571 N->getOperand(5), SDValue());
2572
2573 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2574
2575 SDNode *InterpMov =
2576 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2577 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2578 N->getOperand(3), // Attr
2579 N->getOperand(2), // Attrchan
2580 ToM0.getValue(1) // In glue
2581 });
2582
2583 SDNode *InterpP1LV =
2584 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2585 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2586 N->getOperand(1), // Src0
2587 N->getOperand(3), // Attr
2588 N->getOperand(2), // Attrchan
2589 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2590 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2591 N->getOperand(4), // high
2592 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2593 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2594 SDValue(InterpMov, 1)
2595 });
2596
2597 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2598 }
2599
2600 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2601 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2602 switch (IntrID) {
2603 case Intrinsic::amdgcn_ds_append:
2604 case Intrinsic::amdgcn_ds_consume: {
2605 if (N->getValueType(0) != MVT::i32)
2606 break;
2607 SelectDSAppendConsume(N, IntrID);
2608 return;
2609 }
2610 }
2611
2612 SelectCode(N);
2613 }
2614
2615 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2616 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2617 unsigned Opcode;
2618 switch (IntrID) {
2619 case Intrinsic::amdgcn_wqm:
2620 Opcode = AMDGPU::WQM;
2621 break;
2622 case Intrinsic::amdgcn_softwqm:
2623 Opcode = AMDGPU::SOFT_WQM;
2624 break;
2625 case Intrinsic::amdgcn_wwm:
2626 case Intrinsic::amdgcn_strict_wwm:
2627 Opcode = AMDGPU::STRICT_WWM;
2628 break;
2629 case Intrinsic::amdgcn_strict_wqm:
2630 Opcode = AMDGPU::STRICT_WQM;
2631 break;
2632 case Intrinsic::amdgcn_interp_p1_f16:
2633 SelectInterpP1F16(N);
2634 return;
2635 default:
2636 SelectCode(N);
2637 return;
2638 }
2639
2640 SDValue Src = N->getOperand(1);
2641 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2642 }
2643
2644 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2645 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2646 switch (IntrID) {
2647 case Intrinsic::amdgcn_ds_gws_init:
2648 case Intrinsic::amdgcn_ds_gws_barrier:
2649 case Intrinsic::amdgcn_ds_gws_sema_v:
2650 case Intrinsic::amdgcn_ds_gws_sema_br:
2651 case Intrinsic::amdgcn_ds_gws_sema_p:
2652 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2653 SelectDS_GWS(N, IntrID);
2654 return;
2655 default:
2656 break;
2657 }
2658
2659 SelectCode(N);
2660 }
2661
2662 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2663 unsigned &Mods,
2664 bool AllowAbs) const {
2665 Mods = 0;
2666 Src = In;
2667
2668 if (Src.getOpcode() == ISD::FNEG) {
2669 Mods |= SISrcMods::NEG;
2670 Src = Src.getOperand(0);
2671 }
2672
2673 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2674 Mods |= SISrcMods::ABS;
2675 Src = Src.getOperand(0);
2676 }
2677
2678 return true;
2679 }
2680
2681 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2682 SDValue &SrcMods) const {
2683 unsigned Mods;
2684 if (SelectVOP3ModsImpl(In, Src, Mods)) {
2685 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2686 return true;
2687 }
2688
2689 return false;
2690 }
2691
2692 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2693 SDValue &SrcMods) const {
2694 unsigned Mods;
2695 if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
2696 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2697 return true;
2698 }
2699
2700 return false;
2701 }
2702
2703 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
2704 SDValue &SrcMods) const {
2705 SelectVOP3Mods(In, Src, SrcMods);
2706 return isNoNanSrc(Src);
2707 }
2708
2709 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2710 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2711 return false;
2712
2713 Src = In;
2714 return true;
2715 }
2716
2717 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2718 SDValue &SrcMods, SDValue &Clamp,
2719 SDValue &Omod) const {
2720 SDLoc DL(In);
2721 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2722 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2723
2724 return SelectVOP3Mods(In, Src, SrcMods);
2725 }
2726
2727 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2728 SDValue &SrcMods, SDValue &Clamp,
2729 SDValue &Omod) const {
2730 SDLoc DL(In);
2731 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2732 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2733
2734 return SelectVOP3BMods(In, Src, SrcMods);
2735 }
2736
2737 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2738 SDValue &Clamp, SDValue &Omod) const {
2739 Src = In;
2740
2741 SDLoc DL(In);
2742 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2743 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2744
2745 return true;
2746 }
2747
2748 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2749 SDValue &SrcMods) const {
2750 unsigned Mods = 0;
2751 Src = In;
2752
2753 if (Src.getOpcode() == ISD::FNEG) {
2754 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2755 Src = Src.getOperand(0);
2756 }
2757
2758 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
2759 unsigned VecMods = Mods;
2760
2761 SDValue Lo = stripBitcast(Src.getOperand(0));
2762 SDValue Hi = stripBitcast(Src.getOperand(1));
2763
2764 if (Lo.getOpcode() == ISD::FNEG) {
2765 Lo = stripBitcast(Lo.getOperand(0));
2766 Mods ^= SISrcMods::NEG;
2767 }
2768
2769 if (Hi.getOpcode() == ISD::FNEG) {
2770 Hi = stripBitcast(Hi.getOperand(0));
2771 Mods ^= SISrcMods::NEG_HI;
2772 }
2773
2774 if (isExtractHiElt(Lo, Lo))
2775 Mods |= SISrcMods::OP_SEL_0;
2776
2777 if (isExtractHiElt(Hi, Hi))
2778 Mods |= SISrcMods::OP_SEL_1;
2779
2780 unsigned VecSize = Src.getValueSizeInBits();
2781 Lo = stripExtractLoElt(Lo);
2782 Hi = stripExtractLoElt(Hi);
2783
2784 if (Lo.getValueSizeInBits() > VecSize) {
2785 Lo = CurDAG->getTargetExtractSubreg(
2786 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2787 MVT::getIntegerVT(VecSize), Lo);
2788 }
2789
2790 if (Hi.getValueSizeInBits() > VecSize) {
2791 Hi = CurDAG->getTargetExtractSubreg(
2792 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2793 MVT::getIntegerVT(VecSize), Hi);
2794 }
2795
2796 assert(Lo.getValueSizeInBits() <= VecSize &&
2797 Hi.getValueSizeInBits() <= VecSize);
2798
2799 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2800 // Really a scalar input. Just select from the low half of the register to
2801 // avoid packing.
2802
2803 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2804 Src = Lo;
2805 } else {
2806 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2807
2808 SDLoc SL(In);
2809 SDValue Undef = SDValue(
2810 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2811 Lo.getValueType()), 0);
2812 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2813 : AMDGPU::SReg_64RegClassID;
2814 const SDValue Ops[] = {
2815 CurDAG->getTargetConstant(RC, SL, MVT::i32),
2816 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2817 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2818
2819 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2820 Src.getValueType(), Ops), 0);
2821 }
2822 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2823 return true;
2824 }
2825
2826 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2827 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2828 .bitcastToAPInt().getZExtValue();
2829 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
2830 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
2831 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2832 return true;
2833 }
2834 }
2835
2836 Mods = VecMods;
2837 }
2838
2839 // Packed instructions do not have abs modifiers.
2840 Mods |= SISrcMods::OP_SEL_1;
2841
2842 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2843 return true;
2844 }
2845
2846 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2847 SDValue &SrcMods) const {
2848 Src = In;
2849 // FIXME: Handle op_sel
2850 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2851 return true;
2852 }
2853
2854 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2855 SDValue &SrcMods) const {
2856 // FIXME: Handle op_sel
2857 return SelectVOP3Mods(In, Src, SrcMods);
2858 }
2859
2860 // The return value is not whether the match is possible (which it always is),
2861 // but whether or not a conversion is really used.
2862 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
2863 unsigned &Mods) const {
2864 Mods = 0;
2865 SelectVOP3ModsImpl(In, Src, Mods);
2866
2867 if (Src.getOpcode() == ISD::FP_EXTEND) {
2868 Src = Src.getOperand(0);
2869 assert(Src.getValueType() == MVT::f16);
2870 Src = stripBitcast(Src);
2871
2872 // Be careful about folding modifiers if we already have an abs. fneg is
2873 // applied last, so we don't want to apply an earlier fneg.
2874 if ((Mods & SISrcMods::ABS) == 0) {
2875 unsigned ModsTmp;
2876 SelectVOP3ModsImpl(Src, Src, ModsTmp);
2877
2878 if ((ModsTmp & SISrcMods::NEG) != 0)
2879 Mods ^= SISrcMods::NEG;
2880
2881 if ((ModsTmp & SISrcMods::ABS) != 0)
2882 Mods |= SISrcMods::ABS;
2883 }
2884
2885 // op_sel/op_sel_hi decide the source type and source.
2886 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
2887 // If the source's op_sel is set, it picks the high half of the source
2888 // register.
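// For example, an FP_EXTEND taken from the high f16 half of a 32-bit
// register ends up with both OP_SEL_1 (f16 source) and OP_SEL_0 (high half)
// set below.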
2889
2890 Mods |= SISrcMods::OP_SEL_1;
2891 if (isExtractHiElt(Src, Src)) {
2892 Mods |= SISrcMods::OP_SEL_0;
2893
2894 // TODO: Should we try to look for neg/abs here?
2895 }
2896
2897 return true;
2898 }
2899
2900 return false;
2901 }
2902
2903 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2904 SDValue &SrcMods) const {
2905 unsigned Mods = 0;
2906 SelectVOP3PMadMixModsImpl(In, Src, Mods);
2907 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2908 return true;
2909 }
2910
2911 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2912 if (In.isUndef())
2913 return CurDAG->getUNDEF(MVT::i32);
2914
2915 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2916 SDLoc SL(In);
2917 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2918 }
2919
2920 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2921 SDLoc SL(In);
2922 return CurDAG->getConstant(
2923 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2924 }
2925
2926 SDValue Src;
2927 if (isExtractHiElt(In, Src))
2928 return Src;
2929
2930 return SDValue();
2931 }
2932
2933 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
2934 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
2935
2936 const SIRegisterInfo *SIRI =
2937 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
2938 const SIInstrInfo * SII =
2939 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2940
2941 unsigned Limit = 0;
2942 bool AllUsesAcceptSReg = true;
2943 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
2944 Limit < 10 && U != E; ++U, ++Limit) {
2945 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
2946
2947 // If the register class is unknown, it could be a register class that
2948 // needs to be an SGPR, e.g. one imposed by an inline asm
2949 // constraint.
2950 if (!RC || SIRI->isSGPRClass(RC))
2951 return false;
2952
2953 if (RC != &AMDGPU::VS_32RegClass) {
2954 AllUsesAcceptSReg = false;
2955 SDNode * User = *U;
2956 if (User->isMachineOpcode()) {
2957 unsigned Opc = User->getMachineOpcode();
2958 MCInstrDesc Desc = SII->get(Opc);
2959 if (Desc.isCommutable()) {
2960 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
2961 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
2962 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
2963 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
2964 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
2965 if (CommutedRC == &AMDGPU::VS_32RegClass)
2966 AllUsesAcceptSReg = true;
2967 }
2968 }
2969 }
2970 // If "AllUsesAcceptSReg == false" so far we haven't suceeded
2971 // commuting current user. This means have at least one use
2972 // that strictly require VGPR. Thus, we will not attempt to commute
2973 // other user instructions.
2974 if (!AllUsesAcceptSReg)
2975 break;
2976 }
2977 }
2978 return !AllUsesAcceptSReg && (Limit < 10);
2979 }
2980
2981 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
2982 auto Ld = cast<LoadSDNode>(N);
2983
2984 return Ld->getAlignment() >= 4 &&
2985 (
2986 (
2987 (
2988 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
2989 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
2990 )
2991 &&
2992 !N->isDivergent()
2993 )
2994 ||
2995 (
2996 Subtarget->getScalarizeGlobalBehavior() &&
2997 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
2998 Ld->isSimple() &&
2999 !N->isDivergent() &&
3000 static_cast<const SITargetLowering *>(
3001 getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
3002 )
3003 );
3004 }
3005
3006 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3007 const AMDGPUTargetLowering& Lowering =
3008 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3009 bool IsModified = false;
3010 do {
3011 IsModified = false;
3012
3013 // Go over all selected nodes and try to fold them a bit more
3014 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3015 while (Position != CurDAG->allnodes_end()) {
3016 SDNode *Node = &*Position++;
3017 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3018 if (!MachineNode)
3019 continue;
3020
3021 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3022 if (ResNode != Node) {
3023 if (ResNode)
3024 ReplaceUses(Node, ResNode);
3025 IsModified = true;
3026 }
3027 }
3028 CurDAG->RemoveDeadNodes();
3029 } while (IsModified);
3030 }
3031
3032 bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
3033 Subtarget = &MF.getSubtarget<R600Subtarget>();
3034 return SelectionDAGISel::runOnMachineFunction(MF);
3035 }
3036
3037 bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
3038 if (!N->readMem())
3039 return false;
3040 if (CbId == -1)
3041 return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3042 N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
3043
3044 return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
3045 }
3046
3047 bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
3048 SDValue& IntPtr) {
3049 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
3050 IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
3051 true);
3052 return true;
3053 }
3054 return false;
3055 }
3056
3057 bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
3058 SDValue& BaseReg, SDValue &Offset) {
3059 if (!isa<ConstantSDNode>(Addr)) {
3060 BaseReg = Addr;
3061 Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
3062 return true;
3063 }
3064 return false;
3065 }
3066
3067 void R600DAGToDAGISel::Select(SDNode *N) {
3068 unsigned int Opc = N->getOpcode();
3069 if (N->isMachineOpcode()) {
3070 N->setNodeId(-1);
3071 return; // Already selected.
3072 }
3073
3074 switch (Opc) {
3075 default: break;
3076 case AMDGPUISD::BUILD_VERTICAL_VECTOR:
3077 case ISD::SCALAR_TO_VECTOR:
3078 case ISD::BUILD_VECTOR: {
3079 EVT VT = N->getValueType(0);
3080 unsigned NumVectorElts = VT.getVectorNumElements();
3081 unsigned RegClassID;
3082 // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
3083 // that adds a 128-bit register copy when going through the
3084 // TwoAddressInstructions pass. We want to avoid 128-bit copies as much as
3085 // possible because they can't be bundled by our scheduler.
3086 switch(NumVectorElts) {
3087 case 2: RegClassID = R600::R600_Reg64RegClassID; break;
3088 case 4:
3089 if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
3090 RegClassID = R600::R600_Reg128VerticalRegClassID;
3091 else
3092 RegClassID = R600::R600_Reg128RegClassID;
3093 break;
3094 default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
3095 }
3096 SelectBuildVector(N, RegClassID);
3097 return;
3098 }
3099 }
3100
3101 SelectCode(N);
3102 }
3103
3104 bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
3105 SDValue &Offset) {
3106 ConstantSDNode *C;
3107 SDLoc DL(Addr);
3108
3109 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
3110 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
3111 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
3112 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
3113 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
3114 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
3115 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
3116 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
3117 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
3118 Base = Addr.getOperand(0);
3119 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
3120 } else {
3121 Base = Addr;
3122 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
3123 }
3124
3125 return true;
3126 }
3127
3128 bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
3129 SDValue &Offset) {
3130 ConstantSDNode *IMMOffset;
3131
3132 if (Addr.getOpcode() == ISD::ADD
3133 && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
3134 && isInt<16>(IMMOffset->getZExtValue())) {
3135
3136 Base = Addr.getOperand(0);
3137 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
3138 MVT::i32);
3139 return true;
3140 // If the pointer address is constant, we can move it to the offset field.
3141 } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
3142 && isInt<16>(IMMOffset->getZExtValue())) {
3143 Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
3144 SDLoc(CurDAG->getEntryNode()),
3145 R600::ZERO, MVT::i32);
3146 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
3147 MVT::i32);
3148 return true;
3149 }
3150
3151 // Default case, no offset
3152 Base = Addr;
3153 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
3154 return true;
3155 }
3156