1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for SI
12 //
13 //===----------------------------------------------------------------------===//
14
15 #ifdef _MSC_VER
16 // Provide M_PI.
17 #define _USE_MATH_DEFINES
18 #include <cmath>
19 #endif
20
21 #include "SIISelLowering.h"
22 #include "AMDGPU.h"
23 #include "AMDGPUIntrinsicInfo.h"
24 #include "AMDGPUSubtarget.h"
25 #include "SIInstrInfo.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "SIRegisterInfo.h"
28 #include "llvm/ADT/BitVector.h"
29 #include "llvm/CodeGen/CallingConvLower.h"
30 #include "llvm/CodeGen/MachineInstrBuilder.h"
31 #include "llvm/CodeGen/MachineRegisterInfo.h"
32 #include "llvm/CodeGen/SelectionDAG.h"
33 #include "llvm/IR/Function.h"
34 #include "llvm/ADT/SmallString.h"
35
36 using namespace llvm;
37
38 SITargetLowering::SITargetLowering(TargetMachine &TM) :
39 AMDGPUTargetLowering(TM) {
40 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
41 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
42
43 addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
44 addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
45
46 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
47 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
48
49 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
50 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
51 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
52
53 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
54 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
55
56 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
57 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
58
59 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
60 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
61
62 computeRegisterProperties();
63
64 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
65 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
66 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
67 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
68
69 setOperationAction(ISD::ADD, MVT::i32, Legal);
70 setOperationAction(ISD::ADDC, MVT::i32, Legal);
71 setOperationAction(ISD::ADDE, MVT::i32, Legal);
72 setOperationAction(ISD::SUBC, MVT::i32, Legal);
73 setOperationAction(ISD::SUBE, MVT::i32, Legal);
74
75 setOperationAction(ISD::FSIN, MVT::f32, Custom);
76 setOperationAction(ISD::FCOS, MVT::f32, Custom);
77
78 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
79 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
80 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
81 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
82
83 // We need to custom lower vector loads and stores from local memory
84 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
85 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
86 setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
87
88 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
89 setOperationAction(ISD::STORE, MVT::v16i32, Custom);
90
91 setOperationAction(ISD::STORE, MVT::i1, Custom);
92 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
93
94 setOperationAction(ISD::SELECT, MVT::i64, Custom);
95 setOperationAction(ISD::SELECT, MVT::f64, Promote);
96 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
97
98 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
99 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
100 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
101 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
102
103 setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
104 setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
105
106 setOperationAction(ISD::BSWAP, MVT::i32, Legal);
107
108 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
109 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
110 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
111
112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
113 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
114 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
115
116 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
117 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
118 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
119
120 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
121 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
122
123 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
124 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
125 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
126 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
127
128 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
129 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
130
131 for (MVT VT : MVT::integer_valuetypes()) {
132 if (VT == MVT::i64)
133 continue;
134
135 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
137 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
138 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
139
140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
141 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
142 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
143 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
144
145 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
146 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
147 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
148 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
149 }
150
151 for (MVT VT : MVT::integer_vector_valuetypes()) {
152 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
153 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
154 }
155
156 for (MVT VT : MVT::fp_valuetypes())
157 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
158
159 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
160 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
161 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
162 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
163
164 setOperationAction(ISD::LOAD, MVT::i1, Custom);
165
166 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
167 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
168 setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
169
170 // These should use UDIVREM, so set them to expand
171 setOperationAction(ISD::UDIV, MVT::i64, Expand);
172 setOperationAction(ISD::UREM, MVT::i64, Expand);
173
174 // We only support LOAD/STORE and vector manipulation ops for vectors
175 // with > 4 elements.
176 MVT VecTypes[] = {
177 MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
178 };
179
180 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
181 setOperationAction(ISD::SELECT, MVT::i1, Promote);
182
183 for (MVT VT : VecTypes) {
184 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
185 switch(Op) {
186 case ISD::LOAD:
187 case ISD::STORE:
188 case ISD::BUILD_VECTOR:
189 case ISD::BITCAST:
190 case ISD::EXTRACT_VECTOR_ELT:
191 case ISD::INSERT_VECTOR_ELT:
192 case ISD::INSERT_SUBVECTOR:
193 case ISD::EXTRACT_SUBVECTOR:
194 break;
195 case ISD::CONCAT_VECTORS:
196 setOperationAction(Op, VT, Custom);
197 break;
198 default:
199 setOperationAction(Op, VT, Expand);
200 break;
201 }
202 }
203 }
204
205 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
206 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
207 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
208 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
209 setOperationAction(ISD::FRINT, MVT::f64, Legal);
210 }
211
212 setOperationAction(ISD::FDIV, MVT::f32, Custom);
213 setOperationAction(ISD::FDIV, MVT::f64, Custom);
214
215 setTargetDAGCombine(ISD::FADD);
216 setTargetDAGCombine(ISD::FSUB);
217 setTargetDAGCombine(ISD::FMINNUM);
218 setTargetDAGCombine(ISD::FMAXNUM);
219 setTargetDAGCombine(ISD::SELECT_CC);
220 setTargetDAGCombine(ISD::SETCC);
221 setTargetDAGCombine(ISD::AND);
222 setTargetDAGCombine(ISD::OR);
223 setTargetDAGCombine(ISD::UINT_TO_FP);
224
225 // All memory operations. Some folding on the pointer operand is done to help
226 // matching the constant offsets in the addressing modes.
227 setTargetDAGCombine(ISD::LOAD);
228 setTargetDAGCombine(ISD::STORE);
229 setTargetDAGCombine(ISD::ATOMIC_LOAD);
230 setTargetDAGCombine(ISD::ATOMIC_STORE);
231 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
232 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
233 setTargetDAGCombine(ISD::ATOMIC_SWAP);
234 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
235 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
236 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
237 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
238 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
239 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
240 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
241 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
242 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
243 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
244
245 setSchedulingPreference(Sched::RegPressure);
246 }
247
248 //===----------------------------------------------------------------------===//
249 // TargetLowering queries
250 //===----------------------------------------------------------------------===//
251
252 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
253 EVT) const {
254 // SI has some legal vector types, but no legal vector operations. Say no
255 // shuffles are legal in order to prefer scalarizing some vector operations.
256 return false;
257 }
258
259 // FIXME: This really needs an address space argument. The immediate offset
260 // size is different for different memory instruction sets.
261
262 // The single offset DS instructions have a 16-bit unsigned byte offset.
263 //
264 // MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
265 // r + i with addr64. 32-bit has more addressing mode options. Depending on the
266 // resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
267 //
268 // SMRD instructions have an 8-bit, dword offset.
269 //
270 bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
271 Type *Ty) const {
272 // No global is ever allowed as a base.
273 if (AM.BaseGV)
274 return false;
275
276 // Allow a 16-bit unsigned immediate field, since this is what DS instructions
277 // use.
278 if (!isUInt<16>(AM.BaseOffs))
279 return false;
280
281 // Only support r+r,
282 switch (AM.Scale) {
283 case 0: // "r+i" or just "i", depending on HasBaseReg.
284 break;
285 case 1:
286 if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
287 return false;
288 // Otherwise we have r+r or r+i.
289 break;
290 case 2:
291 if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
292 return false;
293 // Allow 2*r as r+r.
294 break;
295 default: // Don't allow n * r
296 return false;
297 }
298
299 return true;
300 }
301
302 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
303 unsigned AddrSpace,
304 unsigned Align,
305 bool *IsFast) const {
306 if (IsFast)
307 *IsFast = false;
308
309 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
310 // which isn't a simple VT.
311 if (!VT.isSimple() || VT == MVT::Other)
312 return false;
313
314 // TODO - CI+ supports unaligned memory accesses, but this requires driver
315 // support.
316
317 // XXX - The only mention I see of this in the ISA manual is for LDS direct
318 // reads the "byte address and must be dword aligned". Is it also true for the
319 // normal loads and stores?
320 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
321 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
322 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
323 // with adjacent offsets.
324 return Align % 4 == 0;
325 }
326
327 // Values smaller than a dword must be aligned.
328 // FIXME: This should be allowed on CI+
329 if (VT.bitsLT(MVT::i32))
330 return false;
331
332 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
333 // byte-address are ignored, thus forcing Dword alignment.
334 // This applies to private, global, and constant memory.
335 if (IsFast)
336 *IsFast = true;
337
338 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
339 }
340
341 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
342 unsigned SrcAlign, bool IsMemset,
343 bool ZeroMemset,
344 bool MemcpyStrSrc,
345 MachineFunction &MF) const {
346 // FIXME: Should account for address space here.
347
348 // The default fallback uses the private pointer size as a guess for a type to
349 // use. Make sure we switch these to 64-bit accesses.
350
351 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
352 return MVT::v4i32;
353
354 if (Size >= 8 && DstAlign >= 4)
355 return MVT::v2i32;
356
357 // Use the default.
358 return MVT::Other;
359 }
360
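// Vectors with more than one sub-dword (16-bit or smaller) element are split
// rather than widened or scalarized.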
361 TargetLoweringBase::LegalizeTypeAction
362 SITargetLowering::getPreferredVectorAction(EVT VT) const {
363 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
364 return TypeSplitVector;
365
366 return TargetLoweringBase::getPreferredVectorAction(VT);
367 }
368
369 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
370 Type *Ty) const {
371 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
372 getTargetMachine().getSubtargetImpl()->getInstrInfo());
373 return TII->isInlineConstant(Imm);
374 }
375
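// Load a kernel argument of type MemVT from the input buffer. The buffer base
// comes from the preloaded input-pointer SGPR pair, Offset is the byte offset
// of the argument, and the load is marked invariant since kernel arguments do
// not change during execution.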
376 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
377 SDLoc SL, SDValue Chain,
378 unsigned Offset, bool Signed) const {
379 const DataLayout *DL = getDataLayout();
380 MachineFunction &MF = DAG.getMachineFunction();
381 const SIRegisterInfo *TRI =
382 static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
383 unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
384
385 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
386
387 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
388 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
389 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
390 MRI.getLiveInVirtReg(InputPtrReg), MVT::i64);
391 SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr,
392 DAG.getConstant(Offset, MVT::i64));
393 SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
394 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
395
396 return DAG.getLoad(ISD::UNINDEXED, Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD,
397 VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
398 false, // isVolatile
399 true, // isNonTemporal
400 true, // isInvariant
401 DL->getABITypeAlignment(Ty)); // Alignment
402 }
403
404 SDValue SITargetLowering::LowerFormalArguments(
405 SDValue Chain,
406 CallingConv::ID CallConv,
407 bool isVarArg,
408 const SmallVectorImpl<ISD::InputArg> &Ins,
409 SDLoc DL, SelectionDAG &DAG,
410 SmallVectorImpl<SDValue> &InVals) const {
411
412 const TargetMachine &TM = getTargetMachine();
413 const SIRegisterInfo *TRI =
414 static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo());
415
416 MachineFunction &MF = DAG.getMachineFunction();
417 FunctionType *FType = MF.getFunction()->getFunctionType();
418 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
419
420 assert(CallConv == CallingConv::C);
421
422 SmallVector<ISD::InputArg, 16> Splits;
423 BitVector Skipped(Ins.size());
424
425 for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
426 const ISD::InputArg &Arg = Ins[i];
427
428 // First check if it's a PS input addr
429 if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
430 !Arg.Flags.isByVal()) {
431
432 assert((PSInputNum <= 15) && "Too many PS inputs!");
433
434 if (!Arg.Used) {
435 // We can safely skip PS inputs
436 Skipped.set(i);
437 ++PSInputNum;
438 continue;
439 }
440
441 Info->PSInputAddr |= 1 << PSInputNum++;
442 }
443
444 // Second, split vertices into their elements
445 if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
446 ISD::InputArg NewArg = Arg;
447 NewArg.Flags.setSplit();
448 NewArg.VT = Arg.VT.getVectorElementType();
449
450 // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
451 // three or five element vertex only needs three or five registers,
452 // NOT four or eight.
453 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
454 unsigned NumElements = ParamType->getVectorNumElements();
455
456 for (unsigned j = 0; j != NumElements; ++j) {
457 Splits.push_back(NewArg);
458 NewArg.PartOffset += NewArg.VT.getStoreSize();
459 }
460
461 } else if (Info->getShaderType() != ShaderType::COMPUTE) {
462 Splits.push_back(Arg);
463 }
464 }
465
466 SmallVector<CCValAssign, 16> ArgLocs;
467 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
468 *DAG.getContext());
469
470 // At least one interpolation mode must be enabled or else the GPU will hang.
471 if (Info->getShaderType() == ShaderType::PIXEL &&
472 (Info->PSInputAddr & 0x7F) == 0) {
473 Info->PSInputAddr |= 1;
474 CCInfo.AllocateReg(AMDGPU::VGPR0);
475 CCInfo.AllocateReg(AMDGPU::VGPR1);
476 }
477
478 // The pointer to the list of arguments is stored in SGPR0, SGPR1
479 // The pointer to the scratch buffer is stored in SGPR2, SGPR3
480 if (Info->getShaderType() == ShaderType::COMPUTE) {
481 if (Subtarget->isAmdHsaOS())
482 Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
483 else
484 Info->NumUserSGPRs = 4;
485
486 unsigned InputPtrReg =
487 TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
488 unsigned InputPtrRegLo =
489 TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
490 unsigned InputPtrRegHi =
491 TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
492
493 unsigned ScratchPtrReg =
494 TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
495 unsigned ScratchPtrRegLo =
496 TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
497 unsigned ScratchPtrRegHi =
498 TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);
499
500 CCInfo.AllocateReg(InputPtrRegLo);
501 CCInfo.AllocateReg(InputPtrRegHi);
502 CCInfo.AllocateReg(ScratchPtrRegLo);
503 CCInfo.AllocateReg(ScratchPtrRegHi);
504 MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
505 MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
506 }
507
508 if (Info->getShaderType() == ShaderType::COMPUTE) {
509 getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
510 Splits);
511 }
512
513 AnalyzeFormalArguments(CCInfo, Splits);
514
515 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
516
517 const ISD::InputArg &Arg = Ins[i];
518 if (Skipped[i]) {
519 InVals.push_back(DAG.getUNDEF(Arg.VT));
520 continue;
521 }
522
523 CCValAssign &VA = ArgLocs[ArgIdx++];
524 MVT VT = VA.getLocVT();
525
526 if (VA.isMemLoc()) {
527 VT = Ins[i].VT;
528 EVT MemVT = Splits[i].VT;
529 const unsigned Offset = 36 + VA.getLocMemOffset();
530 // The first 36 bytes of the input buffer contain information about
531 // thread group and global sizes.
532 SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
533 Offset, Ins[i].Flags.isSExt());
534
535 const PointerType *ParamTy =
536 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
537 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
538 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
539 // On SI local pointers are just offsets into LDS, so they are always
540 // less than 16-bits. On CI and newer they could potentially be
541 // real pointers, so we can't guarantee their size.
542 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
543 DAG.getValueType(MVT::i16));
544 }
545
546 InVals.push_back(Arg);
547 Info->ABIArgOffset = Offset + MemVT.getStoreSize();
548 continue;
549 }
550 assert(VA.isRegLoc() && "Parameter must be in a register!");
551
552 unsigned Reg = VA.getLocReg();
553
554 if (VT == MVT::i64) {
555 // For now assume it is a pointer
556 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
557 &AMDGPU::SReg_64RegClass);
558 Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
559 InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
560 continue;
561 }
562
563 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
564
565 Reg = MF.addLiveIn(Reg, RC);
566 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
567
568 if (Arg.VT.isVector()) {
569
570 // Build a vector from the registers
571 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
572 unsigned NumElements = ParamType->getVectorNumElements();
573
574 SmallVector<SDValue, 4> Regs;
575 Regs.push_back(Val);
576 for (unsigned j = 1; j != NumElements; ++j) {
577 Reg = ArgLocs[ArgIdx++].getLocReg();
578 Reg = MF.addLiveIn(Reg, RC);
579 Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
580 }
581
582 // Fill up the missing vector elements
583 NumElements = Arg.VT.getVectorNumElements() - NumElements;
584 for (unsigned j = 0; j != NumElements; ++j)
585 Regs.push_back(DAG.getUNDEF(VT));
586
587 InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
588 continue;
589 }
590
591 InVals.push_back(Val);
592 }
593
594 if (Info->getShaderType() != ShaderType::COMPUTE) {
595 unsigned ScratchIdx = CCInfo.getFirstUnallocated(
596 AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs());
597 Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
598 }
599 return Chain;
600 }
601
602 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
603 MachineInstr * MI, MachineBasicBlock * BB) const {
604
605 MachineBasicBlock::iterator I = *MI;
606 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
607 getTargetMachine().getSubtargetImpl()->getInstrInfo());
608
609 switch (MI->getOpcode()) {
610 default:
611 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
612 case AMDGPU::BRANCH: return BB;
613 case AMDGPU::V_SUB_F64: {
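// Lower a - b as v_add_f64 a, -b; the SRC1 modifiers immediate of 1 sets the
// negate bit on the second source.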
614 unsigned DestReg = MI->getOperand(0).getReg();
615 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
616 .addImm(0) // SRC0 modifiers
617 .addReg(MI->getOperand(1).getReg())
618 .addImm(1) // SRC1 modifiers
619 .addReg(MI->getOperand(2).getReg())
620 .addImm(0) // CLAMP
621 .addImm(0); // OMOD
622 MI->eraseFromParent();
623 break;
624 }
625 case AMDGPU::SI_RegisterStorePseudo: {
626 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
627 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
628 MachineInstrBuilder MIB =
629 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
630 Reg);
631 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
632 MIB.addOperand(MI->getOperand(i));
633
634 MI->eraseFromParent();
635 break;
636 }
637 }
638 return BB;
639 }
640
641 EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
642 if (!VT.isVector()) {
643 return MVT::i1;
644 }
645 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
646 }
647
648 MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
649 return MVT::i32;
650 }
651
652 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
653 VT = VT.getScalarType();
654
655 if (!VT.isSimple())
656 return false;
657
658 switch (VT.getSimpleVT().SimpleTy) {
659 case MVT::f32:
660 return false; /* There is V_MAD_F32 for f32 */
661 case MVT::f64:
662 return true;
663 default:
664 break;
665 }
666
667 return false;
668 }
669
670 //===----------------------------------------------------------------------===//
671 // Custom DAG Lowering Operations
672 //===----------------------------------------------------------------------===//
673
674 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
675 switch (Op.getOpcode()) {
676 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
677 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
678 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
679 case ISD::LOAD: {
680 SDValue Result = LowerLOAD(Op, DAG);
681 assert((!Result.getNode() ||
682 Result.getNode()->getNumValues() == 2) &&
683 "Load should return a value and a chain");
684 return Result;
685 }
686
687 case ISD::FSIN:
688 case ISD::FCOS:
689 return LowerTrig(Op, DAG);
690 case ISD::SELECT: return LowerSELECT(Op, DAG);
691 case ISD::FDIV: return LowerFDIV(Op, DAG);
692 case ISD::STORE: return LowerSTORE(Op, DAG);
693 case ISD::GlobalAddress: {
694 MachineFunction &MF = DAG.getMachineFunction();
695 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
696 return LowerGlobalAddress(MFI, Op, DAG);
697 }
698 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
699 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
700 }
701 return SDValue();
702 }
703
704 /// \brief Helper function for LowerBRCOND
705 static SDNode *findUser(SDValue Value, unsigned Opcode) {
706
707 SDNode *Parent = Value.getNode();
708 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
709 I != E; ++I) {
710
711 if (I.getUse().get() != Value)
712 continue;
713
714 if (I->getOpcode() == Opcode)
715 return *I;
716 }
717 return nullptr;
718 }
719
720 SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
721
722 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
723 unsigned FrameIndex = FINode->getIndex();
724
725 return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
726 }
727
728 /// This transforms the control flow intrinsics to get the branch destination as
729 /// the last parameter, and also switches the branch target with BR if needed.
730 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
731 SelectionDAG &DAG) const {
732
733 SDLoc DL(BRCOND);
734
735 SDNode *Intr = BRCOND.getOperand(1).getNode();
736 SDValue Target = BRCOND.getOperand(2);
737 SDNode *BR = nullptr;
738
739 if (Intr->getOpcode() == ISD::SETCC) {
740 // As long as we negate the condition everything is fine
741 SDNode *SetCC = Intr;
742 assert(SetCC->getConstantOperandVal(1) == 1);
743 assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
744 ISD::SETNE);
745 Intr = SetCC->getOperand(0).getNode();
746
747 } else {
748 // Get the target from BR if we don't negate the condition
749 BR = findUser(BRCOND, ISD::BR);
750 Target = BR->getOperand(1);
751 }
752
753 assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
754
755 // Build the result and
756 SmallVector<EVT, 4> Res;
757 for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
758 Res.push_back(Intr->getValueType(i));
759
760 // operands of the new intrinsic call
761 SmallVector<SDValue, 4> Ops;
762 Ops.push_back(BRCOND.getOperand(0));
763 for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
764 Ops.push_back(Intr->getOperand(i));
765 Ops.push_back(Target);
766
767 // build the new intrinsic call
768 SDNode *Result = DAG.getNode(
769 Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
770 DAG.getVTList(Res), Ops).getNode();
771
772 if (BR) {
773 // Give the branch instruction our target
774 SDValue Ops[] = {
775 BR->getOperand(0),
776 BRCOND.getOperand(2)
777 };
778 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
779 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
780 BR = NewBR.getNode();
781 }
782
783 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
784
785 // Copy the intrinsic results to registers
786 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
787 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
788 if (!CopyToReg)
789 continue;
790
791 Chain = DAG.getCopyToReg(
792 Chain, DL,
793 CopyToReg->getOperand(1),
794 SDValue(Result, i - 1),
795 SDValue());
796
797 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
798 }
799
800 // Remove the old intrinsic from the chain
801 DAG.ReplaceAllUsesOfValueWith(
802 SDValue(Intr, Intr->getNumValues() - 1),
803 Intr->getOperand(0));
804
805 return Chain;
806 }
807
808 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
809 SDValue Op,
810 SelectionDAG &DAG) const {
811 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
812
813 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
814 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
815
816 SDLoc DL(GSD);
817 const GlobalValue *GV = GSD->getGlobal();
818 MVT PtrVT = getPointerTy(GSD->getAddressSpace());
819
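// Constant-address globals are addressed relative to the constant data
// pointer: split the 64-bit base into dwords, add the 32-bit global address
// with an ADDC/ADDE carry chain, and repack the result as an i64.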
820 SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
821 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
822
823 SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
824 DAG.getConstant(0, MVT::i32));
825 SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
826 DAG.getConstant(1, MVT::i32));
827
828 SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
829 PtrLo, GA);
830 SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
831 PtrHi, DAG.getConstant(0, MVT::i32),
832 SDValue(Lo.getNode(), 1));
833 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
834 }
835
836 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
837 SelectionDAG &DAG) const {
838 MachineFunction &MF = DAG.getMachineFunction();
839 const SIRegisterInfo *TRI =
840 static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo());
841
842 EVT VT = Op.getValueType();
843 SDLoc DL(Op);
844 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
845
846 switch (IntrinsicID) {
847 case Intrinsic::r600_read_ngroups_x:
848 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
849 SI::KernelInputOffsets::NGROUPS_X, false);
850 case Intrinsic::r600_read_ngroups_y:
851 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
852 SI::KernelInputOffsets::NGROUPS_Y, false);
853 case Intrinsic::r600_read_ngroups_z:
854 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
855 SI::KernelInputOffsets::NGROUPS_Z, false);
856 case Intrinsic::r600_read_global_size_x:
857 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
858 SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
859 case Intrinsic::r600_read_global_size_y:
860 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
861 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
862 case Intrinsic::r600_read_global_size_z:
863 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
864 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
865 case Intrinsic::r600_read_local_size_x:
866 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
867 SI::KernelInputOffsets::LOCAL_SIZE_X, false);
868 case Intrinsic::r600_read_local_size_y:
869 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
870 SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
871 case Intrinsic::r600_read_local_size_z:
872 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
873 SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
874
875 case Intrinsic::AMDGPU_read_workdim:
876 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
877 MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
878 false);
879
880 case Intrinsic::r600_read_tgid_x:
881 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
882 TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
883 case Intrinsic::r600_read_tgid_y:
884 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
885 TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
886 case Intrinsic::r600_read_tgid_z:
887 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
888 TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
889 case Intrinsic::r600_read_tidig_x:
890 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
891 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
892 case Intrinsic::r600_read_tidig_y:
893 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
894 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
895 case Intrinsic::r600_read_tidig_z:
896 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
897 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
898 case AMDGPUIntrinsic::SI_load_const: {
899 SDValue Ops[] = {
900 Op.getOperand(1),
901 Op.getOperand(2)
902 };
903
904 MachineMemOperand *MMO = MF.getMachineMemOperand(
905 MachinePointerInfo(),
906 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
907 VT.getStoreSize(), 4);
908 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
909 Op->getVTList(), Ops, VT, MMO);
910 }
911 case AMDGPUIntrinsic::SI_sample:
912 return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
913 case AMDGPUIntrinsic::SI_sampleb:
914 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
915 case AMDGPUIntrinsic::SI_sampled:
916 return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
917 case AMDGPUIntrinsic::SI_samplel:
918 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
919 case AMDGPUIntrinsic::SI_vs_load_input:
920 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
921 Op.getOperand(1),
922 Op.getOperand(2),
923 Op.getOperand(3));
924
925 case AMDGPUIntrinsic::AMDGPU_fract:
926 case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
927 return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1),
928 DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1)));
929
930 default:
931 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
932 }
933 }
934
935 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
936 SelectionDAG &DAG) const {
937 MachineFunction &MF = DAG.getMachineFunction();
938 SDValue Chain = Op.getOperand(0);
939 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
940
941 switch (IntrinsicID) {
942 case AMDGPUIntrinsic::SI_tbuffer_store: {
943 SDLoc DL(Op);
944 SDValue Ops[] = {
945 Chain,
946 Op.getOperand(2),
947 Op.getOperand(3),
948 Op.getOperand(4),
949 Op.getOperand(5),
950 Op.getOperand(6),
951 Op.getOperand(7),
952 Op.getOperand(8),
953 Op.getOperand(9),
954 Op.getOperand(10),
955 Op.getOperand(11),
956 Op.getOperand(12),
957 Op.getOperand(13),
958 Op.getOperand(14)
959 };
960
961 EVT VT = Op.getOperand(3).getValueType();
962
963 MachineMemOperand *MMO = MF.getMachineMemOperand(
964 MachinePointerInfo(),
965 MachineMemOperand::MOStore,
966 VT.getStoreSize(), 4);
967 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
968 Op->getVTList(), Ops, VT, MMO);
969 }
970 default:
971 return SDValue();
972 }
973 }
974
975 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
976 SDLoc DL(Op);
977 LoadSDNode *Load = cast<LoadSDNode>(Op);
978
979 if (Op.getValueType().isVector()) {
980 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
981 "Custom lowering for non-i32 vectors hasn't been implemented.");
982 unsigned NumElements = Op.getValueType().getVectorNumElements();
983 assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
984 switch (Load->getAddressSpace()) {
985 default: break;
986 case AMDGPUAS::GLOBAL_ADDRESS:
987 case AMDGPUAS::PRIVATE_ADDRESS:
988 // v4 loads are supported for private and global memory.
989 if (NumElements <= 4)
990 break;
991 // fall-through
992 case AMDGPUAS::LOCAL_ADDRESS:
993 return ScalarizeVectorLoad(Op, DAG);
994 }
995 }
996
997 return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
998 }
999
1000 SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
1001 const SDValue &Op,
1002 SelectionDAG &DAG) const {
1003 return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
1004 Op.getOperand(2),
1005 Op.getOperand(3),
1006 Op.getOperand(4));
1007 }
1008
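// Lower an i64 select by bitcasting the operands to v2i32, selecting the low
// and high halves with two i32 selects, and reassembling the i64 result.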
1009 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
1010 if (Op.getValueType() != MVT::i64)
1011 return SDValue();
1012
1013 SDLoc DL(Op);
1014 SDValue Cond = Op.getOperand(0);
1015
1016 SDValue Zero = DAG.getConstant(0, MVT::i32);
1017 SDValue One = DAG.getConstant(1, MVT::i32);
1018
1019 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
1020 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
1021
1022 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
1023 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
1024
1025 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
1026
1027 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
1028 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
1029
1030 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
1031
1032 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
1033 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
1034 }
1035
1036 // Catch division cases where we can use shortcuts with rcp and rsq
1037 // instructions.
1038 SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
1039 SDLoc SL(Op);
1040 SDValue LHS = Op.getOperand(0);
1041 SDValue RHS = Op.getOperand(1);
1042 EVT VT = Op.getValueType();
1043 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
1044
1045 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
1046 if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
1047 CLHS->isExactlyValue(1.0)) {
1048 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
1049 // the CI documentation have a worst case error of 1 ulp.
1050 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
1051 // use it as long as we aren't trying to use denormals.
1052
1053 // 1.0 / sqrt(x) -> rsq(x)
1054 //
1055 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
1056 // error seems really high at 2^29 ULP.
1057 if (RHS.getOpcode() == ISD::FSQRT)
1058 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
1059
1060 // 1.0 / x -> rcp(x)
1061 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
1062 }
1063 }
1064
1065 if (Unsafe) {
1066 // Turn into multiply by the reciprocal.
1067 // x / y -> x * (1.0 / y)
1068 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
1069 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
1070 }
1071
1072 return SDValue();
1073 }
1074
1075 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
1076 SDValue FastLowered = LowerFastFDIV(Op, DAG);
1077 if (FastLowered.getNode())
1078 return FastLowered;
1079
1080 // This uses v_rcp_f32 which does not handle denormals. Let this hit a
1081 // selection error for now rather than do something incorrect.
1082 if (Subtarget->hasFP32Denormals())
1083 return SDValue();
1084
1085 SDLoc SL(Op);
1086 SDValue LHS = Op.getOperand(0);
1087 SDValue RHS = Op.getOperand(1);
1088
1089 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
1090
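// K0 is 2^96 and K1 is 2^-32 as float bit patterns. Denominators larger than
// 2^96 are pre-scaled by 2^-32 before the reciprocal and the quotient is
// scaled back by the same factor, keeping v_rcp_f32 away from results that
// would land in the denormal range.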
1091 const APFloat K0Val(BitsToFloat(0x6f800000));
1092 const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32);
1093
1094 const APFloat K1Val(BitsToFloat(0x2f800000));
1095 const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);
1096
1097 const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
1098
1099 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
1100
1101 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
1102
1103 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
1104
1105 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
1106
1107 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
1108
1109 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
1110
1111 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
1112 }
1113
1114 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
1115 if (DAG.getTarget().Options.UnsafeFPMath)
1116 return LowerFastFDIV(Op, DAG);
1117
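// Full-precision path: v_div_scale scales the operands, the v_rcp_f64
// estimate is refined with FMA-based Newton-Raphson steps, and
// v_div_fmas / v_div_fixup produce the final quotient.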
1118 SDLoc SL(Op);
1119 SDValue X = Op.getOperand(0);
1120 SDValue Y = Op.getOperand(1);
1121
1122 const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
1123
1124 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
1125
1126 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
1127
1128 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
1129
1130 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
1131
1132 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
1133
1134 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
1135
1136 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
1137
1138 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
1139
1140 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
1141 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
1142
1143 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
1144 NegDivScale0, Mul, DivScale1);
1145
1146 SDValue Scale;
1147
1148 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1149 // Workaround a hardware bug on SI where the condition output from div_scale
1150 // is not usable.
1151
1152 const SDValue Hi = DAG.getConstant(1, MVT::i32);
1153
1154 // Figure out which scale to use for div_fmas.
1155 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
1156 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
1157 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
1158 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
1159
1160 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
1161 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
1162
1163 SDValue Scale0Hi
1164 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
1165 SDValue Scale1Hi
1166 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
1167
1168 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
1169 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
1170 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
1171 } else {
1172 Scale = DivScale1.getValue(1);
1173 }
1174
1175 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
1176 Fma4, Fma3, Mul, Scale);
1177
1178 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
1179 }
1180
1181 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
1182 EVT VT = Op.getValueType();
1183
1184 if (VT == MVT::f32)
1185 return LowerFDIV32(Op, DAG);
1186
1187 if (VT == MVT::f64)
1188 return LowerFDIV64(Op, DAG);
1189
1190 llvm_unreachable("Unexpected type for fdiv");
1191 }
1192
1193 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1194 SDLoc DL(Op);
1195 StoreSDNode *Store = cast<StoreSDNode>(Op);
1196 EVT VT = Store->getMemoryVT();
1197
1198 // These stores are legal.
1199 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1200 if (VT.isVector() && VT.getVectorNumElements() > 4)
1201 return ScalarizeVectorStore(Op, DAG);
1202 return SDValue();
1203 }
1204
1205 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1206 if (Ret.getNode())
1207 return Ret;
1208
1209 if (VT.isVector() && VT.getVectorNumElements() >= 8)
1210 return ScalarizeVectorStore(Op, DAG);
1211
1212 if (VT == MVT::i1)
1213 return DAG.getTruncStore(Store->getChain(), DL,
1214 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
1215 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
1216
1217 return SDValue();
1218 }
1219
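// The hardware sin/cos instructions take their argument as a fraction of a
// full revolution, so scale by 1/(2*pi) and take the fractional part before
// emitting SIN_HW / COS_HW.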
1220 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
1221 EVT VT = Op.getValueType();
1222 SDValue Arg = Op.getOperand(0);
1223 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
1224 DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
1225 DAG.getConstantFP(0.5 / M_PI, VT)));
1226
1227 switch (Op.getOpcode()) {
1228 case ISD::FCOS:
1229 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
1230 case ISD::FSIN:
1231 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
1232 default:
1233 llvm_unreachable("Wrong trig opcode");
1234 }
1235 }
1236
1237 //===----------------------------------------------------------------------===//
1238 // Custom DAG optimizations
1239 //===----------------------------------------------------------------------===//
1240
1241 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
1242 DAGCombinerInfo &DCI) const {
1243 EVT VT = N->getValueType(0);
1244 EVT ScalarVT = VT.getScalarType();
1245 if (ScalarVT != MVT::f32)
1246 return SDValue();
1247
1248 SelectionDAG &DAG = DCI.DAG;
1249 SDLoc DL(N);
1250
1251 SDValue Src = N->getOperand(0);
1252 EVT SrcVT = Src.getValueType();
1253
1254 // TODO: We could try to match extracting the higher bytes, which would be
1255 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
1256 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
1257 // about in practice.
1258 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
1259 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
1260 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
1261 DCI.AddToWorklist(Cvt.getNode());
1262 return Cvt;
1263 }
1264 }
1265
1266 // We are primarily trying to catch operations on illegal vector types
1267 // before they are expanded.
1268 // For scalars, we can use the more flexible method of checking masked bits
1269 // after legalization.
1270 if (!DCI.isBeforeLegalize() ||
1271 !SrcVT.isVector() ||
1272 SrcVT.getVectorElementType() != MVT::i8) {
1273 return SDValue();
1274 }
1275
1276 assert(DCI.isBeforeLegalize() && "Unexpected legal type");
1277
1278 // Weird sized vectors are a pain to handle, but we know 3 is really the same
1279 // size as 4.
1280 unsigned NElts = SrcVT.getVectorNumElements();
1281 if (!SrcVT.isSimple() && NElts != 3)
1282 return SDValue();
1283
1284 // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
1285 // prevent a mess from expanding to v4i32 and repacking.
1286 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
1287 EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
1288 EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
1289 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
1290 LoadSDNode *Load = cast<LoadSDNode>(Src);
1291
1292 unsigned AS = Load->getAddressSpace();
1293 unsigned Align = Load->getAlignment();
1294 Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
1295 unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
1296
1297 // Don't try to replace the load if we have to expand it due to alignment
1298 // problems. Otherwise we will end up scalarizing the load, and trying to
1299 // repack into the vector for no real reason.
1300 if (Align < ABIAlignment &&
1301 !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
1302 return SDValue();
1303 }
1304
1305 SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
1306 Load->getChain(),
1307 Load->getBasePtr(),
1308 LoadVT,
1309 Load->getMemOperand());
1310
1311 // Make sure successors of the original load stay after it by updating
1312 // them to use the new Chain.
1313 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
1314
1315 SmallVector<SDValue, 4> Elts;
1316 if (RegVT.isVector())
1317 DAG.ExtractVectorElements(NewLoad, Elts);
1318 else
1319 Elts.push_back(NewLoad);
1320
1321 SmallVector<SDValue, 4> Ops;
1322
1323 unsigned EltIdx = 0;
1324 for (SDValue Elt : Elts) {
1325 unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
1326 for (unsigned I = 0; I < ComponentsInElt; ++I) {
1327 unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
1328 SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
1329 DCI.AddToWorklist(Cvt.getNode());
1330 Ops.push_back(Cvt);
1331 }
1332
1333 ++EltIdx;
1334 }
1335
1336 assert(Ops.size() == NElts);
1337
1338 return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
1339 }
1340
1341 return SDValue();
1342 }
1343
1344 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
1345
1346 // This is a variant of
1347 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
1348 //
1349 // The normal DAG combiner will do this, but only if the add has one use since
1350 // that would increase the number of instructions.
1351 //
1352 // This prevents us from seeing a constant offset that can be folded into a
1353 // memory instruction's addressing mode. If we know the resulting add offset of
1354 // a pointer can be folded into an addressing offset, we can replace the pointer
1355 // operand with the add of new constant offset. This eliminates one of the uses,
1356 // and may allow the remaining use to also be simplified.
1357 //
1358 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
1359 unsigned AddrSpace,
1360 DAGCombinerInfo &DCI) const {
1361 SDValue N0 = N->getOperand(0);
1362 SDValue N1 = N->getOperand(1);
1363
1364 if (N0.getOpcode() != ISD::ADD)
1365 return SDValue();
1366
1367 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
1368 if (!CN1)
1369 return SDValue();
1370
1371 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
1372 if (!CAdd)
1373 return SDValue();
1374
1375 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
1376 getTargetMachine().getSubtargetImpl()->getInstrInfo());
1377
1378 // If the resulting offset is too large, we can't fold it into the addressing
1379 // mode offset.
1380 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
1381 if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace))
1382 return SDValue();
1383
1384 SelectionDAG &DAG = DCI.DAG;
1385 SDLoc SL(N);
1386 EVT VT = N->getValueType(0);
1387
1388 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
1389 SDValue COffset = DAG.getConstant(Offset, MVT::i32);
1390
1391 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
1392 }
1393
1394 SDValue SITargetLowering::performAndCombine(SDNode *N,
1395 DAGCombinerInfo &DCI) const {
1396 if (DCI.isBeforeLegalize())
1397 return SDValue();
1398
1399 SelectionDAG &DAG = DCI.DAG;
1400
1401 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
1402 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
1403 SDValue LHS = N->getOperand(0);
1404 SDValue RHS = N->getOperand(1);
1405
1406 if (LHS.getOpcode() == ISD::SETCC &&
1407 RHS.getOpcode() == ISD::SETCC) {
1408 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
1409 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
1410
1411 SDValue X = LHS.getOperand(0);
1412 SDValue Y = RHS.getOperand(0);
1413 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
1414 return SDValue();
1415
1416 if (LCC == ISD::SETO) {
1417 if (X != LHS.getOperand(1))
1418 return SDValue();
1419
1420 if (RCC == ISD::SETUNE) {
1421 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
1422 if (!C1 || !C1->isInfinity() || C1->isNegative())
1423 return SDValue();
1424
1425 const uint32_t Mask = SIInstrFlags::N_NORMAL |
1426 SIInstrFlags::N_SUBNORMAL |
1427 SIInstrFlags::N_ZERO |
1428 SIInstrFlags::P_ZERO |
1429 SIInstrFlags::P_SUBNORMAL |
1430 SIInstrFlags::P_NORMAL;
1431
1432 static_assert(((~(SIInstrFlags::S_NAN |
1433 SIInstrFlags::Q_NAN |
1434 SIInstrFlags::N_INFINITY |
1435 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
1436 "mask not equal");
1437
1438 return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
1439 X, DAG.getConstant(Mask, MVT::i32));
1440 }
1441 }
1442 }
1443
1444 return SDValue();
1445 }
1446
1447 SDValue SITargetLowering::performOrCombine(SDNode *N,
1448 DAGCombinerInfo &DCI) const {
1449 SelectionDAG &DAG = DCI.DAG;
1450 SDValue LHS = N->getOperand(0);
1451 SDValue RHS = N->getOperand(1);
1452
1453 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
1454 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
1455 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
1456 SDValue Src = LHS.getOperand(0);
1457 if (Src != RHS.getOperand(0))
1458 return SDValue();
1459
1460 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1461 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
1462 if (!CLHS || !CRHS)
1463 return SDValue();
1464
1465 // Only 10 bits are used.
1466 static const uint32_t MaxMask = 0x3ff;
1467
1468 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
1469 return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
1470 Src, DAG.getConstant(NewMask, MVT::i32));
1471 }
1472
1473 return SDValue();
1474 }
1475
1476 SDValue SITargetLowering::performClassCombine(SDNode *N,
1477 DAGCombinerInfo &DCI) const {
1478 SelectionDAG &DAG = DCI.DAG;
1479 SDValue Mask = N->getOperand(1);
1480
1481 // fp_class x, 0 -> false
1482 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
1483 if (CMask->isNullValue())
1484 return DAG.getConstant(0, MVT::i1);
1485 }
1486
1487 return SDValue();
1488 }
1489
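// Map a binary min/max opcode to its three-operand min3/max3 counterpart.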
1490 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
1491 switch (Opc) {
1492 case ISD::FMAXNUM:
1493 return AMDGPUISD::FMAX3;
1494 case AMDGPUISD::SMAX:
1495 return AMDGPUISD::SMAX3;
1496 case AMDGPUISD::UMAX:
1497 return AMDGPUISD::UMAX3;
1498 case ISD::FMINNUM:
1499 return AMDGPUISD::FMIN3;
1500 case AMDGPUISD::SMIN:
1501 return AMDGPUISD::SMIN3;
1502 case AMDGPUISD::UMIN:
1503 return AMDGPUISD::UMIN3;
1504 default:
1505 llvm_unreachable("Not a min/max opcode");
1506 }
1507 }
1508
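// Fold min/max(min/max(a, b), c) into a single three-operand min3/max3 node
// when the inner operation has no other users.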
1509 SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
1510 DAGCombinerInfo &DCI) const {
1511 SelectionDAG &DAG = DCI.DAG;
1512
1513 unsigned Opc = N->getOpcode();
1514 SDValue Op0 = N->getOperand(0);
1515 SDValue Op1 = N->getOperand(1);
1516
1517 // Only do this if the inner op has one use since this will just increase
1518 // register pressure for no benefit.
1519
1520 // max(max(a, b), c)
1521 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
1522 SDLoc DL(N);
1523 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
1524 DL,
1525 N->getValueType(0),
1526 Op0.getOperand(0),
1527 Op0.getOperand(1),
1528 Op1);
1529 }
1530
1531 // max(a, max(b, c))
1532 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
1533 SDLoc DL(N);
1534 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
1535 DL,
1536 N->getValueType(0),
1537 Op0,
1538 Op1.getOperand(0),
1539 Op1.getOperand(1));
1540 }
1541
1542 return SDValue();
1543 }
1544
1545 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
1546 DAGCombinerInfo &DCI) const {
1547 SelectionDAG &DAG = DCI.DAG;
1548 SDLoc SL(N);
1549
1550 SDValue LHS = N->getOperand(0);
1551 SDValue RHS = N->getOperand(1);
1552 EVT VT = LHS.getValueType();
1553
1554 if (VT != MVT::f32 && VT != MVT::f64)
1555 return SDValue();
1556
1557 // Match isinf pattern
1558 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
1559 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
1560 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
1561 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1562 if (!CRHS)
1563 return SDValue();
1564
1565 const APFloat &APF = CRHS->getValueAPF();
1566 if (APF.isInfinity() && !APF.isNegative()) {
1567 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
1568 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
1569 LHS.getOperand(0), DAG.getConstant(Mask, MVT::i32));
1570 }
1571 }
1572
1573 return SDValue();
1574 }
1575
1576 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
1577 DAGCombinerInfo &DCI) const {
1578 SelectionDAG &DAG = DCI.DAG;
1579 SDLoc DL(N);
1580
1581 switch (N->getOpcode()) {
1582 default:
1583 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1584 case ISD::SETCC:
1585 return performSetCCCombine(N, DCI);
1586 case ISD::FMAXNUM: // TODO: What about fmax_legacy?
1587 case ISD::FMINNUM:
1588 case AMDGPUISD::SMAX:
1589 case AMDGPUISD::SMIN:
1590 case AMDGPUISD::UMAX:
1591 case AMDGPUISD::UMIN: {
1592 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
1593 N->getValueType(0) != MVT::f64 &&
1594 getTargetMachine().getOptLevel() > CodeGenOpt::None)
1595 return performMin3Max3Combine(N, DCI);
1596 break;
1597 }
1598
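// Each CVT_F32_UBYTEn reads only byte n of its source, so only those eight
// bits are demanded; try to simplify the source operand accordingly.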
1599 case AMDGPUISD::CVT_F32_UBYTE0:
1600 case AMDGPUISD::CVT_F32_UBYTE1:
1601 case AMDGPUISD::CVT_F32_UBYTE2:
1602 case AMDGPUISD::CVT_F32_UBYTE3: {
1603 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
1604
1605 SDValue Src = N->getOperand(0);
1606 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
1607
1608 APInt KnownZero, KnownOne;
1609 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
1610 !DCI.isBeforeLegalizeOps());
1611 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
1612 if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
1613 TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
1614 DCI.CommitTargetLoweringOpt(TLO);
1615 }
1616
1617 break;
1618 }
1619
1620 case ISD::UINT_TO_FP:
1621 return performUCharToFloatCombine(N, DCI);
1622
1623 case ISD::FADD: {
1624 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
1625 break;
1626
1627 EVT VT = N->getValueType(0);
1628 if (VT != MVT::f32)
1629 break;
1630
1631 SDValue LHS = N->getOperand(0);
1632 SDValue RHS = N->getOperand(1);
1633
1634 // These should really be instruction patterns, but writing patterns with
1635 // source modifiers is a pain.
1636
1637 // fadd (fadd (a, a), b) -> mad 2.0, a, b
1638 if (LHS.getOpcode() == ISD::FADD) {
1639 SDValue A = LHS.getOperand(0);
1640 if (A == LHS.getOperand(1)) {
1641 const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
1642 return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS);
1643 }
1644 }
1645
1646 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
1647 if (RHS.getOpcode() == ISD::FADD) {
1648 SDValue A = RHS.getOperand(0);
1649 if (A == RHS.getOperand(1)) {
1650 const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
1651 return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS);
1652 }
1653 }
1654
1655 break;
1656 }
1657 case ISD::FSUB: {
1658 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
1659 break;
1660
1661 EVT VT = N->getValueType(0);
1662
1663 // Try to get the fneg to fold into the source modifier. This undoes generic
1664 // DAG combines and folds them into the mad.
1665 if (VT == MVT::f32) {
1666 SDValue LHS = N->getOperand(0);
1667 SDValue RHS = N->getOperand(1);
1668
1669 if (LHS.getOpcode() == ISD::FMUL) {
1670 // (fsub (fmul a, b), c) -> mad a, b, (fneg c)
1671
1672 SDValue A = LHS.getOperand(0);
1673 SDValue B = LHS.getOperand(1);
1674 SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS);
1675
1676 return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
1677 }
1678
1679 if (RHS.getOpcode() == ISD::FMUL) {
1680 // (fsub c, (fmul a, b)) -> mad (fneg a), b, c
1681
1682 SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0));
1683 SDValue B = RHS.getOperand(1);
1684 SDValue C = LHS;
1685
1686 return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
1687 }
1688
1689 if (LHS.getOpcode() == ISD::FADD) {
1690 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
1691
1692 SDValue A = LHS.getOperand(0);
1693 if (A == LHS.getOperand(1)) {
1694 const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
1695 SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);
1696
1697 return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS);
1698 }
1699 }
1700
1701 if (RHS.getOpcode() == ISD::FADD) {
1702 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
1703
1704 SDValue A = RHS.getOperand(0);
1705 if (A == RHS.getOperand(1)) {
1706 const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32);
1707 return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS);
1708 }
1709 }
1710 }
1711
1712 break;
1713 }
1715 case ISD::LOAD:
1716 case ISD::STORE:
1717 case ISD::ATOMIC_LOAD:
1718 case ISD::ATOMIC_STORE:
1719 case ISD::ATOMIC_CMP_SWAP:
1720 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
1721 case ISD::ATOMIC_SWAP:
1722 case ISD::ATOMIC_LOAD_ADD:
1723 case ISD::ATOMIC_LOAD_SUB:
1724 case ISD::ATOMIC_LOAD_AND:
1725 case ISD::ATOMIC_LOAD_OR:
1726 case ISD::ATOMIC_LOAD_XOR:
1727 case ISD::ATOMIC_LOAD_NAND:
1728 case ISD::ATOMIC_LOAD_MIN:
1729 case ISD::ATOMIC_LOAD_MAX:
1730 case ISD::ATOMIC_LOAD_UMIN:
1731 case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
1732 if (DCI.isBeforeLegalize())
1733 break;
1734
1735 MemSDNode *MemNode = cast<MemSDNode>(N);
1736 SDValue Ptr = MemNode->getBasePtr();
1737
1738 // TODO: We could also do this for multiplies.
1739 unsigned AS = MemNode->getAddressSpace();
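// If the base pointer is a shift, performSHLPtrCombine may be able to move
// the constant part of the address into a form that folds into the
// instruction's immediate offset.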
1740 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
1741 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
1742 if (NewPtr) {
1743 SmallVector<SDValue, 8> NewOps;
1744 for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I)
1745 NewOps.push_back(MemNode->getOperand(I));
1746
1747 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
1748 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
1749 }
1750 }
1751 break;
1752 }
1753 case ISD::AND:
1754 return performAndCombine(N, DCI);
1755 case ISD::OR:
1756 return performOrCombine(N, DCI);
1757 case AMDGPUISD::FP_CLASS:
1758 return performClassCombine(N, DCI);
1759 }
1760 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1761 }
1762
1763 /// \brief Test if RegClass is one of the VSrc classes
1764 static bool isVSrc(unsigned RegClass) {
1765 switch(RegClass) {
1766 default: return false;
1767 case AMDGPU::VS_32RegClassID:
1768 case AMDGPU::VS_64RegClassID:
1769 return true;
1770 }
1771 }
1772
1773 /// \brief Analyze the possible immediate value \p N
1774 ///
1775 /// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
1776 /// and the immediate value if it's a literal immediate
1777 int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
1778
1779 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
1780 getTargetMachine().getSubtargetImpl()->getInstrInfo());
1781
1782 if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
1783 if (Node->getZExtValue() >> 32)
1784 return -1;
1785
1786 if (TII->isInlineConstant(Node->getAPIntValue()))
1787 return 0;
1788
1789 return Node->getZExtValue();
1790 }
1791
1792 if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
1793 if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
1794 return 0;
1795
1796 if (Node->getValueType(0) == MVT::f32)
1797 return FloatToBits(Node->getValueAPF().convertToFloat());
1798
1799 return -1;
1800 }
1801
1802 return -1;
1803 }
1804
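/// \brief Determine the register class producing the value \p Op, looking
/// through CopyFromReg, COPY_TO_REGCLASS, EXTRACT_SUBREG and REG_SEQUENCE
/// when the instruction description does not specify one.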
1805 const TargetRegisterClass *SITargetLowering::getRegClassForNode(
1806 SelectionDAG &DAG, const SDValue &Op) const {
1807 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
1808 getTargetMachine().getSubtargetImpl()->getInstrInfo());
1809 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1810
1811 if (!Op->isMachineOpcode()) {
1812 switch(Op->getOpcode()) {
1813 case ISD::CopyFromReg: {
1814 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1815 unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
1816 if (TargetRegisterInfo::isVirtualRegister(Reg)) {
1817 return MRI.getRegClass(Reg);
1818 }
1819 return TRI.getPhysRegClass(Reg);
1820 }
1821 default: return nullptr;
1822 }
1823 }
1824 const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
1825 int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
1826 if (OpClassID != -1) {
1827 return TRI.getRegClass(OpClassID);
1828 }
1829 switch(Op.getMachineOpcode()) {
1830 case AMDGPU::COPY_TO_REGCLASS:
1831 // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
1832 OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
1833
1834 // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
1835 // class, then the register class for the value could be either a
1836 // VReg or an SReg. To get a more accurate answer, look at the source of the copy.
1837 if (isVSrc(OpClassID))
1838 return getRegClassForNode(DAG, Op.getOperand(0));
1839
1840 return TRI.getRegClass(OpClassID);
1841 case AMDGPU::EXTRACT_SUBREG: {
1842 int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1843 const TargetRegisterClass *SuperClass =
1844 getRegClassForNode(DAG, Op.getOperand(0));
1845 return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
1846 }
1847 case AMDGPU::REG_SEQUENCE:
1848 // Operand 0 is the register class id for REG_SEQUENCE instructions.
1849 return TRI.getRegClass(
1850 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
1851 default:
1852 return getRegClassFor(Op.getSimpleValueType());
1853 }
1854 }
1855
1856 /// \brief Does "Op" fit into register class "RegClass" ?
1857 bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
1858 unsigned RegClass) const {
1859 const TargetRegisterInfo *TRI =
1860 getTargetMachine().getSubtargetImpl()->getRegisterInfo();
1861 const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
1862 if (!RC) {
1863 return false;
1864 }
1865 return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
1866 }
1867
1868 /// \brief Helper function for adjustWritemask
1869 static unsigned SubIdx2Lane(unsigned Idx) {
1870 switch (Idx) {
1871 default: return 0;
1872 case AMDGPU::sub0: return 0;
1873 case AMDGPU::sub1: return 1;
1874 case AMDGPU::sub2: return 2;
1875 case AMDGPU::sub3: return 3;
1876 }
1877 }
1878
1879 /// \brief Adjust the writemask of MIMG instructions
1880 void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
1881 SelectionDAG &DAG) const {
1882 SDNode *Users[4] = { };
1883 unsigned Lane = 0;
1884 unsigned OldDmask = Node->getConstantOperandVal(0);
1885 unsigned NewDmask = 0;
1886
1887 // Try to figure out the used register components
1888 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
1889 I != E; ++I) {
1890
1891 // Abort if we can't understand the usage
1892 if (!I->isMachineOpcode() ||
1893 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
1894 return;
1895
1896 // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
1897 // Note that subregs are packed, i.e. Lane==0 is the first bit set
1898 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
1899 // set, etc.
1900 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
1901
1902 // Set which texture component corresponds to the lane.
1903 unsigned Comp;
1904 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
1905 assert(Dmask);
1906 Comp = countTrailingZeros(Dmask);
1907 Dmask &= ~(1 << Comp);
1908 }
1909
1910 // Abort if we have more than one user per component
1911 if (Users[Lane])
1912 return;
1913
1914 Users[Lane] = *I;
1915 NewDmask |= 1 << Comp;
1916 }
1917
1918 // Abort if there's no change
1919 if (NewDmask == OldDmask)
1920 return;
1921
1922 // Adjust the writemask in the node
1923 std::vector<SDValue> Ops;
1924 Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
1925 for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
1926 Ops.push_back(Node->getOperand(i));
1927 Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
1928
1929 // If we only got one lane, replace it with a copy
1930 // (if NewDmask has only one bit set...)
1931 if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
1932 SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32);
1933 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
1934 SDLoc(), Users[Lane]->getValueType(0),
1935 SDValue(Node, 0), RC);
1936 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
1937 return;
1938 }
1939
1940 // Update the users of the node with the new indices
1941 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
1942
1943 SDNode *User = Users[i];
1944 if (!User)
1945 continue;
1946
1947 SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
1948 DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
1949
1950 switch (Idx) {
1951 default: break;
1952 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
1953 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
1954 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
1955 }
1956 }
1957 }
1958
1959 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
1960 /// with frame index operands.
1961 /// LLVM assumes that inputs to these instructions are registers.
1962 void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
1963 SelectionDAG &DAG) const {
1964
1965 SmallVector<SDValue, 8> Ops;
1966 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
1967 if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
1968 Ops.push_back(Node->getOperand(i));
1969 continue;
1970 }
1971
1972 SDLoc DL(Node);
1973 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
1974 Node->getOperand(i).getValueType(),
1975 Node->getOperand(i)), 0));
1976 }
1977
1978 DAG.UpdateNodeOperands(Node, Ops);
1979 }
1980
1981 /// \brief Fold the instructions after selecting them.
1982 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
1983 SelectionDAG &DAG) const {
1984 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
1985 getTargetMachine().getSubtargetImpl()->getInstrInfo());
1986 Node = AdjustRegClass(Node, DAG);
1987
1988 if (TII->isMIMG(Node->getMachineOpcode()))
1989 adjustWritemask(Node, DAG);
1990
1991 if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
1992 Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
1993 legalizeTargetIndependentNode(Node, DAG);
1994 return Node;
1995 }
1996 return Node;
1997 }
1998
1999 /// \brief Assign the register class depending on the number of
2000 /// bits set in the writemask
2001 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
2002 SDNode *Node) const {
2003 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
2004 getTargetMachine().getSubtargetImpl()->getInstrInfo());
2005
2006 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
2007 TII->legalizeOperands(MI);
2008
2009 if (TII->isMIMG(MI->getOpcode())) {
2010 unsigned VReg = MI->getOperand(0).getReg();
2011 unsigned Writemask = MI->getOperand(1).getImm();
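// Count the channels enabled in the writemask; this determines how many
// consecutive VGPRs the MIMG result needs.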
2012 unsigned BitsSet = 0;
2013 for (unsigned i = 0; i < 4; ++i)
2014 BitsSet += Writemask & (1 << i) ? 1 : 0;
2015
2016 const TargetRegisterClass *RC;
2017 switch (BitsSet) {
2018 default: return;
2019 case 1: RC = &AMDGPU::VGPR_32RegClass; break;
2020 case 2: RC = &AMDGPU::VReg_64RegClass; break;
2021 case 3: RC = &AMDGPU::VReg_96RegClass; break;
2022 }
2023
2024 unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
2025 MI->setDesc(TII->get(NewOpcode));
2026 MRI.setRegClass(VReg, RC);
2027 return;
2028 }
2029
2030 // Replace unused atomics with the no return version.
2031 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
2032 if (NoRetAtomicOp != -1) {
2033 if (!Node->hasAnyUseOfValue(0)) {
2034 MI->setDesc(TII->get(NoRetAtomicOp));
2035 MI->RemoveOperand(0);
2036 }
2037
2038 return;
2039 }
2040 }
2041
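/// \brief Materialize a 32-bit immediate in an SGPR with S_MOV_B32.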
2042 static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
2043 SDValue K = DAG.getTargetConstant(Val, MVT::i32);
2044 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
2045 }
2046
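/// \brief Wrap a 64-bit pointer into a 128-bit buffer resource descriptor:
/// the low two dwords hold the pointer and the high dwords are filled with
/// zero and the upper half of the default resource data format.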
2047 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
2048 SDLoc DL,
2049 SDValue Ptr) const {
2050 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
2051 getTargetMachine().getSubtargetImpl()->getInstrInfo());
2052 #if 1
2053 // XXX - Workaround for moveToVALU not handling different register class
2054 // inserts for REG_SEQUENCE.
2055
2056 // Build the half of the subregister with the constants.
2057 const SDValue Ops0[] = {
2058 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32),
2059 buildSMovImm32(DAG, DL, 0),
2060 DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
2061 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
2062 DAG.getTargetConstant(AMDGPU::sub1, MVT::i32)
2063 };
2064
2065 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
2066 MVT::v2i32, Ops0), 0);
2067
2068 // Combine the constants and the pointer.
2069 const SDValue Ops1[] = {
2070 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
2071 Ptr,
2072 DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
2073 SubRegHi,
2074 DAG.getTargetConstant(AMDGPU::sub2_sub3, MVT::i32)
2075 };
2076
2077 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
2078 #else
2079 const SDValue Ops[] = {
2080 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
2081 Ptr,
2082 DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
2083 buildSMovImm32(DAG, DL, 0),
2084 DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
2085 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
2086 DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
2087 };
2088
2089 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
2090
2091 #endif
2092 }
2093
2094 /// \brief Return a resource descriptor with the 'Add TID' bit enabled
2095 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
2096 /// of the resource descriptor) to create an offset, which is added to the
2097 /// resource pointer.
2098 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
2099 SDLoc DL,
2100 SDValue Ptr,
2101 uint32_t RsrcDword1,
2102 uint64_t RsrcDword2And3) const {
2103 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
2104 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
2105 if (RsrcDword1) {
2106 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
2107 DAG.getConstant(RsrcDword1, MVT::i32)), 0);
2108 }
2109
2110 SDValue DataLo = buildSMovImm32(DAG, DL,
2111 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
2112 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
2113
2114 const SDValue Ops[] = {
2115 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
2116 PtrLo,
2117 DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
2118 PtrHi,
2119 DAG.getTargetConstant(AMDGPU::sub1, MVT::i32),
2120 DataLo,
2121 DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
2122 DataHi,
2123 DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
2124 };
2125
2126 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
2127 }
2128
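/// \brief Build the resource descriptor for scratch (private) accesses: the
/// given base pointer, the TID-enable bit so the thread id (scaled by the
/// stride) contributes to the address, and a maximal size field.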
2129 MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
2130 SDLoc DL,
2131 SDValue Ptr) const {
2132 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
2133 getTargetMachine().getSubtargetImpl()->getInstrInfo());
2134 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
2135 0xffffffff; // Size
2136
2137 return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
2138 }
2139
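/// \brief Rewrite S_LOAD_* nodes whose base pointer cannot live in an SGPR
/// pair into the equivalent BUFFER_LOAD_*_ADDR64 form, using a default
/// resource descriptor built around a zero base pointer.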
2140 MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
2141 SelectionDAG &DAG) const {
2142
2143 SDLoc DL(N);
2144 unsigned NewOpcode = N->getMachineOpcode();
2145
2146 switch (N->getMachineOpcode()) {
2147 default: return N;
2148 case AMDGPU::S_LOAD_DWORD_IMM:
2149 NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
2150 // Fall-through
2151 case AMDGPU::S_LOAD_DWORDX2_SGPR:
2152 if (NewOpcode == N->getMachineOpcode()) {
2153 NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
2154 }
2155 // Fall-through
2156 case AMDGPU::S_LOAD_DWORDX4_IMM:
2157 case AMDGPU::S_LOAD_DWORDX4_SGPR: {
2158 if (NewOpcode == N->getMachineOpcode()) {
2159 NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
2160 }
2161 if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
2162 return N;
2163 }
2164 ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
2165
2166 const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64);
2167 SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0);
2168 MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr);
2169
2170 SmallVector<SDValue, 8> Ops;
2171 Ops.push_back(SDValue(RSrc, 0));
2172 Ops.push_back(N->getOperand(0));
2173
2174 // The immediate offset is in dwords on SI and in bytes on VI.
2175 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2176 Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue(), MVT::i32));
2177 else
2178 Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue() << 2, MVT::i32));
2179
2180 // Copy remaining operands so we keep any chain and glue nodes that follow
2181 // the normal operands.
2182 for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I)
2183 Ops.push_back(N->getOperand(I));
2184
2185 return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
2186 }
2187 }
2188 }
2189
2190 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
2191 const TargetRegisterClass *RC,
2192 unsigned Reg, EVT VT) const {
2193 SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
2194
2195 return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
2196 cast<RegisterSDNode>(VReg)->getReg(), VT);
2197 }
2198