1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/BinaryFormat/ELF.h"
25 #include "llvm/CodeGen/Analysis.h"
26 #include "llvm/CodeGen/FunctionLoweringInfo.h"
27 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
28 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29 #include "llvm/CodeGen/MachineFrameInfo.h"
30 #include "llvm/CodeGen/MachineFunction.h"
31 #include "llvm/CodeGen/MachineLoopInfo.h"
32 #include "llvm/IR/DiagnosticInfo.h"
33 #include "llvm/IR/IRBuilder.h"
34 #include "llvm/IR/IntrinsicInst.h"
35 #include "llvm/IR/IntrinsicsAMDGPU.h"
36 #include "llvm/IR/IntrinsicsR600.h"
37 #include "llvm/Support/CommandLine.h"
38 #include "llvm/Support/ModRef.h"
39 #include "llvm/Support/KnownBits.h"
40
41 using namespace llvm;
42
43 #define DEBUG_TYPE "si-lower"
44
45 STATISTIC(NumTailCalls, "Number of tail calls");
46
47 static cl::opt<bool> DisableLoopAlignment(
48 "amdgpu-disable-loop-alignment",
49 cl::desc("Do not align and prefetch loops"),
50 cl::init(false));
51
52 static cl::opt<bool> UseDivergentRegisterIndexing(
53 "amdgpu-use-divergent-register-indexing",
54 cl::Hidden,
55 cl::desc("Use indirect register addressing for divergent indexes"),
56 cl::init(false));
57
58 static bool hasFP32Denormals(const MachineFunction &MF) {
59 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
60 return Info->getMode().allFP32Denormals();
61 }
62
63 static bool hasFP64FP16Denormals(const MachineFunction &MF) {
64 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
65 return Info->getMode().allFP64FP16Denormals();
66 }
67
68 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
69 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
70 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
71 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
72 return AMDGPU::SGPR0 + Reg;
73 }
74 }
75 llvm_unreachable("Cannot allocate sgpr");
76 }
77
78 SITargetLowering::SITargetLowering(const TargetMachine &TM,
79 const GCNSubtarget &STI)
80 : AMDGPUTargetLowering(TM, STI),
81 Subtarget(&STI) {
82 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
83 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
84
85 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
86 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
87
88 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
89
90 const SIRegisterInfo *TRI = STI.getRegisterInfo();
91 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
92
93 addRegisterClass(MVT::f64, V64RegClass);
94 addRegisterClass(MVT::v2f32, V64RegClass);
95
96 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
97 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
98
99 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
100 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
101
102 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
103 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
104
105 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
106 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
107
108 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
109 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
110
111 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
112 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
113
114 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
115 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
116
117 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
118 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
119
120 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
121 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
122
123 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
124 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
125
126 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
127 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
128
129 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
130 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
131
132 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
133 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
134
135 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
136 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
137
138 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
139 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
140
141 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
142 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
143
144 if (Subtarget->has16BitInsts()) {
145 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
146 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
147
148     // Unless there are also VOP3P operations, not all operations are really legal.
149 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
150 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
151 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
152 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
153 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
154 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
155 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
156 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
157 }
158
159 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
160 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
161
162 computeRegisterProperties(Subtarget->getRegisterInfo());
163
164 // The boolean content concept here is too inflexible. Compares only ever
165 // really produce a 1-bit result. Any copy/extend from these will turn into a
166 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
167 // it's what most targets use.
168 setBooleanContents(ZeroOrOneBooleanContent);
169 setBooleanVectorContents(ZeroOrOneBooleanContent);
170
171 // We need to custom lower vector stores from local memory
172 setOperationAction(ISD::LOAD,
173 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
174 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
175 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
176 MVT::i1, MVT::v32i32},
177 Custom);
178
179 setOperationAction(ISD::STORE,
180 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
181 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
182 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
183 MVT::i1, MVT::v32i32},
184 Custom);
185
186 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
187 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
188 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
189 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
190 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
191 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
192 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
193 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
194 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
195 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
196 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
197 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
198 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
199 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
200 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
201 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
202
203 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
204 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
205 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
206 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
207 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
208 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
209 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
210
211 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
212
213 setOperationAction(ISD::SELECT, MVT::i1, Promote);
214 setOperationAction(ISD::SELECT, MVT::i64, Custom);
215 setOperationAction(ISD::SELECT, MVT::f64, Promote);
216 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
217
218 setOperationAction(ISD::SELECT_CC,
219 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
220
221 setOperationAction(ISD::SETCC, MVT::i1, Promote);
222 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
223 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
224
225 setOperationAction(ISD::TRUNCATE,
226 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
227 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
228 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
229 Expand);
230 setOperationAction(ISD::FP_ROUND,
231 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
232 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
233 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
234 Expand);
235
236 setOperationAction(ISD::SIGN_EXTEND_INREG,
237 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
238 MVT::v3i16, MVT::v4i16, MVT::Other},
239 Custom);
240
241 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
242 setOperationAction(ISD::BR_CC,
243 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
244
245 setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
246
247 setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i32, Legal);
248
249 setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
250 Expand);
251
252 #if 0
253 setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i64, Legal);
254 #endif
255
256 // We only support LOAD/STORE and vector manipulation ops for vectors
257 // with > 4 elements.
258 for (MVT VT :
259 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
260 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
261 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
262 MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
263 MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
264 MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
265 MVT::v32i32, MVT::v32f32}) {
266 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
267 switch (Op) {
268 case ISD::LOAD:
269 case ISD::STORE:
270 case ISD::BUILD_VECTOR:
271 case ISD::BITCAST:
272 case ISD::UNDEF:
273 case ISD::EXTRACT_VECTOR_ELT:
274 case ISD::INSERT_VECTOR_ELT:
275 case ISD::EXTRACT_SUBVECTOR:
276 case ISD::SCALAR_TO_VECTOR:
277 case ISD::IS_FPCLASS:
278 break;
279 case ISD::INSERT_SUBVECTOR:
280 case ISD::CONCAT_VECTORS:
281 setOperationAction(Op, VT, Custom);
282 break;
283 default:
284 setOperationAction(Op, VT, Expand);
285 break;
286 }
287 }
288 }
289
290 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
291
292 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
293 // is expanded to avoid having two separate loops in case the index is a VGPR.
294
295 // Most operations are naturally 32-bit vector operations. We only support
296 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
297 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
298 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
299 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
300
301 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
302 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
303
304 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
305 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
306
307 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
308 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
309 }
310
311 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
312 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
313 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
314
315 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
316 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
317
318 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
319 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
320
321 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
322 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
323 }
324
325 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
326 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
327 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
328
329 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
330 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
331
332 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
333 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
334
335 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
336 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
337 }
338
339 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
340 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
341 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
342
343 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
344 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
345
346 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
347 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
348
349 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
350 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
351 }
352
353 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
354 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
355 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
356
357 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
358 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
359
360 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
361 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
362
363 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
364 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
365 }
366
367 setOperationAction(ISD::VECTOR_SHUFFLE,
368 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
369 Expand);
370
371 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom);
372
373 // Avoid stack access for these.
374 // TODO: Generalize to more vector types.
375 setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
376 {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
377 MVT::v4i16, MVT::v4f16},
378 Custom);
379
380 // Deal with vec3 vector operations when widened to vec4.
381 setOperationAction(ISD::INSERT_SUBVECTOR,
382 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
383
384 // Deal with vec5/6/7 vector operations when widened to vec8.
385 setOperationAction(ISD::INSERT_SUBVECTOR,
386 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
387 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
388 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
389 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
390 Custom);
391
392 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
393 // and output demarshalling
394 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
395
396 // We can't return success/failure, only the old value,
397 // let LLVM add the comparison
398 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
399 Expand);
400
401 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
402
403 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
404
405 // FIXME: This should be narrowed to i32, but that only happens if i64 is
406 // illegal.
407 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
408 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
409
410   // This is s_memtime on SI and s_memrealtime on VI.
411 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
412 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
413
414 if (Subtarget->has16BitInsts()) {
415 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
416 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
417 }
418
419 if (Subtarget->hasMadMacF32Insts())
420 setOperationAction(ISD::FMAD, MVT::f32, Legal);
421
422 if (!Subtarget->hasBFI())
423 // fcopysign can be done in a single instruction with BFI.
424 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
425
426 if (!Subtarget->hasBCNT(32))
427 setOperationAction(ISD::CTPOP, MVT::i32, Expand);
428
429 if (!Subtarget->hasBCNT(64))
430 setOperationAction(ISD::CTPOP, MVT::i64, Expand);
431
432 if (Subtarget->hasFFBH())
433 setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
434
435 if (Subtarget->hasFFBL())
436 setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
437
438 // We only really have 32-bit BFE instructions (and 16-bit on VI).
439 //
440 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
441 // effort to match them now. We want this to be false for i64 cases when the
442 // extraction isn't restricted to the upper or lower half. Ideally we would
443 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
444 // span the midpoint are probably relatively rare, so don't worry about them
445 // for now.
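  // For example, extracting an 8-bit field at bit offset 4 from an i32 value
  // can be matched to a single s_bfe_u32 / v_bfe_u32 rather than a
  // shift-and-mask sequence.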
446 if (Subtarget->hasBFE())
447 setHasExtractBitsInsn(true);
448
449 // Clamp modifier on add/sub
450 if (Subtarget->hasIntClamp())
451 setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
452
453 if (Subtarget->hasAddNoCarry())
454 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
455 Legal);
456
457 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
458 Custom);
459
460 // These are really only legal for ieee_mode functions. We should be avoiding
461 // them for functions that don't have ieee_mode enabled, so just say they are
462 // legal.
463 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
464 {MVT::f32, MVT::f64}, Legal);
465
466 if (Subtarget->haveRoundOpsF64())
467 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal);
468 else
469 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR},
470 MVT::f64, Custom);
471
472 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
473
474 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
475 setOperationAction(ISD::FDIV, MVT::f64, Custom);
476
477 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
478 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
479
480 if (Subtarget->has16BitInsts()) {
481 setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
482 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
483 MVT::i16, Legal);
484
485 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
486
487 setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
488 MVT::i16, Expand);
489
490 setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
491 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
492 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
493 ISD::CTPOP},
494 MVT::i16, Promote);
495
496 setOperationAction(ISD::LOAD, MVT::i16, Custom);
497
498 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
499
500 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
501 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
502 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
503 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
504
505 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
506
507 // F16 - Constant Actions.
508 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
509
510 // F16 - Load/Store Actions.
511 setOperationAction(ISD::LOAD, MVT::f16, Promote);
512 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
513 setOperationAction(ISD::STORE, MVT::f16, Promote);
514 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
515
516 // F16 - VOP1 Actions.
517 setOperationAction(
518 {ISD::FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
519 MVT::f16, Custom);
520
521 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
522
523 setOperationAction(
524 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
525 MVT::f16, Promote);
526
527 // F16 - VOP2 Actions.
528 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
529
530 setOperationAction(ISD::FDIV, MVT::f16, Custom);
531
532 // F16 - VOP3 Actions.
533 setOperationAction(ISD::FMA, MVT::f16, Legal);
534 if (STI.hasMadF16())
535 setOperationAction(ISD::FMAD, MVT::f16, Legal);
536
537 for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
538 MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
539 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
540 switch (Op) {
541 case ISD::LOAD:
542 case ISD::STORE:
543 case ISD::BUILD_VECTOR:
544 case ISD::BITCAST:
545 case ISD::UNDEF:
546 case ISD::EXTRACT_VECTOR_ELT:
547 case ISD::INSERT_VECTOR_ELT:
548 case ISD::INSERT_SUBVECTOR:
549 case ISD::EXTRACT_SUBVECTOR:
550 case ISD::SCALAR_TO_VECTOR:
551 case ISD::IS_FPCLASS:
552 break;
553 case ISD::CONCAT_VECTORS:
554 setOperationAction(Op, VT, Custom);
555 break;
556 default:
557 setOperationAction(Op, VT, Expand);
558 break;
559 }
560 }
561 }
562
563 // v_perm_b32 can handle either of these.
564 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
565 setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
566
567 // XXX - Do these do anything? Vector constants turn into build_vector.
568 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
569
570 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal);
571
572 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
573 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
574 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
575 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
576
577 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
578 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
579 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
580 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
581
582 setOperationAction(ISD::AND, MVT::v2i16, Promote);
583 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
584 setOperationAction(ISD::OR, MVT::v2i16, Promote);
585 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
586 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
587 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
588
589 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
590 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
591 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
592 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
593
594 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
595 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
596 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
597 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
598
599 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
600 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
601 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
602 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
603
604 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
605 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
606 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
607 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
608
609 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
610 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
611 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
612 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
613
614 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
615 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
616 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
617 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
618
619 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
620 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
621 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
622 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
623
624 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
625 MVT::v2i32, Expand);
626 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
627
628 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
629 MVT::v4i32, Expand);
630
631 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
632 MVT::v8i32, Expand);
633
634 if (!Subtarget->hasVOP3PInsts())
635 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);
636
637 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
638 // This isn't really legal, but this avoids the legalizer unrolling it (and
639 // allows matching fneg (fabs x) patterns)
640 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
641
642 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
643 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
644
645 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
646 {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
647
648 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
649 {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
650
651 for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
652 setOperationAction(
653 {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
654 Vec16, Custom);
655 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
656 }
657 }
658
659 if (Subtarget->hasVOP3PInsts()) {
660 setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
661 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
662 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
663 MVT::v2i16, Legal);
664
665 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
666 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
667 MVT::v2f16, Legal);
668
669 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16},
670 Custom);
671
672 setOperationAction(ISD::VECTOR_SHUFFLE,
673 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
674 MVT::v16f16, MVT::v16i16},
675 Custom);
676
677 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
678 // Split vector operations.
679 setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
680 ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
681 ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
682 ISD::SSUBSAT},
683 VT, Custom);
684
685 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
686 // Split vector operations.
687 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
688 VT, Custom);
689
690 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
691 Custom);
692
693 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
694 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom);
695
696 if (Subtarget->hasPackedFP32Ops()) {
697 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
698 MVT::v2f32, Legal);
699 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
700 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
701 Custom);
702 }
703 }
704
705 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
706
707 if (Subtarget->has16BitInsts()) {
708 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
709 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
710 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
711 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
712 } else {
713 // Legalization hack.
714 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
715
716 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
717 }
718
719 setOperationAction(ISD::SELECT,
720 {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
721 MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
722 Custom);
723
724 setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
725
726 if (Subtarget->hasMad64_32())
727 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
728
729 setOperationAction(ISD::INTRINSIC_WO_CHAIN,
730 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
731 MVT::v2i16, MVT::v2f16},
732 Custom);
733
734 setOperationAction(ISD::INTRINSIC_W_CHAIN,
735 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
736 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
737 MVT::i16, MVT::i8},
738 Custom);
739
740 setOperationAction(ISD::INTRINSIC_VOID,
741 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
742 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
743 MVT::i8},
744 Custom);
745
746 setTargetDAGCombine({ISD::ADD,
747 ISD::ADDCARRY,
748 ISD::SUB,
749 ISD::SUBCARRY,
750 ISD::FADD,
751 ISD::FSUB,
752 ISD::FMINNUM,
753 ISD::FMAXNUM,
754 ISD::FMINNUM_IEEE,
755 ISD::FMAXNUM_IEEE,
756 ISD::FMA,
757 ISD::SMIN,
758 ISD::SMAX,
759 ISD::UMIN,
760 ISD::UMAX,
761 ISD::SETCC,
762 ISD::AND,
763 ISD::OR,
764 ISD::XOR,
765 ISD::SINT_TO_FP,
766 ISD::UINT_TO_FP,
767 ISD::FCANONICALIZE,
768 ISD::SCALAR_TO_VECTOR,
769 ISD::ZERO_EXTEND,
770 ISD::SIGN_EXTEND_INREG,
771 ISD::EXTRACT_VECTOR_ELT,
772 ISD::INSERT_VECTOR_ELT});
773
774 // All memory operations. Some folding on the pointer operand is done to help
775 // matching the constant offsets in the addressing modes.
776 setTargetDAGCombine({ISD::LOAD,
777 ISD::STORE,
778 ISD::ATOMIC_LOAD,
779 ISD::ATOMIC_STORE,
780 ISD::ATOMIC_CMP_SWAP,
781 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
782 ISD::ATOMIC_SWAP,
783 ISD::ATOMIC_LOAD_ADD,
784 ISD::ATOMIC_LOAD_SUB,
785 ISD::ATOMIC_LOAD_AND,
786 ISD::ATOMIC_LOAD_OR,
787 ISD::ATOMIC_LOAD_XOR,
788 ISD::ATOMIC_LOAD_NAND,
789 ISD::ATOMIC_LOAD_MIN,
790 ISD::ATOMIC_LOAD_MAX,
791 ISD::ATOMIC_LOAD_UMIN,
792 ISD::ATOMIC_LOAD_UMAX,
793 ISD::ATOMIC_LOAD_FADD,
794 ISD::INTRINSIC_VOID,
795 ISD::INTRINSIC_W_CHAIN});
796
797 // FIXME: In other contexts we pretend this is a per-function property.
798 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
799
800 setSchedulingPreference(Sched::RegPressure);
801 }
802
803 const GCNSubtarget *SITargetLowering::getSubtarget() const {
804 return Subtarget;
805 }
806
807 //===----------------------------------------------------------------------===//
808 // TargetLowering queries
809 //===----------------------------------------------------------------------===//
810
811 // v_mad_mix* support a conversion from f16 to f32.
812 //
813 // There is only one special case when denormals are enabled where this is
814 // OK to use, which we don't currently handle.
815 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
816 EVT DestVT, EVT SrcVT) const {
817 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
818 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
819 DestVT.getScalarType() == MVT::f32 &&
820 SrcVT.getScalarType() == MVT::f16 &&
821 // TODO: This probably only requires no input flushing?
822 !hasFP32Denormals(DAG.getMachineFunction());
823 }
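// For example, (fma (fpext f16:$a), (fpext f16:$b), f32:$c) can then be
// selected as a single mixed-precision v_fma_mix_f32 / v_mad_mix_f32 rather
// than extending both operands first, provided f32 denormals are flushed.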
824
825 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
826 LLT DestTy, LLT SrcTy) const {
827 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
828 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
829 DestTy.getScalarSizeInBits() == 32 &&
830 SrcTy.getScalarSizeInBits() == 16 &&
831 // TODO: This probably only requires no input flushing?
832 !hasFP32Denormals(*MI.getMF());
833 }
834
835 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
836 // SI has some legal vector types, but no legal vector operations. Say no
837 // shuffles are legal in order to prefer scalarizing some vector operations.
838 return false;
839 }
840
841 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
842 CallingConv::ID CC,
843 EVT VT) const {
844 if (CC == CallingConv::AMDGPU_KERNEL)
845 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
846
847 if (VT.isVector()) {
848 EVT ScalarVT = VT.getScalarType();
849 unsigned Size = ScalarVT.getSizeInBits();
850 if (Size == 16) {
851 if (Subtarget->has16BitInsts()) {
852 if (VT.isInteger())
853 return MVT::v2i16;
854 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
855 }
856 return VT.isInteger() ? MVT::i32 : MVT::f32;
857 }
858
859 if (Size < 16)
860 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
861 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
862 }
863
864 if (VT.getSizeInBits() > 32)
865 return MVT::i32;
866
867 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
868 }
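// For example, with 16-bit instructions a v4f16 argument is passed in v2f16
// registers; without them it is widened and passed in f32 registers. Scalars
// wider than 32 bits are passed as i32 pieces.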
869
870 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
871 CallingConv::ID CC,
872 EVT VT) const {
873 if (CC == CallingConv::AMDGPU_KERNEL)
874 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
875
876 if (VT.isVector()) {
877 unsigned NumElts = VT.getVectorNumElements();
878 EVT ScalarVT = VT.getScalarType();
879 unsigned Size = ScalarVT.getSizeInBits();
880
881 // FIXME: Should probably promote 8-bit vectors to i16.
882 if (Size == 16 && Subtarget->has16BitInsts())
883 return (NumElts + 1) / 2;
884
885 if (Size <= 32)
886 return NumElts;
887
888 if (Size > 32)
889 return NumElts * ((Size + 31) / 32);
890 } else if (VT.getSizeInBits() > 32)
891 return (VT.getSizeInBits() + 31) / 32;
892
893 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
894 }
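// For example, v4f16 needs (4 + 1) / 2 == 2 registers when 16-bit instructions
// are available, v3i64 needs 3 * 2 == 6 registers, and a scalar i64 needs 2.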
895
896 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
897 LLVMContext &Context, CallingConv::ID CC,
898 EVT VT, EVT &IntermediateVT,
899 unsigned &NumIntermediates, MVT &RegisterVT) const {
900 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
901 unsigned NumElts = VT.getVectorNumElements();
902 EVT ScalarVT = VT.getScalarType();
903 unsigned Size = ScalarVT.getSizeInBits();
904 // FIXME: We should fix the ABI to be the same on targets without 16-bit
905     // support, but unless we can properly handle 3-vectors, it will still be
906 // inconsistent.
907 if (Size == 16 && Subtarget->has16BitInsts()) {
908 if (ScalarVT == MVT::bf16) {
909 RegisterVT = MVT::i32;
910 IntermediateVT = MVT::v2bf16;
911 } else {
912 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
913 IntermediateVT = RegisterVT;
914 }
915 NumIntermediates = (NumElts + 1) / 2;
916 return NumIntermediates;
917 }
918
919 if (Size == 32) {
920 RegisterVT = ScalarVT.getSimpleVT();
921 IntermediateVT = RegisterVT;
922 NumIntermediates = NumElts;
923 return NumIntermediates;
924 }
925
926 if (Size < 16 && Subtarget->has16BitInsts()) {
927 // FIXME: Should probably form v2i16 pieces
928 RegisterVT = MVT::i16;
929 IntermediateVT = ScalarVT;
930 NumIntermediates = NumElts;
931 return NumIntermediates;
932 }
933
934
935 if (Size != 16 && Size <= 32) {
936 RegisterVT = MVT::i32;
937 IntermediateVT = ScalarVT;
938 NumIntermediates = NumElts;
939 return NumIntermediates;
940 }
941
942 if (Size > 32) {
943 RegisterVT = MVT::i32;
944 IntermediateVT = RegisterVT;
945 NumIntermediates = NumElts * ((Size + 31) / 32);
946 return NumIntermediates;
947 }
948 }
949
950 return TargetLowering::getVectorTypeBreakdownForCallingConv(
951 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
952 }
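// For example, a v5f16 argument with 16-bit instructions breaks down into
// (5 + 1) / 2 == 3 v2f16 intermediates, and a v3i64 argument into 6 i32
// intermediates.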
953
954 static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
955 assert(MaxNumLanes != 0);
956
957 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
958 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
959 return EVT::getVectorVT(Ty->getContext(),
960 EVT::getEVT(VT->getElementType()),
961 NumElts);
962 }
963
964 return EVT::getEVT(Ty);
965 }
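// For example, a <4 x float> return type with MaxNumLanes == 2 yields v2f32;
// scalar types are returned unchanged.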
966
967 // Peek through TFE struct returns to only use the data size.
968 static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
969 auto *ST = dyn_cast<StructType>(Ty);
970 if (!ST)
971 return memVTFromLoadIntrData(Ty, MaxNumLanes);
972
973 // TFE intrinsics return an aggregate type.
974 assert(ST->getNumContainedTypes() == 2 &&
975 ST->getContainedType(1)->isIntegerTy(32));
976 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
977 }
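// For example, a TFE return type of { <4 x float>, i32 } is treated as a
// v4f32 access; the trailing i32 status word does not contribute to the
// memory VT.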
978
979 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
980 const CallInst &CI,
981 MachineFunction &MF,
982 unsigned IntrID) const {
983 Info.flags = MachineMemOperand::MONone;
984 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
985 Info.flags |= MachineMemOperand::MOInvariant;
986
987 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
988 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
989 AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
990 (Intrinsic::ID)IntrID);
991 MemoryEffects ME = Attr.getMemoryEffects();
992 if (ME.doesNotAccessMemory())
993 return false;
994
995 // TODO: Should images get their own address space?
996 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER;
997
998 if (RsrcIntr->IsImage)
999 Info.align.reset();
1000
1001 Info.flags |= MachineMemOperand::MODereferenceable;
1002 if (ME.onlyReadsMemory()) {
1003 unsigned MaxNumLanes = 4;
1004
1005 if (RsrcIntr->IsImage) {
1006 const AMDGPU::ImageDimIntrinsicInfo *Intr
1007 = AMDGPU::getImageDimIntrinsicInfo(IntrID);
1008 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1009 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1010
1011 if (!BaseOpcode->Gather4) {
1012 // If this isn't a gather, we may have excess loaded elements in the
1013 // IR type. Check the dmask for the real number of elements loaded.
1014 unsigned DMask
1015 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1016 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1017 }
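        // For example, a dmask of 0b1011 means only 3 components are loaded,
        // even if the IR return type is <4 x float>.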
1018 }
1019
1020 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1021
1022 // FIXME: What does alignment mean for an image?
1023 Info.opc = ISD::INTRINSIC_W_CHAIN;
1024 Info.flags |= MachineMemOperand::MOLoad;
1025 } else if (ME.onlyWritesMemory()) {
1026 Info.opc = ISD::INTRINSIC_VOID;
1027
1028 Type *DataTy = CI.getArgOperand(0)->getType();
1029 if (RsrcIntr->IsImage) {
1030 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1031 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1032 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1033 } else
1034 Info.memVT = EVT::getEVT(DataTy);
1035
1036 Info.flags |= MachineMemOperand::MOStore;
1037 } else {
1038 // Atomic
1039 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1040 ISD::INTRINSIC_W_CHAIN;
1041 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1042 Info.flags |= MachineMemOperand::MOLoad |
1043 MachineMemOperand::MOStore |
1044 MachineMemOperand::MODereferenceable;
1045
1046 // XXX - Should this be volatile without known ordering?
1047 Info.flags |= MachineMemOperand::MOVolatile;
1048
1049 switch (IntrID) {
1050 default:
1051 break;
1052 case Intrinsic::amdgcn_raw_buffer_load_lds:
1053 case Intrinsic::amdgcn_struct_buffer_load_lds: {
1054 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1055 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1056 return true;
1057 }
1058 }
1059 }
1060 return true;
1061 }
1062
1063 switch (IntrID) {
1064 case Intrinsic::amdgcn_atomic_inc:
1065 case Intrinsic::amdgcn_atomic_dec:
1066 case Intrinsic::amdgcn_ds_ordered_add:
1067 case Intrinsic::amdgcn_ds_ordered_swap:
1068 case Intrinsic::amdgcn_ds_fadd:
1069 case Intrinsic::amdgcn_ds_fmin:
1070 case Intrinsic::amdgcn_ds_fmax: {
1071 Info.opc = ISD::INTRINSIC_W_CHAIN;
1072 Info.memVT = MVT::getVT(CI.getType());
1073 Info.ptrVal = CI.getOperand(0);
1074 Info.align.reset();
1075 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1076
1077 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1078 if (!Vol->isZero())
1079 Info.flags |= MachineMemOperand::MOVolatile;
1080
1081 return true;
1082 }
1083 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1084 Info.opc = ISD::INTRINSIC_W_CHAIN;
1085 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1086 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER;
1087 Info.align.reset();
1088 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1089
1090 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1091 if (!Vol || !Vol->isZero())
1092 Info.flags |= MachineMemOperand::MOVolatile;
1093
1094 return true;
1095 }
1096 case Intrinsic::amdgcn_ds_append:
1097 case Intrinsic::amdgcn_ds_consume: {
1098 Info.opc = ISD::INTRINSIC_W_CHAIN;
1099 Info.memVT = MVT::getVT(CI.getType());
1100 Info.ptrVal = CI.getOperand(0);
1101 Info.align.reset();
1102 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1103
1104 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1105 if (!Vol->isZero())
1106 Info.flags |= MachineMemOperand::MOVolatile;
1107
1108 return true;
1109 }
1110 case Intrinsic::amdgcn_global_atomic_csub: {
1111 Info.opc = ISD::INTRINSIC_W_CHAIN;
1112 Info.memVT = MVT::getVT(CI.getType());
1113 Info.ptrVal = CI.getOperand(0);
1114 Info.align.reset();
1115 Info.flags |= MachineMemOperand::MOLoad |
1116 MachineMemOperand::MOStore |
1117 MachineMemOperand::MOVolatile;
1118 return true;
1119 }
1120 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1121 Info.opc = ISD::INTRINSIC_W_CHAIN;
1122 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1123
1124 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER;
1125 Info.align.reset();
1126 Info.flags |= MachineMemOperand::MOLoad |
1127 MachineMemOperand::MODereferenceable;
1128 return true;
1129 }
1130 case Intrinsic::amdgcn_global_atomic_fadd:
1131 case Intrinsic::amdgcn_global_atomic_fmin:
1132 case Intrinsic::amdgcn_global_atomic_fmax:
1133 case Intrinsic::amdgcn_flat_atomic_fadd:
1134 case Intrinsic::amdgcn_flat_atomic_fmin:
1135 case Intrinsic::amdgcn_flat_atomic_fmax:
1136 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1137 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1138 Info.opc = ISD::INTRINSIC_W_CHAIN;
1139 Info.memVT = MVT::getVT(CI.getType());
1140 Info.ptrVal = CI.getOperand(0);
1141 Info.align.reset();
1142 Info.flags |= MachineMemOperand::MOLoad |
1143 MachineMemOperand::MOStore |
1144 MachineMemOperand::MODereferenceable |
1145 MachineMemOperand::MOVolatile;
1146 return true;
1147 }
1148 case Intrinsic::amdgcn_ds_gws_init:
1149 case Intrinsic::amdgcn_ds_gws_barrier:
1150 case Intrinsic::amdgcn_ds_gws_sema_v:
1151 case Intrinsic::amdgcn_ds_gws_sema_br:
1152 case Intrinsic::amdgcn_ds_gws_sema_p:
1153 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1154 Info.opc = ISD::INTRINSIC_VOID;
1155
1156 const GCNTargetMachine &TM =
1157 static_cast<const GCNTargetMachine &>(getTargetMachine());
1158
1159 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1160 Info.ptrVal = MFI->getGWSPSV(TM);
1161
1162 // This is an abstract access, but we need to specify a type and size.
1163 Info.memVT = MVT::i32;
1164 Info.size = 4;
1165 Info.align = Align(4);
1166
1167 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1168 Info.flags |= MachineMemOperand::MOLoad;
1169 else
1170 Info.flags |= MachineMemOperand::MOStore;
1171 return true;
1172 }
1173 case Intrinsic::amdgcn_global_load_lds: {
1174 Info.opc = ISD::INTRINSIC_VOID;
1175 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1176 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1177 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1178 MachineMemOperand::MOVolatile;
1179 return true;
1180 }
1181 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1182 Info.opc = ISD::INTRINSIC_W_CHAIN;
1183
1184 const GCNTargetMachine &TM =
1185 static_cast<const GCNTargetMachine &>(getTargetMachine());
1186
1187 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1188 Info.ptrVal = MFI->getGWSPSV(TM);
1189
1190 // This is an abstract access, but we need to specify a type and size.
1191 Info.memVT = MVT::i32;
1192 Info.size = 4;
1193 Info.align = Align(4);
1194
1195 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1196 return true;
1197 }
1198 default:
1199 return false;
1200 }
1201 }
1202
1203 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
1204 SmallVectorImpl<Value*> &Ops,
1205 Type *&AccessTy) const {
1206 switch (II->getIntrinsicID()) {
1207 case Intrinsic::amdgcn_atomic_inc:
1208 case Intrinsic::amdgcn_atomic_dec:
1209 case Intrinsic::amdgcn_ds_ordered_add:
1210 case Intrinsic::amdgcn_ds_ordered_swap:
1211 case Intrinsic::amdgcn_ds_append:
1212 case Intrinsic::amdgcn_ds_consume:
1213 case Intrinsic::amdgcn_ds_fadd:
1214 case Intrinsic::amdgcn_ds_fmin:
1215 case Intrinsic::amdgcn_ds_fmax:
1216 case Intrinsic::amdgcn_global_atomic_fadd:
1217 case Intrinsic::amdgcn_flat_atomic_fadd:
1218 case Intrinsic::amdgcn_flat_atomic_fmin:
1219 case Intrinsic::amdgcn_flat_atomic_fmax:
1220 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1221 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1222 case Intrinsic::amdgcn_global_atomic_csub: {
1223 Value *Ptr = II->getArgOperand(0);
1224 AccessTy = II->getType();
1225 Ops.push_back(Ptr);
1226 return true;
1227 }
1228 default:
1229 return false;
1230 }
1231 }
1232
1233 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1234 if (!Subtarget->hasFlatInstOffsets()) {
1235 // Flat instructions do not have offsets, and only have the register
1236 // address.
1237 return AM.BaseOffs == 0 && AM.Scale == 0;
1238 }
1239
1240 return AM.Scale == 0 &&
1241 (AM.BaseOffs == 0 ||
1242 Subtarget->getInstrInfo()->isLegalFLATOffset(
1243 AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT));
1244 }
1245
1246 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1247 if (Subtarget->hasFlatGlobalInsts())
1248 return AM.Scale == 0 &&
1249 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1250 AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
1251 SIInstrFlags::FlatGlobal));
1252
1253 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1254     // Assume that we will use FLAT for all global memory accesses
1255 // on VI.
1256 // FIXME: This assumption is currently wrong. On VI we still use
1257 // MUBUF instructions for the r + i addressing mode. As currently
1258 // implemented, the MUBUF instructions only work on buffer < 4GB.
1259 // It may be possible to support > 4GB buffers with MUBUF instructions,
1260 // by setting the stride value in the resource descriptor which would
1261 // increase the size limit to (stride * 4GB). However, this is risky,
1262 // because it has never been validated.
1263 return isLegalFlatAddressingMode(AM);
1264 }
1265
1266 return isLegalMUBUFAddressingMode(AM);
1267 }
1268
1269 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1270 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1271 // additionally can do r + r + i with addr64. 32-bit has more addressing
1272 // mode options. Depending on the resource constant, it can also do
1273 // (i64 r0) + (i32 r1) * (i14 i).
1274 //
1275 // Private arrays end up using a scratch buffer most of the time, so also
1276 // assume those use MUBUF instructions. Scratch loads / stores are currently
1277 // implemented as mubuf instructions with offen bit set, so slightly
1278 // different than the normal addr64.
1279 if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1280 return false;
1281
1282 // FIXME: Since we can split immediate into soffset and immediate offset,
1283 // would it make sense to allow any immediate?
1284
1285 switch (AM.Scale) {
1286 case 0: // r + i or just i, depending on HasBaseReg.
1287 return true;
1288 case 1:
1289 return true; // We have r + r or r + i.
1290 case 2:
1291 if (AM.HasBaseReg) {
1292 // Reject 2 * r + r.
1293 return false;
1294 }
1295
1296 // Allow 2 * r as r + r
1297 // Or 2 * r + i is allowed as r + r + i.
1298 return true;
1299 default: // Don't allow n * r
1300 return false;
1301 }
1302 }
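// For example, Scale == 2 with no base register is accepted because 2 * r can
// be rewritten as r + r (and 2 * r + i as r + r + i), while 2 * r + r would
// need a third register and is rejected.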
1303
1304 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1305 const AddrMode &AM, Type *Ty,
1306 unsigned AS, Instruction *I) const {
1307 // No global is ever allowed as a base.
1308 if (AM.BaseGV)
1309 return false;
1310
1311 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1312 return isLegalGlobalAddressingMode(AM);
1313
1314 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1315 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1316 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
1317 // If the offset isn't a multiple of 4, it probably isn't going to be
1318 // correctly aligned.
1319 // FIXME: Can we get the real alignment here?
1320 if (AM.BaseOffs % 4 != 0)
1321 return isLegalMUBUFAddressingMode(AM);
1322
1323 // There are no SMRD extloads, so if we have to do a small type access we
1324 // will use a MUBUF load.
1325 // FIXME?: We also need to do this if unaligned, but we don't know the
1326 // alignment here.
1327 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1328 return isLegalGlobalAddressingMode(AM);
1329
1330 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1331 // SMRD instructions have an 8-bit, dword offset on SI.
1332 if (!isUInt<8>(AM.BaseOffs / 4))
1333 return false;
1334 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1335 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1336 // in 8-bits, it can use a smaller encoding.
1337 if (!isUInt<32>(AM.BaseOffs / 4))
1338 return false;
1339 } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1340 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1341 if (!isUInt<20>(AM.BaseOffs))
1342 return false;
1343 } else
1344 llvm_unreachable("unhandled generation");
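    // For example, on SI a byte offset of 1020 (255 dwords) is accepted while
    // 1024 is not; on VI and later the checks above accept byte offsets of up
    // to 2^20 - 1.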
1345
1346 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1347 return true;
1348
1349 if (AM.Scale == 1 && AM.HasBaseReg)
1350 return true;
1351
1352 return false;
1353
1354 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1355 return isLegalMUBUFAddressingMode(AM);
1356 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1357 AS == AMDGPUAS::REGION_ADDRESS) {
1358 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1359 // field.
1360 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1361 // an 8-bit dword offset but we don't know the alignment here.
1362 if (!isUInt<16>(AM.BaseOffs))
1363 return false;
1364
1365 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1366 return true;
1367
1368 if (AM.Scale == 1 && AM.HasBaseReg)
1369 return true;
1370
1371 return false;
1372 } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1373 AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1374 // For an unknown address space, this usually means that this is for some
1375 // reason being used for pure arithmetic, and not based on some addressing
1376 // computation. We don't have instructions that compute pointers with any
1377 // addressing modes, so treat them as having no offset like flat
1378 // instructions.
1379 return isLegalFlatAddressingMode(AM);
1380 }
1381
1382 // Assume a user alias of global for unknown address spaces.
1383 return isLegalGlobalAddressingMode(AM);
1384 }
1385
1386 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1387 const MachineFunction &MF) const {
1388 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1389 return (MemVT.getSizeInBits() <= 4 * 32);
1390 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1391 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1392 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1393 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1394 return (MemVT.getSizeInBits() <= 2 * 32);
1395 }
1396 return true;
1397 }
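// For example, stores to global or flat memory may be merged up to 128 bits,
// LDS/region stores up to 64 bits, and private stores up to
// 8 * getMaxPrivateElementSize() bits.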
1398
1399 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1400 unsigned Size, unsigned AddrSpace, Align Alignment,
1401 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1402 if (IsFast)
1403 *IsFast = 0;
1404
1405 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1406 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1407 // Check if alignment requirements for ds_read/write instructions are
1408 // disabled.
1409 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1410 return false;
1411
1412 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1413 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1414 Alignment < RequiredAlignment)
1415 return false;
1416
1417     // Either the alignment requirements are "enabled", or there is an
1418     // unaligned LDS access related hardware bug even though alignment requirements
1419 // are "disabled". In either case, we need to check for proper alignment
1420 // requirements.
1421 //
1422 switch (Size) {
1423 case 64:
1424 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1425 // address is negative, then the instruction is incorrectly treated as
1426 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1427 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1428 // load later in the SILoadStoreOptimizer.
1429 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1430 return false;
1431
1432       // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we
1433 // can do a 4 byte aligned, 8 byte access in a single operation using
1434 // ds_read2/write2_b32 with adjacent offsets.
1435 RequiredAlignment = Align(4);
1436
1437 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1438 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1439 // ds_write2_b32 depending on the alignment. In either case with either
1440 // alignment there is no faster way of doing this.
1441
1442 // The numbers returned here and below are not additive, it is a 'speed
1443 // rank'. They are just meant to be compared to decide if a certain way
1444 // of lowering an operation is faster than another. For that purpose
1445         // a naturally aligned operation gets its bitsize to indicate that "it
1446         // operates with a speed comparable to N-bit wide load". With the full
1447         // alignment ds128 is slower than ds96 for example. If underaligned it
1448         // is comparable to the speed of a single dword access, which would then
1449         // mean 32 < 128 and it is faster to issue a wide load regardless.
1450         // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
1451         // wider load which will not be aligned anymore, the latter is slower.
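        // For example, for this 64-bit case a 4-byte (or better) aligned access
        // rates 64, while anything less aligned rates 32, i.e. comparable to a
        // single dword access.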
1452 if (IsFast)
1453 *IsFast = (Alignment >= RequiredAlignment) ? 64
1454 : (Alignment < Align(4)) ? 32
1455 : 1;
1456 return true;
1457 }
1458
1459 break;
1460 case 96:
1461 if (!Subtarget->hasDS96AndDS128())
1462 return false;
1463
1464 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1465 // gfx8 and older.
1466
1467 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1468 // Naturally aligned access is fastest. However, also report it as Fast
1469 // if memory is aligned to less than a DWORD. A narrow load or store will
1470 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1471 // be more of them, so overall we pay less penalty by issuing a single
1472 // instruction.
1473
1474 // See comment on the values above.
1475 if (IsFast)
1476 *IsFast = (Alignment >= RequiredAlignment) ? 96
1477 : (Alignment < Align(4)) ? 32
1478 : 1;
1479 return true;
1480 }
1481
1482 break;
1483 case 128:
1484 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1485 return false;
1486
1487 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1488 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1489 // single operation using ds_read2/write2_b64.
1490 RequiredAlignment = Align(8);
1491
1492 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1493 // Naturally aligned access is fastest. However, also report it as Fast
1494 // if memory is aligned to less than a DWORD. A narrow load or store will
1495 // be equally slow as a single ds_read_b128/ds_write_b128, but there
1496 // will be more of them, so overall we pay less penalty by issuing a
1497 // single instruction.
1498
1499 // See comment on the values above.
1500 if (IsFast)
1501 *IsFast = (Alignment >= RequiredAlignment) ? 128
1502 : (Alignment < Align(4)) ? 32
1503 : 1;
1504 return true;
1505 }
1506
1507 break;
1508 default:
1509 if (Size > 32)
1510 return false;
1511
1512 break;
1513 }
1514
1515 // See comment on the values above.
1516 // Note that we have a single-dword or sub-dword access here, so if it is
1517 // underaligned it is the slowest possible access, hence the returned value is 0.
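// Illustration: a 16-bit LDS access aligned to 1 byte reports 0 here, while
// the same access with its natural 2-byte alignment reports 16.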
1518 if (IsFast)
1519 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1520
1521 return Alignment >= RequiredAlignment ||
1522 Subtarget->hasUnalignedDSAccessEnabled();
1523 }
1524
1525 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1526 bool AlignedBy4 = Alignment >= Align(4);
1527 if (IsFast)
1528 *IsFast = AlignedBy4;
1529
1530 return AlignedBy4 ||
1531 Subtarget->enableFlatScratch() ||
1532 Subtarget->hasUnalignedScratchAccess();
1533 }
1534
1535 // FIXME: We have to be conservative here and assume that flat operations
1536 // will access scratch. If we had access to the IR function, then we
1537 // could determine if any private memory was used in the function.
1538 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1539 !Subtarget->hasUnalignedScratchAccess()) {
1540 bool AlignedBy4 = Alignment >= Align(4);
1541 if (IsFast)
1542 *IsFast = AlignedBy4;
1543
1544 return AlignedBy4;
1545 }
1546
1547 if (Subtarget->hasUnalignedBufferAccessEnabled()) {
1548 // If we have a uniform constant load, it still requires using a slow
1549 // buffer instruction if unaligned.
1550 if (IsFast) {
1551 // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1552 // 2-byte alignment is worse than 1 unless doing a 2-byte access.
1553 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1554 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1555 Alignment >= Align(4) : Alignment != Align(2);
1556 }
1557
1558 return true;
1559 }
1560
1561 // Values smaller than a dword must be aligned.
1562 if (Size < 32)
1563 return false;
1564
1565 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1566 // byte-address are ignored, thus forcing Dword alignment.
1567 // This applies to private, global, and constant memory.
1568 if (IsFast)
1569 *IsFast = 1;
1570
1571 return Size >= 32 && Alignment >= Align(4);
1572 }
1573
1574 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1575 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1576 unsigned *IsFast) const {
1577 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1578 Alignment, Flags, IsFast);
1579 }
1580
1581 EVT SITargetLowering::getOptimalMemOpType(
1582 const MemOp &Op, const AttributeList &FuncAttributes) const {
1583 // FIXME: Should account for address space here.
1584
1585 // The default fallback uses the private pointer size as a guess for a type to
1586 // use. Make sure we switch these to 64-bit accesses.
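// Illustration: a 32-byte memcpy with a 4-byte aligned destination is widened
// to v4i32 (dwordx4) copies by the checks below rather than the pointer-sized
// default.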
1587
1588 if (Op.size() >= 16 &&
1589 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1590 return MVT::v4i32;
1591
1592 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1593 return MVT::v2i32;
1594
1595 // Use the default.
1596 return MVT::Other;
1597 }
1598
1599 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1600 const MemSDNode *MemNode = cast<MemSDNode>(N);
1601 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1602 }
1603
1604 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
1605 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1606 AS == AMDGPUAS::PRIVATE_ADDRESS;
1607 }
1608
1609 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1610 unsigned DestAS) const {
1611 // Flat -> private/local is a simple truncate.
1612 // Flat -> global is no-op
1613 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1614 return true;
1615
1616 const GCNTargetMachine &TM =
1617 static_cast<const GCNTargetMachine &>(getTargetMachine());
1618 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1619 }
1620
1621 bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1622 const MemSDNode *MemNode = cast<MemSDNode>(N);
1623
1624 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1625 }
1626
1627 TargetLoweringBase::LegalizeTypeAction
1628 SITargetLowering::getPreferredVectorAction(MVT VT) const {
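// Illustration: v4i16 (a power-of-2 vector of 16-bit elements) is split,
// while v3i16 is widened instead (typically to v4i16).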
1629 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1630 VT.getScalarType().bitsLE(MVT::i16))
1631 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
1632 return TargetLoweringBase::getPreferredVectorAction(VT);
1633 }
1634
1635 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1636 Type *Ty) const {
1637 // FIXME: Could be smarter if called for vector constants.
1638 return true;
1639 }
1640
1641 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1642 unsigned Index) const {
1643 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
1644 return false;
1645
1646 // TODO: Add more cases that are cheap.
1647 return Index == 0;
1648 }
1649
1650 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1651 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1652 switch (Op) {
1653 case ISD::LOAD:
1654 case ISD::STORE:
1655
1656 // These operations are done with 32-bit instructions anyway.
1657 case ISD::AND:
1658 case ISD::OR:
1659 case ISD::XOR:
1660 case ISD::SELECT:
1661 // TODO: Extensions?
1662 return true;
1663 default:
1664 return false;
1665 }
1666 }
1667
1668 // SimplifySetCC uses this function to determine whether or not it should
1669 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1670 if (VT == MVT::i1 && Op == ISD::SETCC)
1671 return false;
1672
1673 return TargetLowering::isTypeDesirableForOp(Op, VT);
1674 }
1675
1676 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1677 const SDLoc &SL,
1678 SDValue Chain,
1679 uint64_t Offset) const {
1680 const DataLayout &DL = DAG.getDataLayout();
1681 MachineFunction &MF = DAG.getMachineFunction();
1682 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1683
1684 const ArgDescriptor *InputPtrReg;
1685 const TargetRegisterClass *RC;
1686 LLT ArgTy;
1687 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1688
1689 std::tie(InputPtrReg, RC, ArgTy) =
1690 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1691
1692 // We may not have the kernarg segment argument if we have no kernel
1693 // arguments.
1694 if (!InputPtrReg)
1695 return DAG.getConstant(0, SL, PtrVT);
1696
1697 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1698 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1699 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1700
1701 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1702 }
1703
1704 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1705 const SDLoc &SL) const {
1706 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1707 FIRST_IMPLICIT);
1708 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1709 }
1710
1711 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1712 const SDLoc &SL) const {
1713
1714 Function &F = DAG.getMachineFunction().getFunction();
1715 std::optional<uint32_t> KnownSize =
1716 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
1717 if (KnownSize.has_value())
1718 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1719 return SDValue();
1720 }
1721
1722 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1723 const SDLoc &SL, SDValue Val,
1724 bool Signed,
1725 const ISD::InputArg *Arg) const {
1726 // First, if it is a widened vector, narrow it.
1727 if (VT.isVector() &&
1728 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1729 EVT NarrowedVT =
1730 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
1731 VT.getVectorNumElements());
1732 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1733 DAG.getConstant(0, SL, MVT::i32));
1734 }
1735
1736 // Then convert the vector elements or scalar value.
1737 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1738 VT.bitsLT(MemVT)) {
1739 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1740 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1741 }
1742
1743 if (MemVT.isFloatingPoint())
1744 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1745 else if (Signed)
1746 Val = DAG.getSExtOrTrunc(Val, SL, VT);
1747 else
1748 Val = DAG.getZExtOrTrunc(Val, SL, VT);
1749
1750 return Val;
1751 }
1752
1753 SDValue SITargetLowering::lowerKernargMemParameter(
1754 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1755 uint64_t Offset, Align Alignment, bool Signed,
1756 const ISD::InputArg *Arg) const {
1757 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1758
1759 // Try to avoid using an extload by loading earlier than the argument address,
1760 // and extracting the relevant bits. The load should hopefully be merged with
1761 // the previous argument.
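// Illustration: an i16 argument at byte offset 6 with 2-byte alignment loads
// the dword at offset 4 (AlignDownOffset), shifts right by 16 bits
// (OffsetDiff * 8) and truncates to recover the argument value.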
1762 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1763 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1764 int64_t AlignDownOffset = alignDown(Offset, 4);
1765 int64_t OffsetDiff = Offset - AlignDownOffset;
1766
1767 EVT IntVT = MemVT.changeTypeToInteger();
1768
1769 // TODO: If we passed in the base kernel offset we could have a better
1770 // alignment than 4, but we don't really need it.
1771 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1772 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1773 MachineMemOperand::MODereferenceable |
1774 MachineMemOperand::MOInvariant);
1775
1776 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1777 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1778
1779 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1780 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1781 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1782
1783
1784 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1785 }
1786
1787 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1788 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1789 MachineMemOperand::MODereferenceable |
1790 MachineMemOperand::MOInvariant);
1791
1792 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1793 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1794 }
1795
1796 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1797 const SDLoc &SL, SDValue Chain,
1798 const ISD::InputArg &Arg) const {
1799 MachineFunction &MF = DAG.getMachineFunction();
1800 MachineFrameInfo &MFI = MF.getFrameInfo();
1801
1802 if (Arg.Flags.isByVal()) {
1803 unsigned Size = Arg.Flags.getByValSize();
1804 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1805 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1806 }
1807
1808 unsigned ArgOffset = VA.getLocMemOffset();
1809 unsigned ArgSize = VA.getValVT().getStoreSize();
1810
1811 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1812
1813 // Create load nodes to retrieve arguments from the stack.
1814 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1815 SDValue ArgValue;
1816
1817 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
1818 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1819 MVT MemVT = VA.getValVT();
1820
1821 switch (VA.getLocInfo()) {
1822 default:
1823 break;
1824 case CCValAssign::BCvt:
1825 MemVT = VA.getLocVT();
1826 break;
1827 case CCValAssign::SExt:
1828 ExtType = ISD::SEXTLOAD;
1829 break;
1830 case CCValAssign::ZExt:
1831 ExtType = ISD::ZEXTLOAD;
1832 break;
1833 case CCValAssign::AExt:
1834 ExtType = ISD::EXTLOAD;
1835 break;
1836 }
1837
1838 ArgValue = DAG.getExtLoad(
1839 ExtType, SL, VA.getLocVT(), Chain, FIN,
1840 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1841 MemVT);
1842 return ArgValue;
1843 }
1844
1845 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1846 const SIMachineFunctionInfo &MFI,
1847 EVT VT,
1848 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1849 const ArgDescriptor *Reg;
1850 const TargetRegisterClass *RC;
1851 LLT Ty;
1852
1853 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1854 if (!Reg) {
1855 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
1856 // It's possible for a kernarg intrinsic call to appear in a kernel with
1857 // no allocated segment, in which case we do not add the user sgpr
1858 // argument, so just return null.
1859 return DAG.getConstant(0, SDLoc(), VT);
1860 }
1861
1862 // It's undefined behavior if a function marked with the amdgpu-no-*
1863 // attributes uses the corresponding intrinsic.
1864 return DAG.getUNDEF(VT);
1865 }
1866
1867 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1868 }
1869
1870 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1871 CallingConv::ID CallConv,
1872 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
1873 FunctionType *FType,
1874 SIMachineFunctionInfo *Info) {
1875 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1876 const ISD::InputArg *Arg = &Ins[I];
1877
1878 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1879 "vector type argument should have been split");
1880
1881 // First check if it's a PS input addr.
1882 if (CallConv == CallingConv::AMDGPU_PS &&
1883 !Arg->Flags.isInReg() && PSInputNum <= 15) {
1884 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1885
1886 // Inconveniently only the first part of the split is marked as isSplit,
1887 // so skip to the end. We only want to increment PSInputNum once for the
1888 // entire split argument.
1889 if (Arg->Flags.isSplit()) {
1890 while (!Arg->Flags.isSplitEnd()) {
1891 assert((!Arg->VT.isVector() ||
1892 Arg->VT.getScalarSizeInBits() == 16) &&
1893 "unexpected vector split in ps argument type");
1894 if (!SkipArg)
1895 Splits.push_back(*Arg);
1896 Arg = &Ins[++I];
1897 }
1898 }
1899
1900 if (SkipArg) {
1901 // We can safely skip PS inputs.
1902 Skipped.set(Arg->getOrigArgIndex());
1903 ++PSInputNum;
1904 continue;
1905 }
1906
1907 Info->markPSInputAllocated(PSInputNum);
1908 if (Arg->Used)
1909 Info->markPSInputEnabled(PSInputNum);
1910
1911 ++PSInputNum;
1912 }
1913
1914 Splits.push_back(*Arg);
1915 }
1916 }
1917
1918 // Allocate special inputs passed in VGPRs.
1919 void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1920 MachineFunction &MF,
1921 const SIRegisterInfo &TRI,
1922 SIMachineFunctionInfo &Info) const {
1923 const LLT S32 = LLT::scalar(32);
1924 MachineRegisterInfo &MRI = MF.getRegInfo();
1925
1926 if (Info.hasWorkItemIDX()) {
1927 Register Reg = AMDGPU::VGPR0;
1928 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1929
1930 CCInfo.AllocateReg(Reg);
1931 unsigned Mask = (Subtarget->hasPackedTID() &&
1932 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
1933 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1934 }
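// When the subtarget packs workitem IDs, X, Y and Z share VGPR0 in bit
// ranges [9:0], [19:10] and [29:20], hence the 0x3ff mask above and the
// shifted 0x3ff masks below.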
1935
1936 if (Info.hasWorkItemIDY()) {
1937 assert(Info.hasWorkItemIDX());
1938 if (Subtarget->hasPackedTID()) {
1939 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1940 0x3ff << 10));
1941 } else {
1942 unsigned Reg = AMDGPU::VGPR1;
1943 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1944
1945 CCInfo.AllocateReg(Reg);
1946 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1947 }
1948 }
1949
1950 if (Info.hasWorkItemIDZ()) {
1951 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
1952 if (Subtarget->hasPackedTID()) {
1953 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1954 0x3ff << 20));
1955 } else {
1956 unsigned Reg = AMDGPU::VGPR2;
1957 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1958
1959 CCInfo.AllocateReg(Reg);
1960 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1961 }
1962 }
1963 }
1964
1965 // Try to allocate a VGPR at the end of the argument list, or if no argument
1966 // VGPRs are left, allocate a stack slot.
1967 // If \p Mask is given, it indicates the bitfield position in the register.
1968 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
1969 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
1970 ArgDescriptor Arg = ArgDescriptor()) {
1971 if (Arg.isSet())
1972 return ArgDescriptor::createArg(Arg, Mask);
1973
1974 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1975 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1976 if (RegIdx == ArgVGPRs.size()) {
1977 // Spill to stack required.
1978 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
1979
1980 return ArgDescriptor::createStack(Offset, Mask);
1981 }
1982
1983 unsigned Reg = ArgVGPRs[RegIdx];
1984 Reg = CCInfo.AllocateReg(Reg);
1985 assert(Reg != AMDGPU::NoRegister);
1986
1987 MachineFunction &MF = CCInfo.getMachineFunction();
1988 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1989 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
1990 return ArgDescriptor::createRegister(Reg, Mask);
1991 }
1992
1993 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1994 const TargetRegisterClass *RC,
1995 unsigned NumArgRegs) {
1996 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
1997 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1998 if (RegIdx == ArgSGPRs.size())
1999 report_fatal_error("ran out of SGPRs for arguments");
2000
2001 unsigned Reg = ArgSGPRs[RegIdx];
2002 Reg = CCInfo.AllocateReg(Reg);
2003 assert(Reg != AMDGPU::NoRegister);
2004
2005 MachineFunction &MF = CCInfo.getMachineFunction();
2006 MF.addLiveIn(Reg, RC);
2007 return ArgDescriptor::createRegister(Reg);
2008 }
2009
2010 // If this has a fixed position, we should still allocate the register in the
2011 // CCInfo state. Technically we could get away with this for values passed
2012 // outside of the normal argument range.
2013 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2014 const TargetRegisterClass *RC,
2015 MCRegister Reg) {
2016 Reg = CCInfo.AllocateReg(Reg);
2017 assert(Reg != AMDGPU::NoRegister);
2018 MachineFunction &MF = CCInfo.getMachineFunction();
2019 MF.addLiveIn(Reg, RC);
2020 }
2021
2022 static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2023 if (Arg) {
2024 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2025 Arg.getRegister());
2026 } else
2027 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2028 }
2029
2030 static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2031 if (Arg) {
2032 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2033 Arg.getRegister());
2034 } else
2035 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2036 }
2037
2038 /// Allocate implicit function VGPR arguments at the end of allocated user
2039 /// arguments.
2040 void SITargetLowering::allocateSpecialInputVGPRs(
2041 CCState &CCInfo, MachineFunction &MF,
2042 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2043 const unsigned Mask = 0x3ff;
2044 ArgDescriptor Arg;
2045
2046 if (Info.hasWorkItemIDX()) {
2047 Arg = allocateVGPR32Input(CCInfo, Mask);
2048 Info.setWorkItemIDX(Arg);
2049 }
2050
2051 if (Info.hasWorkItemIDY()) {
2052 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2053 Info.setWorkItemIDY(Arg);
2054 }
2055
2056 if (Info.hasWorkItemIDZ())
2057 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2058 }
2059
2060 /// Allocate implicit function VGPR arguments in fixed registers.
2061 void SITargetLowering::allocateSpecialInputVGPRsFixed(
2062 CCState &CCInfo, MachineFunction &MF,
2063 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2064 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2065 if (!Reg)
2066 report_fatal_error("failed to allocate VGPR for implicit arguments");
2067
2068 const unsigned Mask = 0x3ff;
2069 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2070 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2071 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2072 }
2073
2074 void SITargetLowering::allocateSpecialInputSGPRs(
2075 CCState &CCInfo,
2076 MachineFunction &MF,
2077 const SIRegisterInfo &TRI,
2078 SIMachineFunctionInfo &Info) const {
2079 auto &ArgInfo = Info.getArgInfo();
2080
2081 // TODO: Unify handling with private memory pointers.
2082 if (Info.hasDispatchPtr())
2083 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2084
2085 if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
2086 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2087
2088 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2089 // constant offset from the kernarg segment.
2090 if (Info.hasImplicitArgPtr())
2091 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2092
2093 if (Info.hasDispatchID())
2094 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2095
2096 // flat_scratch_init is not applicable for non-kernel functions.
2097
2098 if (Info.hasWorkGroupIDX())
2099 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2100
2101 if (Info.hasWorkGroupIDY())
2102 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2103
2104 if (Info.hasWorkGroupIDZ())
2105 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2106
2107 if (Info.hasLDSKernelId())
2108 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2109 }
2110
2111 // Allocate special inputs passed in user SGPRs.
2112 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2113 MachineFunction &MF,
2114 const SIRegisterInfo &TRI,
2115 SIMachineFunctionInfo &Info) const {
2116 if (Info.hasImplicitBufferPtr()) {
2117 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2118 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2119 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2120 }
2121
2122 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2123 if (Info.hasPrivateSegmentBuffer()) {
2124 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2125 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2126 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2127 }
2128
2129 if (Info.hasDispatchPtr()) {
2130 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2131 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2132 CCInfo.AllocateReg(DispatchPtrReg);
2133 }
2134
2135 if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
2136 Register QueuePtrReg = Info.addQueuePtr(TRI);
2137 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2138 CCInfo.AllocateReg(QueuePtrReg);
2139 }
2140
2141 if (Info.hasKernargSegmentPtr()) {
2142 MachineRegisterInfo &MRI = MF.getRegInfo();
2143 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2144 CCInfo.AllocateReg(InputPtrReg);
2145
2146 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2147 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2148 }
2149
2150 if (Info.hasDispatchID()) {
2151 Register DispatchIDReg = Info.addDispatchID(TRI);
2152 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2153 CCInfo.AllocateReg(DispatchIDReg);
2154 }
2155
2156 if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2157 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2158 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2159 CCInfo.AllocateReg(FlatScratchInitReg);
2160 }
2161
2162 if (Info.hasLDSKernelId()) {
2163 Register Reg = Info.addLDSKernelId();
2164 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2165 CCInfo.AllocateReg(Reg);
2166 }
2167
2168 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2169 // these from the dispatch pointer.
2170 }
2171
2172 // Allocate special input registers that are initialized per-wave.
2173 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
2174 MachineFunction &MF,
2175 SIMachineFunctionInfo &Info,
2176 CallingConv::ID CallConv,
2177 bool IsShader) const {
2178 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2179 // Note: user SGPRs are handled by the front-end for graphics shaders.
2180 // Pad up the used user SGPRs with dead inputs.
2181 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2182
2183 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2184 // rely on it to reach 16 since if we end up having no stack usage, it will
2185 // not really be added.
2186 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2187 Info.hasWorkGroupIDY() +
2188 Info.hasWorkGroupIDZ() +
2189 Info.hasWorkGroupInfo();
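// Illustration: a kernel with 6 user SGPRs that only needs the workgroup ID
// in X gets 16 - (6 + 1) = 9 reserved pad SGPRs from the loop below.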
2190 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2191 Register Reg = Info.addReservedUserSGPR();
2192 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2193 CCInfo.AllocateReg(Reg);
2194 }
2195 }
2196
2197 if (Info.hasWorkGroupIDX()) {
2198 Register Reg = Info.addWorkGroupIDX();
2199 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2200 CCInfo.AllocateReg(Reg);
2201 }
2202
2203 if (Info.hasWorkGroupIDY()) {
2204 Register Reg = Info.addWorkGroupIDY();
2205 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2206 CCInfo.AllocateReg(Reg);
2207 }
2208
2209 if (Info.hasWorkGroupIDZ()) {
2210 Register Reg = Info.addWorkGroupIDZ();
2211 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2212 CCInfo.AllocateReg(Reg);
2213 }
2214
2215 if (Info.hasWorkGroupInfo()) {
2216 Register Reg = Info.addWorkGroupInfo();
2217 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2218 CCInfo.AllocateReg(Reg);
2219 }
2220
2221 if (Info.hasPrivateSegmentWaveByteOffset()) {
2222 // Scratch wave offset passed in system SGPR.
2223 unsigned PrivateSegmentWaveByteOffsetReg;
2224
2225 if (IsShader) {
2226 PrivateSegmentWaveByteOffsetReg =
2227 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2228
2229 // This is true if the scratch wave byte offset doesn't have a fixed
2230 // location.
2231 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2232 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2233 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2234 }
2235 } else
2236 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2237
2238 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2239 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2240 }
2241
2242 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2243 Info.getNumPreloadedSGPRs() >= 16);
2244 }
2245
2246 static void reservePrivateMemoryRegs(const TargetMachine &TM,
2247 MachineFunction &MF,
2248 const SIRegisterInfo &TRI,
2249 SIMachineFunctionInfo &Info) {
2250 // Now that we've figured out where the scratch register inputs are, see if
2251 // we should reserve the arguments and use them directly.
2252 MachineFrameInfo &MFI = MF.getFrameInfo();
2253 bool HasStackObjects = MFI.hasStackObjects();
2254 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2255
2256 // Record that we know we have non-spill stack objects so we don't need to
2257 // check all stack objects later.
2258 if (HasStackObjects)
2259 Info.setHasNonSpillStackObjects(true);
2260
2261 // Everything live out of a block is spilled with fast regalloc, so it's
2262 // almost certain that spilling will be required.
2263 if (TM.getOptLevel() == CodeGenOpt::None)
2264 HasStackObjects = true;
2265
2266 // For now assume stack access is needed in any callee functions, so we need
2267 // the scratch registers to pass in.
2268 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2269
2270 if (!ST.enableFlatScratch()) {
2271 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2272 // If we have stack objects, we unquestionably need the private buffer
2273 // resource. For the Code Object V2 ABI, this will be the first 4 user
2274 // SGPR inputs. We can reserve those and use them directly.
2275
2276 Register PrivateSegmentBufferReg =
2277 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2278 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2279 } else {
2280 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2281 // We tentatively reserve the last registers (skipping the last few,
2282 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2283 // we'll replace these with the ones immediately after those which were
2284 // really allocated. In the prologue, copies will be inserted from the
2285 // argument to these reserved registers.
2286
2287 // Without HSA, relocations are used for the scratch pointer and the
2288 // buffer resource setup is always inserted in the prologue. Scratch wave
2289 // offset is still in an input SGPR.
2290 Info.setScratchRSrcReg(ReservedBufferReg);
2291 }
2292 }
2293
2294 MachineRegisterInfo &MRI = MF.getRegInfo();
2295
2296 // For entry functions we have to set up the stack pointer if we use it,
2297 // whereas non-entry functions get this "for free". This means there is no
2298 // intrinsic advantage to using S32 over S34 in cases where we do not have
2299 // calls but do need a frame pointer (i.e. if we are requested to have one
2300 // because frame pointer elimination is disabled). To keep things simple we
2301 // only ever use S32 as the call ABI stack pointer, and so using it does not
2302 // imply we need a separate frame pointer.
2303 //
2304 // Try to use s32 as the SP, but move it if it would interfere with input
2305 // arguments. This won't work with calls though.
2306 //
2307 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2308 // registers.
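// In the common case SGPR32 is not a live-in and becomes the stack pointer;
// SGPR33 is set as the frame offset register below when a frame pointer is
// required.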
2309 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2310 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2311 } else {
2312 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2313
2314 if (MFI.hasCalls())
2315 report_fatal_error("call in graphics shader with too many input SGPRs");
2316
2317 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2318 if (!MRI.isLiveIn(Reg)) {
2319 Info.setStackPtrOffsetReg(Reg);
2320 break;
2321 }
2322 }
2323
2324 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2325 report_fatal_error("failed to find register for SP");
2326 }
2327
2328 // hasFP should be accurate for entry functions even before the frame is
2329 // finalized, because it does not rely on the known stack size, only
2330 // properties like whether variable sized objects are present.
2331 if (ST.getFrameLowering()->hasFP(MF)) {
2332 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2333 }
2334 }
2335
2336 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2337 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2338 return !Info->isEntryFunction();
2339 }
2340
2341 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2342
2343 }
2344
2345 void SITargetLowering::insertCopiesSplitCSR(
2346 MachineBasicBlock *Entry,
2347 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2348 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2349
2350 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2351 if (!IStart)
2352 return;
2353
2354 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2355 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2356 MachineBasicBlock::iterator MBBI = Entry->begin();
2357 for (const MCPhysReg *I = IStart; *I; ++I) {
2358 const TargetRegisterClass *RC = nullptr;
2359 if (AMDGPU::SReg_64RegClass.contains(*I))
2360 RC = &AMDGPU::SGPR_64RegClass;
2361 else if (AMDGPU::SReg_32RegClass.contains(*I))
2362 RC = &AMDGPU::SGPR_32RegClass;
2363 else
2364 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2365
2366 Register NewVR = MRI->createVirtualRegister(RC);
2367 // Create copy from CSR to a virtual register.
2368 Entry->addLiveIn(*I);
2369 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2370 .addReg(*I);
2371
2372 // Insert the copy-back instructions right before the terminator.
2373 for (auto *Exit : Exits)
2374 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2375 TII->get(TargetOpcode::COPY), *I)
2376 .addReg(NewVR);
2377 }
2378 }
2379
2380 SDValue SITargetLowering::LowerFormalArguments(
2381 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2382 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2383 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2384 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2385
2386 MachineFunction &MF = DAG.getMachineFunction();
2387 const Function &Fn = MF.getFunction();
2388 FunctionType *FType = MF.getFunction().getFunctionType();
2389 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2390
2391 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2392 DiagnosticInfoUnsupported NoGraphicsHSA(
2393 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2394 DAG.getContext()->diagnose(NoGraphicsHSA);
2395 return DAG.getEntryNode();
2396 }
2397
2398 Info->allocateKnownAddressLDSGlobal(Fn);
2399
2400 SmallVector<ISD::InputArg, 16> Splits;
2401 SmallVector<CCValAssign, 16> ArgLocs;
2402 BitVector Skipped(Ins.size());
2403 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2404 *DAG.getContext());
2405
2406 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2407 bool IsKernel = AMDGPU::isKernel(CallConv);
2408 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2409
2410 if (IsGraphics) {
2411 assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
2412 (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
2413 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2414 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2415 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2416 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2417 }
2418
2419 if (CallConv == CallingConv::AMDGPU_PS) {
2420 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2421
2422 // At least one interpolation mode must be enabled or else the GPU will
2423 // hang.
2424 //
2425 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2426 // set PSInputAddr, the user wants to enable some bits after the compilation
2427 // based on run-time states. Since we can't know what the final PSInputEna
2428 // will look like, so we shouldn't do anything here and the user should take
2429 // responsibility for the correct programming.
2430 //
2431 // Otherwise, the following restrictions apply:
2432 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2433 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2434 // enabled too.
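// Illustration: a pixel shader that uses no PERSP_*/LINEAR_* inputs falls
// into the branch below, which force-enables input 0 and reserves
// VGPR0/VGPR1 so at least one interpolation mode remains enabled.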
2435 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2436 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2437 CCInfo.AllocateReg(AMDGPU::VGPR0);
2438 CCInfo.AllocateReg(AMDGPU::VGPR1);
2439 Info->markPSInputAllocated(0);
2440 Info->markPSInputEnabled(0);
2441 }
2442 if (Subtarget->isAmdPalOS()) {
2443 // For isAmdPalOS, the user does not enable some bits after compilation
2444 // based on run-time states; the register values being generated here are
2445 // the final ones set in hardware. Therefore we need to apply the
2446 // workaround to PSInputAddr and PSInputEnable together. (The case where
2447 // a bit is set in PSInputAddr but not PSInputEnable is where the
2448 // frontend set up an input arg for a particular interpolation mode, but
2449 // nothing uses that input arg. Really we should have an earlier pass
2450 // that removes such an arg.)
2451 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2452 if ((PsInputBits & 0x7F) == 0 ||
2453 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2454 Info->markPSInputEnabled(countTrailingZeros(Info->getPSInputAddr()));
2455 }
2456 } else if (IsKernel) {
2457 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2458 } else {
2459 Splits.append(Ins.begin(), Ins.end());
2460 }
2461
2462 if (IsEntryFunc) {
2463 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2464 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2465 } else if (!IsGraphics) {
2466 // For the fixed ABI, pass workitem IDs in the last argument register.
2467 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2468 }
2469
2470 if (IsKernel) {
2471 analyzeFormalArgumentsCompute(CCInfo, Ins);
2472 } else {
2473 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2474 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2475 }
2476
2477 SmallVector<SDValue, 16> Chains;
2478
2479 // FIXME: This is the minimum kernel argument alignment. We should improve
2480 // this to the maximum alignment of the arguments.
2481 //
2482 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2483 // kern arg offset.
2484 const Align KernelArgBaseAlign = Align(16);
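// Illustration: commonAlignment(Align(16), Offset) below yields Align(4) for
// an argument at offset 4, Align(8) at offset 8, and Align(16) at offset 0.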
2485
2486 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2487 const ISD::InputArg &Arg = Ins[i];
2488 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2489 InVals.push_back(DAG.getUNDEF(Arg.VT));
2490 continue;
2491 }
2492
2493 CCValAssign &VA = ArgLocs[ArgIdx++];
2494 MVT VT = VA.getLocVT();
2495
2496 if (IsEntryFunc && VA.isMemLoc()) {
2497 VT = Ins[i].VT;
2498 EVT MemVT = VA.getLocVT();
2499
2500 const uint64_t Offset = VA.getLocMemOffset();
2501 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2502
2503 if (Arg.Flags.isByRef()) {
2504 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2505
2506 const GCNTargetMachine &TM =
2507 static_cast<const GCNTargetMachine &>(getTargetMachine());
2508 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2509 Arg.Flags.getPointerAddrSpace())) {
2510 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2511 Arg.Flags.getPointerAddrSpace());
2512 }
2513
2514 InVals.push_back(Ptr);
2515 continue;
2516 }
2517
2518 SDValue Arg = lowerKernargMemParameter(
2519 DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2520 Chains.push_back(Arg.getValue(1));
2521
2522 auto *ParamTy =
2523 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2524 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2525 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2526 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2527 // On SI, local pointers are just offsets into LDS, so they always
2528 // fit in 16 bits. On CI and newer they could potentially be
2529 // real pointers, so we can't guarantee their size.
2530 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2531 DAG.getValueType(MVT::i16));
2532 }
2533
2534 InVals.push_back(Arg);
2535 continue;
2536 } else if (!IsEntryFunc && VA.isMemLoc()) {
2537 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2538 InVals.push_back(Val);
2539 if (!Arg.Flags.isByVal())
2540 Chains.push_back(Val.getValue(1));
2541 continue;
2542 }
2543
2544 assert(VA.isRegLoc() && "Parameter must be in a register!");
2545
2546 Register Reg = VA.getLocReg();
2547 const TargetRegisterClass *RC = nullptr;
2548 if (AMDGPU::VGPR_32RegClass.contains(Reg))
2549 RC = &AMDGPU::VGPR_32RegClass;
2550 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
2551 RC = &AMDGPU::SGPR_32RegClass;
2552 else
2553 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
2554 EVT ValVT = VA.getValVT();
2555
2556 Reg = MF.addLiveIn(Reg, RC);
2557 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2558
2559 if (Arg.Flags.isSRet()) {
2560 // The return object should be reasonably addressable.
2561
2562 // FIXME: This helps when the return is a real sret. If it is an
2563 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2564 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2565 unsigned NumBits
2566 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2567 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2568 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2569 }
2570
2571 // If this is an 8 or 16-bit value, it is really passed promoted
2572 // to 32 bits. Insert an assert[sz]ext to capture this, then
2573 // truncate to the right size.
2574 switch (VA.getLocInfo()) {
2575 case CCValAssign::Full:
2576 break;
2577 case CCValAssign::BCvt:
2578 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2579 break;
2580 case CCValAssign::SExt:
2581 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2582 DAG.getValueType(ValVT));
2583 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2584 break;
2585 case CCValAssign::ZExt:
2586 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2587 DAG.getValueType(ValVT));
2588 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2589 break;
2590 case CCValAssign::AExt:
2591 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2592 break;
2593 default:
2594 llvm_unreachable("Unknown loc info!");
2595 }
2596
2597 InVals.push_back(Val);
2598 }
2599
2600 // Start adding system SGPRs.
2601 if (IsEntryFunc) {
2602 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2603 } else {
2604 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2605 if (!IsGraphics)
2606 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2607 }
2608
2609 auto &ArgUsageInfo =
2610 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2611 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2612
2613 unsigned StackArgSize = CCInfo.getNextStackOffset();
2614 Info->setBytesInStackArgArea(StackArgSize);
2615
2616 return Chains.empty() ? Chain :
2617 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2618 }
2619
2620 // TODO: If return values can't fit in registers, we should return as many as
2621 // possible in registers before passing on stack.
2622 bool SITargetLowering::CanLowerReturn(
2623 CallingConv::ID CallConv,
2624 MachineFunction &MF, bool IsVarArg,
2625 const SmallVectorImpl<ISD::OutputArg> &Outs,
2626 LLVMContext &Context) const {
2627 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2628 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2629 // for shaders. Vector types should be explicitly handled by CC.
2630 if (AMDGPU::isEntryFunctionCC(CallConv))
2631 return true;
2632
2633 SmallVector<CCValAssign, 16> RVLocs;
2634 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2635 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2636 }
2637
2638 SDValue
2639 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2640 bool isVarArg,
2641 const SmallVectorImpl<ISD::OutputArg> &Outs,
2642 const SmallVectorImpl<SDValue> &OutVals,
2643 const SDLoc &DL, SelectionDAG &DAG) const {
2644 MachineFunction &MF = DAG.getMachineFunction();
2645 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2646
2647 if (AMDGPU::isKernel(CallConv)) {
2648 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2649 OutVals, DL, DAG);
2650 }
2651
2652 bool IsShader = AMDGPU::isShader(CallConv);
2653
2654 Info->setIfReturnsVoid(Outs.empty());
2655 bool IsWaveEnd = Info->returnsVoid() && IsShader;
2656
2657 // CCValAssign - represent the assignment of the return value to a location.
2658 SmallVector<CCValAssign, 48> RVLocs;
2659 SmallVector<ISD::OutputArg, 48> Splits;
2660
2661 // CCState - Info about the registers and stack slots.
2662 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2663 *DAG.getContext());
2664
2665 // Analyze outgoing return values.
2666 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2667
2668 SDValue Flag;
2669 SmallVector<SDValue, 48> RetOps;
2670 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2671
2672 // Copy the result values into the output registers.
2673 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2674 ++I, ++RealRVLocIdx) {
2675 CCValAssign &VA = RVLocs[I];
2676 assert(VA.isRegLoc() && "Can only return in registers!");
2677 // TODO: Partially return in registers if return values don't fit.
2678 SDValue Arg = OutVals[RealRVLocIdx];
2679
2680 // Copied from other backends.
2681 switch (VA.getLocInfo()) {
2682 case CCValAssign::Full:
2683 break;
2684 case CCValAssign::BCvt:
2685 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2686 break;
2687 case CCValAssign::SExt:
2688 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2689 break;
2690 case CCValAssign::ZExt:
2691 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2692 break;
2693 case CCValAssign::AExt:
2694 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2695 break;
2696 default:
2697 llvm_unreachable("Unknown loc info!");
2698 }
2699
2700 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2701 Flag = Chain.getValue(1);
2702 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2703 }
2704
2705 // FIXME: Does sret work properly?
2706 if (!Info->isEntryFunction()) {
2707 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2708 const MCPhysReg *I =
2709 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2710 if (I) {
2711 for (; *I; ++I) {
2712 if (AMDGPU::SReg_64RegClass.contains(*I))
2713 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2714 else if (AMDGPU::SReg_32RegClass.contains(*I))
2715 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2716 else
2717 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2718 }
2719 }
2720 }
2721
2722 // Update chain and glue.
2723 RetOps[0] = Chain;
2724 if (Flag.getNode())
2725 RetOps.push_back(Flag);
2726
2727 unsigned Opc = AMDGPUISD::ENDPGM;
2728 if (!IsWaveEnd)
2729 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2730 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2731 }
2732
2733 SDValue SITargetLowering::LowerCallResult(
2734 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2735 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2736 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2737 SDValue ThisVal) const {
2738 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2739
2740 // Assign locations to each value returned by this call.
2741 SmallVector<CCValAssign, 16> RVLocs;
2742 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2743 *DAG.getContext());
2744 CCInfo.AnalyzeCallResult(Ins, RetCC);
2745
2746 // Copy all of the result registers out of their specified physreg.
2747 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2748 CCValAssign VA = RVLocs[i];
2749 SDValue Val;
2750
2751 if (VA.isRegLoc()) {
2752 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2753 Chain = Val.getValue(1);
2754 InFlag = Val.getValue(2);
2755 } else if (VA.isMemLoc()) {
2756 report_fatal_error("TODO: return values in memory");
2757 } else
2758 llvm_unreachable("unknown argument location type");
2759
2760 switch (VA.getLocInfo()) {
2761 case CCValAssign::Full:
2762 break;
2763 case CCValAssign::BCvt:
2764 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2765 break;
2766 case CCValAssign::ZExt:
2767 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2768 DAG.getValueType(VA.getValVT()));
2769 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2770 break;
2771 case CCValAssign::SExt:
2772 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2773 DAG.getValueType(VA.getValVT()));
2774 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2775 break;
2776 case CCValAssign::AExt:
2777 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2778 break;
2779 default:
2780 llvm_unreachable("Unknown loc info!");
2781 }
2782
2783 InVals.push_back(Val);
2784 }
2785
2786 return Chain;
2787 }
2788
2789 // Add code to pass special inputs required depending on used features separate
2790 // from the explicit user arguments present in the IR.
2791 void SITargetLowering::passSpecialInputs(
2792 CallLoweringInfo &CLI,
2793 CCState &CCInfo,
2794 const SIMachineFunctionInfo &Info,
2795 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2796 SmallVectorImpl<SDValue> &MemOpChains,
2797 SDValue Chain) const {
2798 // If we don't have a call site, this was a call inserted by
2799 // legalization. These can never use special inputs.
2800 if (!CLI.CB)
2801 return;
2802
2803 SelectionDAG &DAG = CLI.DAG;
2804 const SDLoc &DL = CLI.DL;
2805 const Function &F = DAG.getMachineFunction().getFunction();
2806
2807 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2808 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2809
2810 const AMDGPUFunctionArgInfo *CalleeArgInfo
2811 = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
2812 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2813 auto &ArgUsageInfo =
2814 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2815 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2816 }
2817
2818 // TODO: Unify with private memory register handling. This is complicated by
2819 // the fact that at least in kernels, the input argument is not necessarily
2820 // in the same location as the input.
2821 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
2822 StringLiteral> ImplicitAttrs[] = {
2823 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
2824 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
2825 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
2826 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
2827 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
2828 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
2829 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
2830 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
2831 };
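// Illustration: if the call site carries "amdgpu-no-dispatch-ptr", the loop
// below skips copying the dispatch pointer to the callee entirely.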
2832
2833 for (auto Attr : ImplicitAttrs) {
2834 const ArgDescriptor *OutgoingArg;
2835 const TargetRegisterClass *ArgRC;
2836 LLT ArgTy;
2837
2838 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
2839
2840 // If the callee does not use the attribute value, skip copying the value.
2841 if (CLI.CB->hasFnAttr(Attr.second))
2842 continue;
2843
2844 std::tie(OutgoingArg, ArgRC, ArgTy) =
2845 CalleeArgInfo->getPreloadedValue(InputID);
2846 if (!OutgoingArg)
2847 continue;
2848
2849 const ArgDescriptor *IncomingArg;
2850 const TargetRegisterClass *IncomingArgRC;
2851 LLT Ty;
2852 std::tie(IncomingArg, IncomingArgRC, Ty) =
2853 CallerArgInfo.getPreloadedValue(InputID);
2854 assert(IncomingArgRC == ArgRC);
2855
2856 // All special arguments are ints for now.
2857 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2858 SDValue InputReg;
2859
2860 if (IncomingArg) {
2861 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2862 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
2863 // The implicit arg ptr is special because it doesn't have a corresponding
2864 // input for kernels, and is computed from the kernarg segment pointer.
2865 InputReg = getImplicitArgPtr(DAG, DL);
2866 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
2867 std::optional<uint32_t> Id =
2868 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2869 if (Id.has_value()) {
2870 InputReg = DAG.getConstant(*Id, DL, ArgVT);
2871 } else {
2872 InputReg = DAG.getUNDEF(ArgVT);
2873 }
2874 } else {
2875 // We may have proven the input wasn't needed, even though the ABI still
2876 // requires it. We just need to allocate the register appropriately.
2877 InputReg = DAG.getUNDEF(ArgVT);
2878 }
2879
2880 if (OutgoingArg->isRegister()) {
2881 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2882 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2883 report_fatal_error("failed to allocate implicit input argument");
2884 } else {
2885 unsigned SpecialArgOffset =
2886 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2887 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2888 SpecialArgOffset);
2889 MemOpChains.push_back(ArgStore);
2890 }
2891 }
2892
2893 // Pack the workitem IDs into a single register, or pass them as-is if they
2894 // are already packed.
2895 const ArgDescriptor *OutgoingArg;
2896 const TargetRegisterClass *ArgRC;
2897 LLT Ty;
2898
2899 std::tie(OutgoingArg, ArgRC, Ty) =
2900 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2901 if (!OutgoingArg)
2902 std::tie(OutgoingArg, ArgRC, Ty) =
2903 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2904 if (!OutgoingArg)
2905 std::tie(OutgoingArg, ArgRC, Ty) =
2906 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2907 if (!OutgoingArg)
2908 return;
2909
2910 const ArgDescriptor *IncomingArgX = std::get<0>(
2911 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
2912 const ArgDescriptor *IncomingArgY = std::get<0>(
2913 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
2914 const ArgDescriptor *IncomingArgZ = std::get<0>(
2915 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
2916
2917 SDValue InputReg;
2918 SDLoc SL;
2919
2920 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
2921 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
2922 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
2923
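  // The three workitem IDs share a single VGPR when passed to a callee. A rough
  // sketch of the packed layout assumed by the shifts below:
  //   packed = X | (Y << 10) | (Z << 20)   // X:[9:0], Y:[19:10], Z:[29:20]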
2924 // If the incoming IDs are not packed, we need to pack them.
2925 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
2926 NeedWorkItemIDX) {
2927 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
2928 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2929 } else {
2930 InputReg = DAG.getConstant(0, DL, MVT::i32);
2931 }
2932 }
2933
2934 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
2935 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
2936 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
2937 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
2938 DAG.getShiftAmountConstant(10, MVT::i32, SL));
2939 InputReg = InputReg.getNode() ?
2940 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
2941 }
2942
2943 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
2944 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
2945 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
2946 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
2947 DAG.getShiftAmountConstant(20, MVT::i32, SL));
2948 InputReg = InputReg.getNode() ?
2949 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
2950 }
2951
2952 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
2953 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
2954 // We're in a situation where the outgoing function requires the workitem
2955 // ID, but the calling function does not have it (e.g. a graphics function
2956 // calling a C calling convention function). This is illegal, but we need
2957 // to produce something.
2958 InputReg = DAG.getUNDEF(MVT::i32);
2959 } else {
2960 // The workitem IDs are already packed; any present incoming argument will
2961 // carry all of the required fields.
2962 ArgDescriptor IncomingArg = ArgDescriptor::createArg(
2963 IncomingArgX ? *IncomingArgX :
2964 IncomingArgY ? *IncomingArgY :
2965 *IncomingArgZ, ~0u);
2966 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
2967 }
2968 }
2969
2970 if (OutgoingArg->isRegister()) {
2971 if (InputReg)
2972 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2973
2974 CCInfo.AllocateReg(OutgoingArg->getRegister());
2975 } else {
2976 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
2977 if (InputReg) {
2978 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2979 SpecialArgOffset);
2980 MemOpChains.push_back(ArgStore);
2981 }
2982 }
2983 }
2984
2985 static bool canGuaranteeTCO(CallingConv::ID CC) {
2986 return CC == CallingConv::Fast;
2987 }
2988
2989 /// Return true if we might ever do TCO for calls with this calling convention.
2990 static bool mayTailCallThisCC(CallingConv::ID CC) {
2991 switch (CC) {
2992 case CallingConv::C:
2993 case CallingConv::AMDGPU_Gfx:
2994 return true;
2995 default:
2996 return canGuaranteeTCO(CC);
2997 }
2998 }
2999
3000 bool SITargetLowering::isEligibleForTailCallOptimization(
3001 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3002 const SmallVectorImpl<ISD::OutputArg> &Outs,
3003 const SmallVectorImpl<SDValue> &OutVals,
3004 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3005 if (!mayTailCallThisCC(CalleeCC))
3006 return false;
3007
3008 // For a divergent call target, we need to do a waterfall loop over the
3009 // possible callees which precludes us from using a simple jump.
3010 if (Callee->isDivergent())
3011 return false;
3012
3013 MachineFunction &MF = DAG.getMachineFunction();
3014 const Function &CallerF = MF.getFunction();
3015 CallingConv::ID CallerCC = CallerF.getCallingConv();
3016 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3017 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3018
3019 // Kernels aren't callable and don't have a live-in return address, so it
3020 // doesn't make sense to do a tail call with entry functions.
3021 if (!CallerPreserved)
3022 return false;
3023
3024 bool CCMatch = CallerCC == CalleeCC;
3025
3026 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3027 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3028 return true;
3029 return false;
3030 }
3031
3032 // TODO: Can we handle var args?
3033 if (IsVarArg)
3034 return false;
3035
3036 for (const Argument &Arg : CallerF.args()) {
3037 if (Arg.hasByValAttr())
3038 return false;
3039 }
3040
3041 LLVMContext &Ctx = *DAG.getContext();
3042
3043 // Check that the call results are passed in the same way.
3044 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3045 CCAssignFnForCall(CalleeCC, IsVarArg),
3046 CCAssignFnForCall(CallerCC, IsVarArg)))
3047 return false;
3048
3049 // The callee has to preserve all registers the caller needs to preserve.
3050 if (!CCMatch) {
3051 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3052 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3053 return false;
3054 }
3055
3056 // Nothing more to check if the callee is taking no arguments.
3057 if (Outs.empty())
3058 return true;
3059
3060 SmallVector<CCValAssign, 16> ArgLocs;
3061 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3062
3063 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3064
3065 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3066 // If the stack arguments for this call do not fit into our own save area then
3067 // the call cannot be made tail.
3068 // TODO: Is this really necessary?
3069 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3070 return false;
3071
3072 const MachineRegisterInfo &MRI = MF.getRegInfo();
3073 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3074 }
3075
3076 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3077 if (!CI->isTailCall())
3078 return false;
3079
3080 const Function *ParentFn = CI->getParent()->getParent();
3081 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3082 return false;
3083 return true;
3084 }
3085
3086 // The wave scratch offset register is used as the global base pointer.
3087 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3088 SmallVectorImpl<SDValue> &InVals) const {
3089 SelectionDAG &DAG = CLI.DAG;
3090 const SDLoc &DL = CLI.DL;
3091 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3092 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3093 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3094 SDValue Chain = CLI.Chain;
3095 SDValue Callee = CLI.Callee;
3096 bool &IsTailCall = CLI.IsTailCall;
3097 CallingConv::ID CallConv = CLI.CallConv;
3098 bool IsVarArg = CLI.IsVarArg;
3099 bool IsSibCall = false;
3100 bool IsThisReturn = false;
3101 MachineFunction &MF = DAG.getMachineFunction();
3102
3103 if (Callee.isUndef() || isNullConstant(Callee)) {
3104 if (!CLI.IsTailCall) {
3105 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3106 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3107 }
3108
3109 return Chain;
3110 }
3111
3112 if (IsVarArg) {
3113 return lowerUnhandledCall(CLI, InVals,
3114 "unsupported call to variadic function ");
3115 }
3116
3117 if (!CLI.CB)
3118 report_fatal_error("unsupported libcall legalization");
3119
3120 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3121 return lowerUnhandledCall(CLI, InVals,
3122 "unsupported required tail call to function ");
3123 }
3124
3125 if (AMDGPU::isShader(CallConv)) {
3126 // Note the issue is with the CC of the called function, not of the call
3127 // itself.
3128 return lowerUnhandledCall(CLI, InVals,
3129 "unsupported call to a shader function ");
3130 }
3131
3132 if (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
3133 CallConv != CallingConv::AMDGPU_Gfx) {
3134 // Only allow calls with specific calling conventions.
3135 return lowerUnhandledCall(CLI, InVals,
3136 "unsupported calling convention for call from "
3137 "graphics shader of function ");
3138 }
3139
3140 if (IsTailCall) {
3141 IsTailCall = isEligibleForTailCallOptimization(
3142 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3143 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
3144 report_fatal_error("failed to perform tail call elimination on a call "
3145 "site marked musttail");
3146 }
3147
3148 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3149
3150 // A sibling call is one where we're under the usual C ABI and are not
3151 // planning to change that, but can still do a tail call.
3152 if (!TailCallOpt && IsTailCall)
3153 IsSibCall = true;
3154
3155 if (IsTailCall)
3156 ++NumTailCalls;
3157 }
3158
3159 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3160 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3161 SmallVector<SDValue, 8> MemOpChains;
3162
3163 // Analyze operands of the call, assigning locations to each operand.
3164 SmallVector<CCValAssign, 16> ArgLocs;
3165 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3166 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3167
3168 if (CallConv != CallingConv::AMDGPU_Gfx) {
3169 // With a fixed ABI, allocate fixed registers before user arguments.
3170 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3171 }
3172
3173 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3174
3175 // Get a count of how many bytes are to be pushed on the stack.
3176 unsigned NumBytes = CCInfo.getNextStackOffset();
3177
3178 if (IsSibCall) {
3179 // Since we're not changing the ABI to make this a tail call, the memory
3180 // operands are already available in the caller's incoming argument space.
3181 NumBytes = 0;
3182 }
3183
3184 // FPDiff is the byte offset of the call's argument area from the callee's.
3185 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3186 // by this amount for a tail call. In a sibling call it must be 0 because the
3187 // caller will deallocate the entire stack and the callee still expects its
3188 // arguments to begin at SP+0. Completely unused for non-tail calls.
3189 int32_t FPDiff = 0;
3190 MachineFrameInfo &MFI = MF.getFrameInfo();
3191
3192 // Adjust the stack pointer for the new arguments...
3193 // These operations are automatically eliminated by the prolog/epilog pass
3194 if (!IsSibCall) {
3195 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3196
3197 if (!Subtarget->enableFlatScratch()) {
3198 SmallVector<SDValue, 4> CopyFromChains;
3199
3200 // In the HSA case, this should be an identity copy.
3201 SDValue ScratchRSrcReg
3202 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3203 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3204 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3205 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3206 }
3207 }
3208
3209 MVT PtrVT = MVT::i32;
3210
3211 // Walk the register/memloc assignments, inserting copies/loads.
3212 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3213 CCValAssign &VA = ArgLocs[i];
3214 SDValue Arg = OutVals[i];
3215
3216 // Promote the value if needed.
3217 switch (VA.getLocInfo()) {
3218 case CCValAssign::Full:
3219 break;
3220 case CCValAssign::BCvt:
3221 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3222 break;
3223 case CCValAssign::ZExt:
3224 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3225 break;
3226 case CCValAssign::SExt:
3227 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3228 break;
3229 case CCValAssign::AExt:
3230 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3231 break;
3232 case CCValAssign::FPExt:
3233 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3234 break;
3235 default:
3236 llvm_unreachable("Unknown loc info!");
3237 }
3238
3239 if (VA.isRegLoc()) {
3240 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3241 } else {
3242 assert(VA.isMemLoc());
3243
3244 SDValue DstAddr;
3245 MachinePointerInfo DstInfo;
3246
3247 unsigned LocMemOffset = VA.getLocMemOffset();
3248 int32_t Offset = LocMemOffset;
3249
3250 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3251 MaybeAlign Alignment;
3252
3253 if (IsTailCall) {
3254 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3255 unsigned OpSize = Flags.isByVal() ?
3256 Flags.getByValSize() : VA.getValVT().getStoreSize();
3257
3258 // FIXME: We can have better than the minimum byval required alignment.
3259 Alignment =
3260 Flags.isByVal()
3261 ? Flags.getNonZeroByValAlign()
3262 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3263
3264 Offset = Offset + FPDiff;
3265 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3266
3267 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3268 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3269
3270 // Make sure any stack arguments overlapping with where we're storing
3271 // are loaded before this eventual operation. Otherwise they'll be
3272 // clobbered.
3273
3274 // FIXME: Why is this really necessary? This seems to just result in a
3275 // lot of code to copy the stack and write them back to the same
3276 // locations, which are supposed to be immutable?
3277 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3278 } else {
3279 // Stores to the argument stack area are relative to the stack pointer.
3280 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3281 MVT::i32);
3282 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3283 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3284 Alignment =
3285 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3286 }
3287
3288 if (Outs[i].Flags.isByVal()) {
3289 SDValue SizeNode =
3290 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3291 SDValue Cpy =
3292 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3293 Outs[i].Flags.getNonZeroByValAlign(),
3294 /*isVol = */ false, /*AlwaysInline = */ true,
3295 /*isTailCall = */ false, DstInfo,
3296 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3297
3298 MemOpChains.push_back(Cpy);
3299 } else {
3300 SDValue Store =
3301 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3302 MemOpChains.push_back(Store);
3303 }
3304 }
3305 }
3306
3307 if (!MemOpChains.empty())
3308 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3309
3310 // Build a sequence of copy-to-reg nodes chained together with token chain
3311 // and flag operands which copy the outgoing args into the appropriate regs.
3312 SDValue InFlag;
3313 for (auto &RegToPass : RegsToPass) {
3314 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3315 RegToPass.second, InFlag);
3316 InFlag = Chain.getValue(1);
3317 }
3318
3319
3320 // We don't usually want to end the call-sequence here because we would tidy
3321 // the frame up *after* the call; however, in the ABI-changing tail-call case
3322 // we've carefully laid out the parameters so that when sp is reset they'll be
3323 // in the correct location.
3324 if (IsTailCall && !IsSibCall) {
3325 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InFlag, DL);
3326 InFlag = Chain.getValue(1);
3327 }
3328
3329 std::vector<SDValue> Ops;
3330 Ops.push_back(Chain);
3331 Ops.push_back(Callee);
3332 // Add a redundant copy of the callee global which will not be legalized, as
3333 // we need direct access to the callee later.
3334 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3335 const GlobalValue *GV = GSD->getGlobal();
3336 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3337 } else {
3338 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3339 }
3340
3341 if (IsTailCall) {
3342 // Each tail call may have to adjust the stack by a different amount, so
3343 // this information must travel along with the operation for eventual
3344 // consumption by emitEpilogue.
3345 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3346 }
3347
3348 // Add argument registers to the end of the list so that they are known live
3349 // into the call.
3350 for (auto &RegToPass : RegsToPass) {
3351 Ops.push_back(DAG.getRegister(RegToPass.first,
3352 RegToPass.second.getValueType()));
3353 }
3354
3355 // Add a register mask operand representing the call-preserved registers.
3356
3357 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3358 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3359 assert(Mask && "Missing call preserved mask for calling convention");
3360 Ops.push_back(DAG.getRegisterMask(Mask));
3361
3362 if (InFlag.getNode())
3363 Ops.push_back(InFlag);
3364
3365 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3366
3367 // If we're doing a tail call, use a TC_RETURN here rather than an
3368 // actual call instruction.
3369 if (IsTailCall) {
3370 MFI.setHasTailCall();
3371 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
3372 }
3373
3374 // Returns a chain and a flag for retval copy to use.
3375 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3376 Chain = Call.getValue(0);
3377 InFlag = Call.getValue(1);
3378
3379 uint64_t CalleePopBytes = NumBytes;
3380 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InFlag, DL);
3381 if (!Ins.empty())
3382 InFlag = Chain.getValue(1);
3383
3384 // Handle result values, copying them out of physregs into vregs that we
3385 // return.
3386 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3387 InVals, IsThisReturn,
3388 IsThisReturn ? OutVals[0] : SDValue());
3389 }
3390
3391 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3392 // except for applying the wave size scale to the increment amount.
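// For example (a sketch, assuming a wave64 subtarget): allocating 16 bytes per
// lane advances the per-wave scratch pointer by 16 << 6 = 1024 bytes, which is
// the shift by getWavefrontSizeLog2() applied below.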
3393 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3394 SDValue Op, SelectionDAG &DAG) const {
3395 const MachineFunction &MF = DAG.getMachineFunction();
3396 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3397
3398 SDLoc dl(Op);
3399 EVT VT = Op.getValueType();
3400 SDValue Tmp1 = Op;
3401 SDValue Tmp2 = Op.getValue(1);
3402 SDValue Tmp3 = Op.getOperand(2);
3403 SDValue Chain = Tmp1.getOperand(0);
3404
3405 Register SPReg = Info->getStackPtrOffsetReg();
3406
3407 // Chain the dynamic stack allocation so that it doesn't modify the stack
3408 // pointer when other instructions are using the stack.
3409 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3410
3411 SDValue Size = Tmp2.getOperand(1);
3412 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3413 Chain = SP.getValue(1);
3414 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3415 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3416 const TargetFrameLowering *TFL = ST.getFrameLowering();
3417 unsigned Opc =
3418 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3419 ISD::ADD : ISD::SUB;
3420
3421 SDValue ScaledSize = DAG.getNode(
3422 ISD::SHL, dl, VT, Size,
3423 DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3424
3425 Align StackAlign = TFL->getStackAlign();
3426 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3427 if (Alignment && *Alignment > StackAlign) {
3428 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3429 DAG.getConstant(-(uint64_t)Alignment->value()
3430 << ST.getWavefrontSizeLog2(),
3431 dl, VT));
3432 }
3433
3434 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3435 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3436
3437 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3438 }
3439
3440 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3441 SelectionDAG &DAG) const {
3442 // We only handle constant sizes here to allow non-entry block, static sized
3443 // allocas. A truly dynamic value is more difficult to support because we
3444 // don't know if the size value is uniform or not. If the size isn't uniform,
3445 // we would need to do a wave reduction to get the maximum size to know how
3446 // much to increment the uniform stack pointer.
3447 SDValue Size = Op.getOperand(1);
3448 if (isa<ConstantSDNode>(Size))
3449 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3450
3451 return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3452 }
3453
3454 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
3455 const MachineFunction &MF) const {
3456 Register Reg = StringSwitch<Register>(RegName)
3457 .Case("m0", AMDGPU::M0)
3458 .Case("exec", AMDGPU::EXEC)
3459 .Case("exec_lo", AMDGPU::EXEC_LO)
3460 .Case("exec_hi", AMDGPU::EXEC_HI)
3461 .Case("flat_scratch", AMDGPU::FLAT_SCR)
3462 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3463 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3464 .Default(Register());
3465
3466 if (Reg == AMDGPU::NoRegister) {
3467 report_fatal_error(Twine("invalid register name \""
3468 + StringRef(RegName) + "\"."));
3469
3470 }
3471
3472 if (!Subtarget->hasFlatScrRegister() &&
3473 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3474 report_fatal_error(Twine("invalid register \""
3475 + StringRef(RegName) + "\" for subtarget."));
3476 }
3477
3478 switch (Reg) {
3479 case AMDGPU::M0:
3480 case AMDGPU::EXEC_LO:
3481 case AMDGPU::EXEC_HI:
3482 case AMDGPU::FLAT_SCR_LO:
3483 case AMDGPU::FLAT_SCR_HI:
3484 if (VT.getSizeInBits() == 32)
3485 return Reg;
3486 break;
3487 case AMDGPU::EXEC:
3488 case AMDGPU::FLAT_SCR:
3489 if (VT.getSizeInBits() == 64)
3490 return Reg;
3491 break;
3492 default:
3493 llvm_unreachable("missing register type checking");
3494 }
3495
3496 report_fatal_error(Twine("invalid type for register \""
3497 + StringRef(RegName) + "\"."));
3498 }
3499
3500 // If kill is not the last instruction, split the block so kill is always a
3501 // proper terminator.
3502 MachineBasicBlock *
3503 SITargetLowering::splitKillBlock(MachineInstr &MI,
3504 MachineBasicBlock *BB) const {
3505 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3506 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3507 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3508 return SplitBB;
3509 }
3510
3511 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
3512 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3513 // be the first instruction in the remainder block.
3514 //
3515 /// \returns { LoopBody, Remainder }
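///
/// The resulting control flow is roughly: MBB -> LoopBB, LoopBB -> LoopBB
/// (back edge), LoopBB -> RemainderBB, and RemainderBB inherits MBB's original
/// successors.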
3516 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3517 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
3518 MachineFunction *MF = MBB.getParent();
3519 MachineBasicBlock::iterator I(&MI);
3520
3521 // To insert the loop we need to split the block. Move everything after this
3522 // point to a new block, and insert a new empty block between the two.
3523 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3524 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3525 MachineFunction::iterator MBBI(MBB);
3526 ++MBBI;
3527
3528 MF->insert(MBBI, LoopBB);
3529 MF->insert(MBBI, RemainderBB);
3530
3531 LoopBB->addSuccessor(LoopBB);
3532 LoopBB->addSuccessor(RemainderBB);
3533
3534 // Move the rest of the block into a new block.
3535 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3536
3537 if (InstInLoop) {
3538 auto Next = std::next(I);
3539
3540 // Move instruction to loop body.
3541 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3542
3543 // Move the rest of the block.
3544 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3545 } else {
3546 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3547 }
3548
3549 MBB.addSuccessor(LoopBB);
3550
3551 return std::pair(LoopBB, RemainderBB);
3552 }
3553
3554 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
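/// (Used below for the DS_GWS_* instructions, which require an s_waitcnt 0 to
/// issue immediately after them; bundling keeps the pair together.)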
3555 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
3556 MachineBasicBlock *MBB = MI.getParent();
3557 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3558 auto I = MI.getIterator();
3559 auto E = std::next(I);
3560
3561 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3562 .addImm(0);
3563
3564 MIBundleBuilder Bundler(*MBB, I, E);
3565 finalizeBundle(*MBB, Bundler.begin());
3566 }
3567
3568 MachineBasicBlock *
3569 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
3570 MachineBasicBlock *BB) const {
3571 const DebugLoc &DL = MI.getDebugLoc();
3572
3573 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3574
3575 MachineBasicBlock *LoopBB;
3576 MachineBasicBlock *RemainderBB;
3577 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3578
3579 // Apparently kill flags are only valid if the def is in the same block?
3580 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3581 Src->setIsKill(false);
3582
3583 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3584
3585 MachineBasicBlock::iterator I = LoopBB->end();
3586
3587 const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3588 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
3589
3590 // Clear TRAP_STS.MEM_VIOL
3591 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3592 .addImm(0)
3593 .addImm(EncodedReg);
3594
3595 bundleInstWithWaitcnt(MI);
3596
3597 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3598
3599 // Load and check TRAP_STS.MEM_VIOL
3600 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3601 .addImm(EncodedReg);
3602
3603 // FIXME: Do we need to use an isel pseudo that may clobber scc?
3604 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3605 .addReg(Reg, RegState::Kill)
3606 .addImm(0);
3607 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3608 .addMBB(LoopBB);
3609
3610 return RemainderBB;
3611 }
3612
3613 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3614 // wavefront. If the value is uniform and just happens to be in a VGPR, this
3615 // will only do one iteration. In the worst case, this will loop 64 times.
3616 //
3617 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
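//
// Roughly, the emitted waterfall loop is (a sketch, not the exact MIR):
//   loop:
//     CurIdx = v_readfirstlane_b32(Idx)   ; take the index from one lane
//     Cond   = (Idx == CurIdx)            ; all lanes sharing that index
//     exec  &= Cond                       ; run just those lanes this trip
//     M0 / SGPRIdxReg = CurIdx + Offset
//     <indexed access inserted by the caller at the returned point>
//     exec   = exec_at_iteration_entry & ~Cond   ; retire the handled lanes
//     s_cbranch_execnz loop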
3618 static MachineBasicBlock::iterator
3619 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
3620 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3621 const DebugLoc &DL, const MachineOperand &Idx,
3622 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3623 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3624 Register &SGPRIdxReg) {
3625
3626 MachineFunction *MF = OrigBB.getParent();
3627 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3628 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3629 MachineBasicBlock::iterator I = LoopBB.begin();
3630
3631 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3632 Register PhiExec = MRI.createVirtualRegister(BoolRC);
3633 Register NewExec = MRI.createVirtualRegister(BoolRC);
3634 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3635 Register CondReg = MRI.createVirtualRegister(BoolRC);
3636
3637 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3638 .addReg(InitReg)
3639 .addMBB(&OrigBB)
3640 .addReg(ResultReg)
3641 .addMBB(&LoopBB);
3642
3643 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3644 .addReg(InitSaveExecReg)
3645 .addMBB(&OrigBB)
3646 .addReg(NewExec)
3647 .addMBB(&LoopBB);
3648
3649 // Read the next variant <- also loop target.
3650 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3651 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3652
3653 // Compare the just-read index value against the per-lane Idx values.
3654 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3655 .addReg(CurrentIdxReg)
3656 .addReg(Idx.getReg(), 0, Idx.getSubReg());
3657
3658 // Update EXEC, save the original EXEC value to VCC.
3659 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3660 : AMDGPU::S_AND_SAVEEXEC_B64),
3661 NewExec)
3662 .addReg(CondReg, RegState::Kill);
3663
3664 MRI.setSimpleHint(NewExec, CondReg);
3665
3666 if (UseGPRIdxMode) {
3667 if (Offset == 0) {
3668 SGPRIdxReg = CurrentIdxReg;
3669 } else {
3670 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3671 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3672 .addReg(CurrentIdxReg, RegState::Kill)
3673 .addImm(Offset);
3674 }
3675 } else {
3676 // Move the index into M0 (adding the constant offset if any).
3677 if (Offset == 0) {
3678 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3679 .addReg(CurrentIdxReg, RegState::Kill);
3680 } else {
3681 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3682 .addReg(CurrentIdxReg, RegState::Kill)
3683 .addImm(Offset);
3684 }
3685 }
3686
3687 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3688 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3689 MachineInstr *InsertPt =
3690 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3691 : AMDGPU::S_XOR_B64_term), Exec)
3692 .addReg(Exec)
3693 .addReg(NewExec);
3694
3695 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3696 // s_cbranch_scc0?
3697
3698 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3699 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3700 .addMBB(&LoopBB);
3701
3702 return InsertPt->getIterator();
3703 }
3704
3705 // This has slightly sub-optimal regalloc when the source vector is killed by
3706 // the read. The register allocator does not understand that the kill is
3707 // per-workitem, so the vector is kept alive for the whole loop. We therefore
3708 // end up not reusing a subregister from it, using one more VGPR than
3709 // necessary. This was avoided when this was expanded after register allocation.
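//
// Roughly: save EXEC, split the block, emit the waterfall loop above, and add a
// landing-pad block that restores EXEC before falling through to the remainder.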
3710 static MachineBasicBlock::iterator
3711 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
3712 unsigned InitResultReg, unsigned PhiReg, int Offset,
3713 bool UseGPRIdxMode, Register &SGPRIdxReg) {
3714 MachineFunction *MF = MBB.getParent();
3715 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3716 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3717 MachineRegisterInfo &MRI = MF->getRegInfo();
3718 const DebugLoc &DL = MI.getDebugLoc();
3719 MachineBasicBlock::iterator I(&MI);
3720
3721 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3722 Register DstReg = MI.getOperand(0).getReg();
3723 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3724 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3725 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3726 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3727
3728 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3729
3730 // Save the EXEC mask
3731 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3732 .addReg(Exec);
3733
3734 MachineBasicBlock *LoopBB;
3735 MachineBasicBlock *RemainderBB;
3736 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3737
3738 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3739
3740 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3741 InitResultReg, DstReg, PhiReg, TmpExec,
3742 Offset, UseGPRIdxMode, SGPRIdxReg);
3743
3744 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
3745 MachineFunction::iterator MBBI(LoopBB);
3746 ++MBBI;
3747 MF->insert(MBBI, LandingPad);
3748 LoopBB->removeSuccessor(RemainderBB);
3749 LandingPad->addSuccessor(RemainderBB);
3750 LoopBB->addSuccessor(LandingPad);
3751 MachineBasicBlock::iterator First = LandingPad->begin();
3752 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3753 .addReg(SaveExec);
3754
3755 return InsPt;
3756 }
3757
3758 // Returns subreg index, offset
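// For example (a sketch): with a 128-bit (4 x 32-bit) vector register class, a
// constant offset of 2 maps to sub2 with a remaining offset of 0, while an
// out-of-bounds offset is returned as-is with sub0 so that we never name an
// undefined subregister.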
3759 static std::pair<unsigned, int>
3760 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3761 const TargetRegisterClass *SuperRC,
3762 unsigned VecReg,
3763 int Offset) {
3764 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3765
3766 // Skip out of bounds offsets, or else we would end up using an undefined
3767 // register.
3768 if (Offset >= NumElts || Offset < 0)
3769 return std::pair(AMDGPU::sub0, Offset);
3770
3771 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3772 }
3773
3774 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3775 MachineRegisterInfo &MRI, MachineInstr &MI,
3776 int Offset) {
3777 MachineBasicBlock *MBB = MI.getParent();
3778 const DebugLoc &DL = MI.getDebugLoc();
3779 MachineBasicBlock::iterator I(&MI);
3780
3781 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3782
3783 assert(Idx->getReg() != AMDGPU::NoRegister);
3784
3785 if (Offset == 0) {
3786 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3787 } else {
3788 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3789 .add(*Idx)
3790 .addImm(Offset);
3791 }
3792 }
3793
3794 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
3795 MachineRegisterInfo &MRI, MachineInstr &MI,
3796 int Offset) {
3797 MachineBasicBlock *MBB = MI.getParent();
3798 const DebugLoc &DL = MI.getDebugLoc();
3799 MachineBasicBlock::iterator I(&MI);
3800
3801 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3802
3803 if (Offset == 0)
3804 return Idx->getReg();
3805
3806 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3807 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3808 .add(*Idx)
3809 .addImm(Offset);
3810 return Tmp;
3811 }
3812
3813 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3814 MachineBasicBlock &MBB,
3815 const GCNSubtarget &ST) {
3816 const SIInstrInfo *TII = ST.getInstrInfo();
3817 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3818 MachineFunction *MF = MBB.getParent();
3819 MachineRegisterInfo &MRI = MF->getRegInfo();
3820
3821 Register Dst = MI.getOperand(0).getReg();
3822 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3823 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3824 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3825
3826 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3827 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3828
3829 unsigned SubReg;
3830 std::tie(SubReg, Offset)
3831 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3832
3833 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3834
3835 // Check for an SGPR index.
3836 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3837 MachineBasicBlock::iterator I(&MI);
3838 const DebugLoc &DL = MI.getDebugLoc();
3839
3840 if (UseGPRIdxMode) {
3841 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3842 // to avoid interfering with other uses, so probably requires a new
3843 // optimization pass.
3844 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3845
3846 const MCInstrDesc &GPRIDXDesc =
3847 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3848 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3849 .addReg(SrcReg)
3850 .addReg(Idx)
3851 .addImm(SubReg);
3852 } else {
3853 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3854
3855 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3856 .addReg(SrcReg, 0, SubReg)
3857 .addReg(SrcReg, RegState::Implicit);
3858 }
3859
3860 MI.eraseFromParent();
3861
3862 return &MBB;
3863 }
3864
3865 // Control flow needs to be inserted if indexing with a VGPR.
3866 const DebugLoc &DL = MI.getDebugLoc();
3867 MachineBasicBlock::iterator I(&MI);
3868
3869 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3870 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3871
3872 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3873
3874 Register SGPRIdxReg;
3875 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
3876 UseGPRIdxMode, SGPRIdxReg);
3877
3878 MachineBasicBlock *LoopBB = InsPt->getParent();
3879
3880 if (UseGPRIdxMode) {
3881 const MCInstrDesc &GPRIDXDesc =
3882 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3883
3884 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3885 .addReg(SrcReg)
3886 .addReg(SGPRIdxReg)
3887 .addImm(SubReg);
3888 } else {
3889 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3890 .addReg(SrcReg, 0, SubReg)
3891 .addReg(SrcReg, RegState::Implicit);
3892 }
3893
3894 MI.eraseFromParent();
3895
3896 return LoopBB;
3897 }
3898
3899 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3900 MachineBasicBlock &MBB,
3901 const GCNSubtarget &ST) {
3902 const SIInstrInfo *TII = ST.getInstrInfo();
3903 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3904 MachineFunction *MF = MBB.getParent();
3905 MachineRegisterInfo &MRI = MF->getRegInfo();
3906
3907 Register Dst = MI.getOperand(0).getReg();
3908 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3909 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3910 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3911 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3912 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3913 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3914
3915 // This can be an immediate, but will be folded later.
3916 assert(Val->getReg());
3917
3918 unsigned SubReg;
3919 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3920 SrcVec->getReg(),
3921 Offset);
3922 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3923
3924 if (Idx->getReg() == AMDGPU::NoRegister) {
3925 MachineBasicBlock::iterator I(&MI);
3926 const DebugLoc &DL = MI.getDebugLoc();
3927
3928 assert(Offset == 0);
3929
3930 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3931 .add(*SrcVec)
3932 .add(*Val)
3933 .addImm(SubReg);
3934
3935 MI.eraseFromParent();
3936 return &MBB;
3937 }
3938
3939 // Check for an SGPR index.
3940 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3941 MachineBasicBlock::iterator I(&MI);
3942 const DebugLoc &DL = MI.getDebugLoc();
3943
3944 if (UseGPRIdxMode) {
3945 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3946
3947 const MCInstrDesc &GPRIDXDesc =
3948 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3949 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3950 .addReg(SrcVec->getReg())
3951 .add(*Val)
3952 .addReg(Idx)
3953 .addImm(SubReg);
3954 } else {
3955 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3956
3957 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3958 TRI.getRegSizeInBits(*VecRC), 32, false);
3959 BuildMI(MBB, I, DL, MovRelDesc, Dst)
3960 .addReg(SrcVec->getReg())
3961 .add(*Val)
3962 .addImm(SubReg);
3963 }
3964 MI.eraseFromParent();
3965 return &MBB;
3966 }
3967
3968 // Control flow needs to be inserted if indexing with a VGPR.
3969 if (Val->isReg())
3970 MRI.clearKillFlags(Val->getReg());
3971
3972 const DebugLoc &DL = MI.getDebugLoc();
3973
3974 Register PhiReg = MRI.createVirtualRegister(VecRC);
3975
3976 Register SGPRIdxReg;
3977 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
3978 UseGPRIdxMode, SGPRIdxReg);
3979 MachineBasicBlock *LoopBB = InsPt->getParent();
3980
3981 if (UseGPRIdxMode) {
3982 const MCInstrDesc &GPRIDXDesc =
3983 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3984
3985 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3986 .addReg(PhiReg)
3987 .add(*Val)
3988 .addReg(SGPRIdxReg)
3989 .addImm(AMDGPU::sub0);
3990 } else {
3991 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3992 TRI.getRegSizeInBits(*VecRC), 32, false);
3993 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
3994 .addReg(PhiReg)
3995 .add(*Val)
3996 .addImm(AMDGPU::sub0);
3997 }
3998
3999 MI.eraseFromParent();
4000 return LoopBB;
4001 }
4002
4003 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
4004 MachineInstr &MI, MachineBasicBlock *BB) const {
4005
4006 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4007 MachineFunction *MF = BB->getParent();
4008 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
4009
4010 switch (MI.getOpcode()) {
4011 case AMDGPU::S_UADDO_PSEUDO:
4012 case AMDGPU::S_USUBO_PSEUDO: {
4013 const DebugLoc &DL = MI.getDebugLoc();
4014 MachineOperand &Dest0 = MI.getOperand(0);
4015 MachineOperand &Dest1 = MI.getOperand(1);
4016 MachineOperand &Src0 = MI.getOperand(2);
4017 MachineOperand &Src1 = MI.getOperand(3);
4018
4019 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4020 ? AMDGPU::S_ADD_I32
4021 : AMDGPU::S_SUB_I32;
4022 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4023
4024 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4025 .addImm(1)
4026 .addImm(0);
4027
4028 MI.eraseFromParent();
4029 return BB;
4030 }
4031 case AMDGPU::S_ADD_U64_PSEUDO:
4032 case AMDGPU::S_SUB_U64_PSEUDO: {
4033 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4034 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4035 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4036 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4037 const DebugLoc &DL = MI.getDebugLoc();
4038
4039 MachineOperand &Dest = MI.getOperand(0);
4040 MachineOperand &Src0 = MI.getOperand(1);
4041 MachineOperand &Src1 = MI.getOperand(2);
4042
4043 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4044 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4045
4046 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4047 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4048 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4049 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4050
4051 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4052 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4053 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4054 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4055
4056 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4057
4058 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4059 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4060 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
4061 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
4062 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4063 .addReg(DestSub0)
4064 .addImm(AMDGPU::sub0)
4065 .addReg(DestSub1)
4066 .addImm(AMDGPU::sub1);
4067 MI.eraseFromParent();
4068 return BB;
4069 }
4070 case AMDGPU::V_ADD_U64_PSEUDO:
4071 case AMDGPU::V_SUB_U64_PSEUDO: {
4072 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4073 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4074 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4075 const DebugLoc &DL = MI.getDebugLoc();
4076
4077 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4078
4079 MachineOperand &Dest = MI.getOperand(0);
4080 MachineOperand &Src0 = MI.getOperand(1);
4081 MachineOperand &Src1 = MI.getOperand(2);
4082
4083 if (IsAdd && ST.hasLshlAddB64()) {
4084 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4085 Dest.getReg())
4086 .add(Src0)
4087 .addImm(0)
4088 .add(Src1);
4089 TII->legalizeOperands(*Add);
4090 MI.eraseFromParent();
4091 return BB;
4092 }
4093
4094 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4095
4096 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4097 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4098
4099 Register CarryReg = MRI.createVirtualRegister(CarryRC);
4100 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4101
4102 const TargetRegisterClass *Src0RC = Src0.isReg()
4103 ? MRI.getRegClass(Src0.getReg())
4104 : &AMDGPU::VReg_64RegClass;
4105 const TargetRegisterClass *Src1RC = Src1.isReg()
4106 ? MRI.getRegClass(Src1.getReg())
4107 : &AMDGPU::VReg_64RegClass;
4108
4109 const TargetRegisterClass *Src0SubRC =
4110 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4111 const TargetRegisterClass *Src1SubRC =
4112 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4113
4114 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4115 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4116 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4117 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4118
4119 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4120 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4121 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4122 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4123
4124 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4125 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4126 .addReg(CarryReg, RegState::Define)
4127 .add(SrcReg0Sub0)
4128 .add(SrcReg1Sub0)
4129 .addImm(0); // clamp bit
4130
4131 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4132 MachineInstr *HiHalf =
4133 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4134 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4135 .add(SrcReg0Sub1)
4136 .add(SrcReg1Sub1)
4137 .addReg(CarryReg, RegState::Kill)
4138 .addImm(0); // clamp bit
4139
4140 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4141 .addReg(DestSub0)
4142 .addImm(AMDGPU::sub0)
4143 .addReg(DestSub1)
4144 .addImm(AMDGPU::sub1);
4145 TII->legalizeOperands(*LoHalf);
4146 TII->legalizeOperands(*HiHalf);
4147 MI.eraseFromParent();
4148 return BB;
4149 }
4150 case AMDGPU::S_ADD_CO_PSEUDO:
4151 case AMDGPU::S_SUB_CO_PSEUDO: {
4152 // This pseudo can only be selected from a uniform add/subcarry node, so all
4153 // of its VGPR operands are assumed to hold splat values and are read back to
4154 // SGPRs with v_readfirstlane below.
4155 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4156 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4157 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4158 MachineBasicBlock::iterator MII = MI;
4159 const DebugLoc &DL = MI.getDebugLoc();
4160 MachineOperand &Dest = MI.getOperand(0);
4161 MachineOperand &CarryDest = MI.getOperand(1);
4162 MachineOperand &Src0 = MI.getOperand(2);
4163 MachineOperand &Src1 = MI.getOperand(3);
4164 MachineOperand &Src2 = MI.getOperand(4);
4165 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
4166 ? AMDGPU::S_ADDC_U32
4167 : AMDGPU::S_SUBB_U32;
4168 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
4169 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4170 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
4171 .addReg(Src0.getReg());
4172 Src0.setReg(RegOp0);
4173 }
4174 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
4175 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4176 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
4177 .addReg(Src1.getReg());
4178 Src1.setReg(RegOp1);
4179 }
4180 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4181 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
4182 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
4183 .addReg(Src2.getReg());
4184 Src2.setReg(RegOp2);
4185 }
4186
4187 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
4188 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
4189 assert(WaveSize == 64 || WaveSize == 32);
4190
4191 if (WaveSize == 64) {
4192 if (ST.hasScalarCompareEq64()) {
4193 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
4194 .addReg(Src2.getReg())
4195 .addImm(0);
4196 } else {
4197 const TargetRegisterClass *SubRC =
4198 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
4199 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
4200 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
4201 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
4202 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
4203 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4204
4205 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
4206 .add(Src2Sub0)
4207 .add(Src2Sub1);
4208
4209 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4210 .addReg(Src2_32, RegState::Kill)
4211 .addImm(0);
4212 }
4213 } else {
4214 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4215 .addReg(Src2.getReg())
4216 .addImm(0);
4217 }
4218
4219 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
4220
4221 unsigned SelOpc =
4222 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
4223
4224 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
4225 .addImm(-1)
4226 .addImm(0);
4227
4228 MI.eraseFromParent();
4229 return BB;
4230 }
4231 case AMDGPU::SI_INIT_M0: {
4232 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
4233 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4234 .add(MI.getOperand(0));
4235 MI.eraseFromParent();
4236 return BB;
4237 }
4238 case AMDGPU::GET_GROUPSTATICSIZE: {
4239 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
4240 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
4241 DebugLoc DL = MI.getDebugLoc();
4242 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
4243 .add(MI.getOperand(0))
4244 .addImm(MFI->getLDSSize());
4245 MI.eraseFromParent();
4246 return BB;
4247 }
4248 case AMDGPU::SI_INDIRECT_SRC_V1:
4249 case AMDGPU::SI_INDIRECT_SRC_V2:
4250 case AMDGPU::SI_INDIRECT_SRC_V4:
4251 case AMDGPU::SI_INDIRECT_SRC_V8:
4252 case AMDGPU::SI_INDIRECT_SRC_V9:
4253 case AMDGPU::SI_INDIRECT_SRC_V10:
4254 case AMDGPU::SI_INDIRECT_SRC_V11:
4255 case AMDGPU::SI_INDIRECT_SRC_V12:
4256 case AMDGPU::SI_INDIRECT_SRC_V16:
4257 case AMDGPU::SI_INDIRECT_SRC_V32:
4258 return emitIndirectSrc(MI, *BB, *getSubtarget());
4259 case AMDGPU::SI_INDIRECT_DST_V1:
4260 case AMDGPU::SI_INDIRECT_DST_V2:
4261 case AMDGPU::SI_INDIRECT_DST_V4:
4262 case AMDGPU::SI_INDIRECT_DST_V8:
4263 case AMDGPU::SI_INDIRECT_DST_V9:
4264 case AMDGPU::SI_INDIRECT_DST_V10:
4265 case AMDGPU::SI_INDIRECT_DST_V11:
4266 case AMDGPU::SI_INDIRECT_DST_V12:
4267 case AMDGPU::SI_INDIRECT_DST_V16:
4268 case AMDGPU::SI_INDIRECT_DST_V32:
4269 return emitIndirectDst(MI, *BB, *getSubtarget());
4270 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
4271 case AMDGPU::SI_KILL_I1_PSEUDO:
4272 return splitKillBlock(MI, BB);
4273 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
4274 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4275 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4276 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4277
4278 Register Dst = MI.getOperand(0).getReg();
4279 Register Src0 = MI.getOperand(1).getReg();
4280 Register Src1 = MI.getOperand(2).getReg();
4281 const DebugLoc &DL = MI.getDebugLoc();
4282 Register SrcCond = MI.getOperand(3).getReg();
4283
4284 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4285 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4286 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4287 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
4288
4289 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
4290 .addReg(SrcCond);
4291 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
4292 .addImm(0)
4293 .addReg(Src0, 0, AMDGPU::sub0)
4294 .addImm(0)
4295 .addReg(Src1, 0, AMDGPU::sub0)
4296 .addReg(SrcCondCopy);
4297 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
4298 .addImm(0)
4299 .addReg(Src0, 0, AMDGPU::sub1)
4300 .addImm(0)
4301 .addReg(Src1, 0, AMDGPU::sub1)
4302 .addReg(SrcCondCopy);
4303
4304 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
4305 .addReg(DstLo)
4306 .addImm(AMDGPU::sub0)
4307 .addReg(DstHi)
4308 .addImm(AMDGPU::sub1);
4309 MI.eraseFromParent();
4310 return BB;
4311 }
4312 case AMDGPU::SI_BR_UNDEF: {
4313 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4314 const DebugLoc &DL = MI.getDebugLoc();
4315 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4316 .add(MI.getOperand(0));
4317 Br->getOperand(1).setIsUndef(); // read undef SCC
4318 MI.eraseFromParent();
4319 return BB;
4320 }
4321 case AMDGPU::ADJCALLSTACKUP:
4322 case AMDGPU::ADJCALLSTACKDOWN: {
4323 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4324 MachineInstrBuilder MIB(*MF, &MI);
4325 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
4326 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
4327 return BB;
4328 }
4329 case AMDGPU::SI_CALL_ISEL: {
4330 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4331 const DebugLoc &DL = MI.getDebugLoc();
4332
4333 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
4334
4335 MachineInstrBuilder MIB;
4336 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
4337
4338 for (const MachineOperand &MO : MI.operands())
4339 MIB.add(MO);
4340
4341 MIB.cloneMemRefs(MI);
4342 MI.eraseFromParent();
4343 return BB;
4344 }
4345 case AMDGPU::V_ADD_CO_U32_e32:
4346 case AMDGPU::V_SUB_CO_U32_e32:
4347 case AMDGPU::V_SUBREV_CO_U32_e32: {
4348 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
4349 const DebugLoc &DL = MI.getDebugLoc();
4350 unsigned Opc = MI.getOpcode();
4351
4352 bool NeedClampOperand = false;
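    // If the e32 (VOP2) form of this opcode has no real encoding on this
    // subtarget, switch to the VOP3 (e64) form, which takes an explicit VCC
    // def and a clamp operand.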
4353 if (TII->pseudoToMCOpcode(Opc) == -1) {
4354 Opc = AMDGPU::getVOPe64(Opc);
4355 NeedClampOperand = true;
4356 }
4357
4358 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
4359 if (TII->isVOP3(*I)) {
4360 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4361 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4362 I.addReg(TRI->getVCC(), RegState::Define);
4363 }
4364 I.add(MI.getOperand(1))
4365 .add(MI.getOperand(2));
4366 if (NeedClampOperand)
4367 I.addImm(0); // clamp bit for e64 encoding
4368
4369 TII->legalizeOperands(*I);
4370
4371 MI.eraseFromParent();
4372 return BB;
4373 }
4374 case AMDGPU::V_ADDC_U32_e32:
4375 case AMDGPU::V_SUBB_U32_e32:
4376 case AMDGPU::V_SUBBREV_U32_e32:
4377 // These instructions have an implicit use of vcc which counts towards the
4378 // constant bus limit.
4379 TII->legalizeOperands(MI);
4380 return BB;
4381 case AMDGPU::DS_GWS_INIT:
4382 case AMDGPU::DS_GWS_SEMA_BR:
4383 case AMDGPU::DS_GWS_BARRIER:
4384 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
4385 [[fallthrough]];
4386 case AMDGPU::DS_GWS_SEMA_V:
4387 case AMDGPU::DS_GWS_SEMA_P:
4388 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
4389 // An s_waitcnt 0 is required to be the instruction immediately following.
4390 if (getSubtarget()->hasGWSAutoReplay()) {
4391 bundleInstWithWaitcnt(MI);
4392 return BB;
4393 }
4394
4395 return emitGWSMemViolTestLoop(MI, BB);
4396 case AMDGPU::S_SETREG_B32: {
4397 // Try to optimize cases that only set the denormal mode or rounding mode.
4398 //
4399 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
4400 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
4401 // instead.
4402 //
4403 // FIXME: This could be predicated on the immediate, but tablegen doesn't
4404 // allow a no-side-effect instruction in the output of a side-effecting
4405 // pattern.
4406 unsigned ID, Offset, Width;
4407 AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
4408 if (ID != AMDGPU::Hwreg::ID_MODE)
4409 return BB;
4410
4411 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
4412 const unsigned SetMask = WidthMask << Offset;
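    // In the MODE register the rounding mode occupies the low 4 bits and the
    // denormal mode the next 4, so these masks identify writes that cover
    // exactly one of those fields (or both).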
4413
4414 if (getSubtarget()->hasDenormModeInst()) {
4415 unsigned SetDenormOp = 0;
4416 unsigned SetRoundOp = 0;
4417
4418 // The dedicated instructions can only set the whole denorm or round mode
4419 // at once, not a subset of bits in either.
4420 if (SetMask ==
4421 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
4422 // If this fully sets both the round and denorm mode, emit the two
4423 // dedicated instructions for these.
4424 SetRoundOp = AMDGPU::S_ROUND_MODE;
4425 SetDenormOp = AMDGPU::S_DENORM_MODE;
4426 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
4427 SetRoundOp = AMDGPU::S_ROUND_MODE;
4428 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
4429 SetDenormOp = AMDGPU::S_DENORM_MODE;
4430 }
4431
4432 if (SetRoundOp || SetDenormOp) {
4433 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4434 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
4435 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
4436 unsigned ImmVal = Def->getOperand(1).getImm();
4437 if (SetRoundOp) {
4438 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
4439 .addImm(ImmVal & 0xf);
4440
4441 // If we also have the denorm mode, get just the denorm mode bits.
4442 ImmVal >>= 4;
4443 }
4444
4445 if (SetDenormOp) {
4446 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
4447 .addImm(ImmVal & 0xf);
4448 }
4449
4450 MI.eraseFromParent();
4451 return BB;
4452 }
4453 }
4454 }
4455
4456 // If only FP bits are touched, use the no-side-effects pseudo.
4457 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
4458 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
4459 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
4460
4461 return BB;
4462 }
4463 default:
4464 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
4465 }
4466 }
4467
4468 bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
4469 return isTypeLegal(VT.getScalarType());
4470 }
4471
4472 bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const {
4473 switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
4474 case MVT::f32:
4475 return Subtarget->hasAtomicFaddRtnInsts();
4476 case MVT::v2f16:
4477 case MVT::f64:
4478 return Subtarget->hasGFX90AInsts();
4479 default:
4480 return false;
4481 }
4482 }
4483
4484 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
4485 // This currently forces unfolding various combinations of fsub into fma with
4486 // free fneg'd operands. As long as we have fast FMA (controlled by
4487 // isFMAFasterThanFMulAndFAdd), we should perform these.
4488
4489 // When fma is quarter rate, for f64 where add / sub are at best half rate,
4490 // most of these combines appear to be cycle neutral but save on instruction
4491 // count / code size.
4492 return true;
4493 }
4494
4495 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
4496
4497 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
4498 EVT VT) const {
4499 if (!VT.isVector()) {
4500 return MVT::i1;
4501 }
4502 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
4503 }
4504
4505 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
4506 // TODO: Should i16 be used always if legal? For now it would force VALU
4507 // shifts.
4508 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
4509 }
4510
4511 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
4512 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
4513 ? Ty.changeElementSize(16)
4514 : Ty.changeElementSize(32);
4515 }
4516
4517 // Answering this is somewhat tricky and depends on the specific device which
4518 // have different rates for fma or all f64 operations.
4519 //
4520 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
4521 // regardless of which device (although the number of cycles differs between
4522 // devices), so it is always profitable for f64.
4523 //
4524 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
4525 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
4526 // which we can always do even without fused FP ops since it returns the same
4527 // result as the separate operations and since it is always full
4528 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
4529 // however does not support denormals, so we do report fma as faster if we have
4530 // a fast fma device and require denormals.
4531 //
4532 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
4533 EVT VT) const {
4534 VT = VT.getScalarType();
4535
4536 switch (VT.getSimpleVT().SimpleTy) {
4537 case MVT::f32: {
4538 // If mad is not available this depends only on if f32 fma is full rate.
4539 if (!Subtarget->hasMadMacF32Insts())
4540 return Subtarget->hasFastFMAF32();
4541
4542 // Otherwise f32 mad is always full rate and returns the same result as
4543 // the separate operations, so it should be preferred over fma.
4544 // However, it does not support denormals.
4545 if (hasFP32Denormals(MF))
4546 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
4547
4548 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
4549 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
4550 }
4551 case MVT::f64:
4552 return true;
4553 case MVT::f16:
4554 return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF);
4555 default:
4556 break;
4557 }
4558
4559 return false;
4560 }
4561
4562 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
4563 LLT Ty) const {
4564 switch (Ty.getScalarSizeInBits()) {
4565 case 16:
4566 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
4567 case 32:
4568 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
4569 case 64:
4570 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
4571 default:
4572 break;
4573 }
4574
4575 return false;
4576 }
4577
4578 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
4579 if (!Ty.isScalar())
4580 return false;
4581
4582 if (Ty.getScalarSizeInBits() == 16)
4583 return Subtarget->hasMadF16() && !hasFP64FP16Denormals(*MI.getMF());
4584 if (Ty.getScalarSizeInBits() == 32)
4585 return Subtarget->hasMadMacF32Insts() && !hasFP32Denormals(*MI.getMF());
4586
4587 return false;
4588 }
4589
4590 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
4591 const SDNode *N) const {
4592 // TODO: Check future ftz flag
4593 // v_mad_f32/v_mac_f32 do not support denormals.
4594 EVT VT = N->getValueType(0);
4595 if (VT == MVT::f32)
4596 return Subtarget->hasMadMacF32Insts() &&
4597 !hasFP32Denormals(DAG.getMachineFunction());
4598 if (VT == MVT::f16) {
4599 return Subtarget->hasMadF16() &&
4600 !hasFP64FP16Denormals(DAG.getMachineFunction());
4601 }
4602
4603 return false;
4604 }
4605
4606 //===----------------------------------------------------------------------===//
4607 // Custom DAG Lowering Operations
4608 //===----------------------------------------------------------------------===//
4609
4610 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
4611 // wider vector type is legal.
4612 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
4613 SelectionDAG &DAG) const {
4614 unsigned Opc = Op.getOpcode();
4615 EVT VT = Op.getValueType();
4616 assert(VT == MVT::v4f16 || VT == MVT::v4i16);
4617
4618 SDValue Lo, Hi;
4619 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
4620
4621 SDLoc SL(Op);
4622 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
4623 Op->getFlags());
4624 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
4625 Op->getFlags());
4626
4627 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4628 }
4629
4630 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
4631 // wider vector type is legal.
4632 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
4633 SelectionDAG &DAG) const {
4634 unsigned Opc = Op.getOpcode();
4635 EVT VT = Op.getValueType();
4636 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
4637 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
4638 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
4639 VT == MVT::v32f32);
4640
4641 SDValue Lo0, Hi0;
4642 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
4643 SDValue Lo1, Hi1;
4644 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
4645
4646 SDLoc SL(Op);
4647
4648 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
4649 Op->getFlags());
4650 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
4651 Op->getFlags());
4652
4653 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4654 }
4655
4656 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
4657 SelectionDAG &DAG) const {
4658 unsigned Opc = Op.getOpcode();
4659 EVT VT = Op.getValueType();
4660 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
4661 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
4662 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
4663 VT == MVT::v32f32);
4664
4665 SDValue Lo0, Hi0;
4666 SDValue Op0 = Op.getOperand(0);
4667 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
4668 ? DAG.SplitVectorOperand(Op.getNode(), 0)
4669 : std::pair(Op0, Op0);
4670 SDValue Lo1, Hi1;
4671 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
4672 SDValue Lo2, Hi2;
4673 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
4674
4675 SDLoc SL(Op);
4676 auto ResVT = DAG.GetSplitDestVTs(VT);
4677
4678 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
4679 Op->getFlags());
4680 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
4681 Op->getFlags());
4682
4683 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4684 }
4685
4686
4687 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
4688 switch (Op.getOpcode()) {
4689 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
4690 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
4691 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
4692 case ISD::LOAD: {
4693 SDValue Result = LowerLOAD(Op, DAG);
4694 assert((!Result.getNode() ||
4695 Result.getNode()->getNumValues() == 2) &&
4696 "Load should return a value and a chain");
4697 return Result;
4698 }
4699
4700 case ISD::FSIN:
4701 case ISD::FCOS:
4702 return LowerTrig(Op, DAG);
4703 case ISD::SELECT: return LowerSELECT(Op, DAG);
4704 case ISD::FDIV: return LowerFDIV(Op, DAG);
4705 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
4706 case ISD::STORE: return LowerSTORE(Op, DAG);
4707 case ISD::GlobalAddress: {
4708 MachineFunction &MF = DAG.getMachineFunction();
4709 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
4710 return LowerGlobalAddress(MFI, Op, DAG);
4711 }
4712 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
4713 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
4714 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
4715 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
4716 case ISD::INSERT_SUBVECTOR:
4717 return lowerINSERT_SUBVECTOR(Op, DAG);
4718 case ISD::INSERT_VECTOR_ELT:
4719 return lowerINSERT_VECTOR_ELT(Op, DAG);
4720 case ISD::EXTRACT_VECTOR_ELT:
4721 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
4722 case ISD::VECTOR_SHUFFLE:
4723 return lowerVECTOR_SHUFFLE(Op, DAG);
4724 case ISD::SCALAR_TO_VECTOR:
4725 return lowerSCALAR_TO_VECTOR(Op, DAG);
4726 case ISD::BUILD_VECTOR:
4727 return lowerBUILD_VECTOR(Op, DAG);
4728 case ISD::FP_ROUND:
4729 return lowerFP_ROUND(Op, DAG);
4730 case ISD::FPTRUNC_ROUND: {
4731 unsigned Opc;
4732 SDLoc DL(Op);
4733
4734 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
4735 return SDValue();
4736
4737 // Get the rounding mode from the last operand
4738 int RoundMode = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4739 if (RoundMode == (int)RoundingMode::TowardPositive)
4740 Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
4741 else if (RoundMode == (int)RoundingMode::TowardNegative)
4742 Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
4743 else
4744 return SDValue();
4745
4746 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
4747 }
4748 case ISD::TRAP:
4749 return lowerTRAP(Op, DAG);
4750 case ISD::DEBUGTRAP:
4751 return lowerDEBUGTRAP(Op, DAG);
4752 case ISD::FABS:
4753 case ISD::FNEG:
4754 case ISD::FCANONICALIZE:
4755 case ISD::BSWAP:
4756 return splitUnaryVectorOp(Op, DAG);
4757 case ISD::FMINNUM:
4758 case ISD::FMAXNUM:
4759 return lowerFMINNUM_FMAXNUM(Op, DAG);
4760 case ISD::FMA:
4761 return splitTernaryVectorOp(Op, DAG);
4762 case ISD::FP_TO_SINT:
4763 case ISD::FP_TO_UINT:
4764 return LowerFP_TO_INT(Op, DAG);
4765 case ISD::SHL:
4766 case ISD::SRA:
4767 case ISD::SRL:
4768 case ISD::ADD:
4769 case ISD::SUB:
4770 case ISD::MUL:
4771 case ISD::SMIN:
4772 case ISD::SMAX:
4773 case ISD::UMIN:
4774 case ISD::UMAX:
4775 case ISD::FADD:
4776 case ISD::FMUL:
4777 case ISD::FMINNUM_IEEE:
4778 case ISD::FMAXNUM_IEEE:
4779 case ISD::UADDSAT:
4780 case ISD::USUBSAT:
4781 case ISD::SADDSAT:
4782 case ISD::SSUBSAT:
4783 return splitBinaryVectorOp(Op, DAG);
4784 case ISD::SMULO:
4785 case ISD::UMULO:
4786 return lowerXMULO(Op, DAG);
4787 case ISD::SMUL_LOHI:
4788 case ISD::UMUL_LOHI:
4789 return lowerXMUL_LOHI(Op, DAG);
4790 case ISD::DYNAMIC_STACKALLOC:
4791 return LowerDYNAMIC_STACKALLOC(Op, DAG);
4792 }
4793 return SDValue();
4794 }
4795
4796 // Used for D16: Casts the result of an instruction into the right vector,
4797 // packs values if loads return unpacked values.
4798 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
4799 const SDLoc &DL,
4800 SelectionDAG &DAG, bool Unpacked) {
4801 if (!LoadVT.isVector())
4802 return Result;
4803
4804 // Cast back to the original packed type or to a larger type that is a
4805 // multiple of 32 bits for D16. Widening the return type is required for
4806 // legalization.
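// e.g. an illegal v3f16 result is widened to v4f16 here.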
4807 EVT FittingLoadVT = LoadVT;
4808 if ((LoadVT.getVectorNumElements() % 2) == 1) {
4809 FittingLoadVT =
4810 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
4811 LoadVT.getVectorNumElements() + 1);
4812 }
4813
4814 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
4815 // Truncate to v2i16/v4i16.
4816 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
4817
4818 // Work around the legalizer scalarizing the truncate after vector op
4819 // legalization but not creating an intermediate vector trunc.
4820 SmallVector<SDValue, 4> Elts;
4821 DAG.ExtractVectorElements(Result, Elts);
4822 for (SDValue &Elt : Elts)
4823 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
4824
4825 // Pad illegal v1i16/v3f16 to v4i16
4826 if ((LoadVT.getVectorNumElements() % 2) == 1)
4827 Elts.push_back(DAG.getUNDEF(MVT::i16));
4828
4829 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
4830
4831 // Bitcast to original type (v2f16/v4f16).
4832 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
4833 }
4834
4835 // Cast back to the original packed type.
4836 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
4837 }
4838
4839 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
4840 MemSDNode *M,
4841 SelectionDAG &DAG,
4842 ArrayRef<SDValue> Ops,
4843 bool IsIntrinsic) const {
4844 SDLoc DL(M);
4845
4846 bool Unpacked = Subtarget->hasUnpackedD16VMem();
4847 EVT LoadVT = M->getValueType(0);
4848
4849 EVT EquivLoadVT = LoadVT;
4850 if (LoadVT.isVector()) {
4851 if (Unpacked) {
4852 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4853 LoadVT.getVectorNumElements());
4854 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
4855 // Widen v3f16 to legal type
4856 EquivLoadVT =
4857 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
4858 LoadVT.getVectorNumElements() + 1);
4859 }
4860 }
4861
4862 // Change from v4f16/v2f16 to EquivLoadVT.
4863 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
4864
4865 SDValue Load
4866 = DAG.getMemIntrinsicNode(
4867 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
4868 VTList, Ops, M->getMemoryVT(),
4869 M->getMemOperand());
4870
4871 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
4872
4873 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
4874 }
4875
4876 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
4877 SelectionDAG &DAG,
4878 ArrayRef<SDValue> Ops) const {
4879 SDLoc DL(M);
4880 EVT LoadVT = M->getValueType(0);
4881 EVT EltType = LoadVT.getScalarType();
4882 EVT IntVT = LoadVT.changeTypeToInteger();
4883
4884 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
4885
4886 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
4887 bool IsTFE = M->getNumValues() == 3;
4888
4889 unsigned Opc;
4890 if (IsFormat) {
4891 Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
4892 : AMDGPUISD::BUFFER_LOAD_FORMAT;
4893 } else {
4894 // TODO: Support non-format TFE loads.
4895 if (IsTFE)
4896 return SDValue();
4897 Opc = AMDGPUISD::BUFFER_LOAD;
4898 }
4899
4900 if (IsD16) {
4901 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
4902 }
4903
4904 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
4905 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
4906 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
4907
4908 if (isTypeLegal(LoadVT)) {
4909 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
4910 M->getMemOperand(), DAG);
4911 }
4912
4913 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
4914 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
4915 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
4916 M->getMemOperand(), DAG);
4917 return DAG.getMergeValues(
4918 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
4919 DL);
4920 }
4921
4922 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
4923 SDNode *N, SelectionDAG &DAG) {
4924 EVT VT = N->getValueType(0);
4925 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
4926 unsigned CondCode = CD->getZExtValue();
4927 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
4928 return DAG.getUNDEF(VT);
4929
4930 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
4931
4932 SDValue LHS = N->getOperand(1);
4933 SDValue RHS = N->getOperand(2);
4934
4935 SDLoc DL(N);
4936
4937 EVT CmpVT = LHS.getValueType();
4938 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
4939 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
4940 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4941 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
4942 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
4943 }
4944
4945 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
4946
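  // The AMDGPU setcc result is a lane mask: an integer as wide as the
  // wavefront (i32 for wave32, i64 for wave64), with one bit per lane.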
4947 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
4948 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
4949
4950 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
4951 DAG.getCondCode(CCOpcode));
4952 if (VT.bitsEq(CCVT))
4953 return SetCC;
4954 return DAG.getZExtOrTrunc(SetCC, DL, VT);
4955 }
4956
4957 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
4958 SDNode *N, SelectionDAG &DAG) {
4959 EVT VT = N->getValueType(0);
4960 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
4961
4962 unsigned CondCode = CD->getZExtValue();
4963 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
4964 return DAG.getUNDEF(VT);
4965
4966 SDValue Src0 = N->getOperand(1);
4967 SDValue Src1 = N->getOperand(2);
4968 EVT CmpVT = Src0.getValueType();
4969 SDLoc SL(N);
4970
4971 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
4972 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
4973 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
4974 }
4975
4976 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
4977 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
4978 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
4979 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
4980 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
4981 Src1, DAG.getCondCode(CCOpcode));
4982 if (VT.bitsEq(CCVT))
4983 return SetCC;
4984 return DAG.getZExtOrTrunc(SetCC, SL, VT);
4985 }
4986
4987 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
4988 SelectionDAG &DAG) {
4989 EVT VT = N->getValueType(0);
4990 SDValue Src = N->getOperand(1);
4991 SDLoc SL(N);
4992
4993 if (Src.getOpcode() == ISD::SETCC) {
4994 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
4995 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
4996 Src.getOperand(1), Src.getOperand(2));
4997 }
4998 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
4999 // (ballot 0) -> 0
5000 if (Arg->isZero())
5001 return DAG.getConstant(0, SL, VT);
5002
5003 // (ballot 1) -> EXEC/EXEC_LO
5004 if (Arg->isOne()) {
5005 Register Exec;
5006 if (VT.getScalarSizeInBits() == 32)
5007 Exec = AMDGPU::EXEC_LO;
5008 else if (VT.getScalarSizeInBits() == 64)
5009 Exec = AMDGPU::EXEC;
5010 else
5011 return SDValue();
5012
5013 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
5014 }
5015 }
5016
5017 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
5018 // ISD::SETNE)
5019 return DAG.getNode(
5020 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
5021 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
5022 }
5023
5024 void SITargetLowering::ReplaceNodeResults(SDNode *N,
5025 SmallVectorImpl<SDValue> &Results,
5026 SelectionDAG &DAG) const {
5027 switch (N->getOpcode()) {
5028 case ISD::INSERT_VECTOR_ELT: {
5029 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
5030 Results.push_back(Res);
5031 return;
5032 }
5033 case ISD::EXTRACT_VECTOR_ELT: {
5034 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
5035 Results.push_back(Res);
5036 return;
5037 }
5038 case ISD::INTRINSIC_WO_CHAIN: {
5039 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
5040 switch (IID) {
5041 case Intrinsic::amdgcn_cvt_pkrtz: {
5042 SDValue Src0 = N->getOperand(1);
5043 SDValue Src1 = N->getOperand(2);
5044 SDLoc SL(N);
5045 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
5046 Src0, Src1);
5047 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
5048 return;
5049 }
5050 case Intrinsic::amdgcn_cvt_pknorm_i16:
5051 case Intrinsic::amdgcn_cvt_pknorm_u16:
5052 case Intrinsic::amdgcn_cvt_pk_i16:
5053 case Intrinsic::amdgcn_cvt_pk_u16: {
5054 SDValue Src0 = N->getOperand(1);
5055 SDValue Src1 = N->getOperand(2);
5056 SDLoc SL(N);
5057 unsigned Opcode;
5058
5059 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
5060 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5061 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
5062 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5063 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
5064 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5065 else
5066 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5067
5068 EVT VT = N->getValueType(0);
5069 if (isTypeLegal(VT))
5070 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
5071 else {
5072 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
5073 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
5074 }
5075 return;
5076 }
5077 }
5078 break;
5079 }
5080 case ISD::INTRINSIC_W_CHAIN: {
5081 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
5082 if (Res.getOpcode() == ISD::MERGE_VALUES) {
5083 // FIXME: Hacky
5084 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
5085 Results.push_back(Res.getOperand(I));
5086 }
5087 } else {
5088 Results.push_back(Res);
5089 Results.push_back(Res.getValue(1));
5090 }
5091 return;
5092 }
5093
5094 break;
5095 }
5096 case ISD::SELECT: {
5097 SDLoc SL(N);
5098 EVT VT = N->getValueType(0);
5099 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
5100 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
5101 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
5102
5103 EVT SelectVT = NewVT;
5104 if (NewVT.bitsLT(MVT::i32)) {
5105 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
5106 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
5107 SelectVT = MVT::i32;
5108 }
5109
5110 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
5111 N->getOperand(0), LHS, RHS);
5112
5113 if (NewVT != SelectVT)
5114 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
5115 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
5116 return;
5117 }
5118 case ISD::FNEG: {
5119 if (N->getValueType(0) != MVT::v2f16)
5120 break;
5121
5122 SDLoc SL(N);
5123 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
5124
5125 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
5126 BC,
5127 DAG.getConstant(0x80008000, SL, MVT::i32));
5128 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
5129 return;
5130 }
5131 case ISD::FABS: {
5132 if (N->getValueType(0) != MVT::v2f16)
5133 break;
5134
5135 SDLoc SL(N);
5136 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
5137
5138 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
5139 BC,
5140 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
5141 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
5142 return;
5143 }
5144 default:
5145 break;
5146 }
5147 }
5148
5149 /// Helper function for LowerBRCOND
5150 static SDNode *findUser(SDValue Value, unsigned Opcode) {
5151
5152 SDNode *Parent = Value.getNode();
5153 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
5154 I != E; ++I) {
5155
5156 if (I.getUse().get() != Value)
5157 continue;
5158
5159 if (I->getOpcode() == Opcode)
5160 return *I;
5161 }
5162 return nullptr;
5163 }
5164
5165 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
5166 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
5167 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
5168 case Intrinsic::amdgcn_if:
5169 return AMDGPUISD::IF;
5170 case Intrinsic::amdgcn_else:
5171 return AMDGPUISD::ELSE;
5172 case Intrinsic::amdgcn_loop:
5173 return AMDGPUISD::LOOP;
5174 case Intrinsic::amdgcn_end_cf:
5175 llvm_unreachable("should not occur");
5176 default:
5177 return 0;
5178 }
5179 }
5180
5181 // break, if_break, else_break are all only used as inputs to loop, not
5182 // directly as branch conditions.
5183 return 0;
5184 }
5185
5186 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
5187 const Triple &TT = getTargetMachine().getTargetTriple();
5188 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
5189 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
5190 AMDGPU::shouldEmitConstantsToTextSection(TT);
5191 }
5192
5193 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
5194 // FIXME: Either avoid relying on address space here or change the default
5195 // address space for functions to avoid the explicit check.
5196 return (GV->getValueType()->isFunctionTy() ||
5197 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
5198 !shouldEmitFixup(GV) &&
5199 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
5200 }
5201
5202 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
5203 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
5204 }
5205
5206 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
5207 if (!GV->hasExternalLinkage())
5208 return true;
5209
5210 const auto OS = getTargetMachine().getTargetTriple().getOS();
5211 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
5212 }
5213
5214 /// This transforms the control flow intrinsics to get the branch destination
5215 /// as the last parameter; it also switches the branch target with BR if the need arises.
5216 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
5217 SelectionDAG &DAG) const {
5218 SDLoc DL(BRCOND);
5219
5220 SDNode *Intr = BRCOND.getOperand(1).getNode();
5221 SDValue Target = BRCOND.getOperand(2);
5222 SDNode *BR = nullptr;
5223 SDNode *SetCC = nullptr;
5224
5225 if (Intr->getOpcode() == ISD::SETCC) {
5226 // As long as we negate the condition everything is fine
5227 SetCC = Intr;
5228 Intr = SetCC->getOperand(0).getNode();
5229
5230 } else {
5231 // Get the target from BR if we don't negate the condition
5232 BR = findUser(BRCOND, ISD::BR);
5233 assert(BR && "brcond missing unconditional branch user");
5234 Target = BR->getOperand(1);
5235 }
5236
5237 unsigned CFNode = isCFIntrinsic(Intr);
5238 if (CFNode == 0) {
5239 // This is a uniform branch so we don't need to legalize.
5240 return BRCOND;
5241 }
5242
5243 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
5244 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
5245
5246 assert(!SetCC ||
5247 (SetCC->getConstantOperandVal(1) == 1 &&
5248 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
5249 ISD::SETNE));
5250
5251 // operands of the new intrinsic call
5252 SmallVector<SDValue, 4> Ops;
5253 if (HaveChain)
5254 Ops.push_back(BRCOND.getOperand(0));
5255
5256 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
5257 Ops.push_back(Target);
5258
5259 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
5260
5261 // build the new intrinsic call
5262 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
5263
5264 if (!HaveChain) {
5265 SDValue Ops[] = {
5266 SDValue(Result, 0),
5267 BRCOND.getOperand(0)
5268 };
5269
5270 Result = DAG.getMergeValues(Ops, DL).getNode();
5271 }
5272
5273 if (BR) {
5274 // Give the branch instruction our target
5275 SDValue Ops[] = {
5276 BR->getOperand(0),
5277 BRCOND.getOperand(2)
5278 };
5279 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
5280 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
5281 }
5282
5283 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
5284
5285 // Copy the intrinsic results to registers
5286 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
5287 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
5288 if (!CopyToReg)
5289 continue;
5290
5291 Chain = DAG.getCopyToReg(
5292 Chain, DL,
5293 CopyToReg->getOperand(1),
5294 SDValue(Result, i - 1),
5295 SDValue());
5296
5297 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
5298 }
5299
5300 // Remove the old intrinsic from the chain
5301 DAG.ReplaceAllUsesOfValueWith(
5302 SDValue(Intr, Intr->getNumValues() - 1),
5303 Intr->getOperand(0));
5304
5305 return Chain;
5306 }
5307
5308 SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
5309 SelectionDAG &DAG) const {
5310 MVT VT = Op.getSimpleValueType();
5311 SDLoc DL(Op);
5312 // Checking the depth
5313 if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
5314 return DAG.getConstant(0, DL, VT);
5315
5316 MachineFunction &MF = DAG.getMachineFunction();
5317 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5318 // Check for kernel and shader functions
5319 if (Info->isEntryFunction())
5320 return DAG.getConstant(0, DL, VT);
5321
5322 MachineFrameInfo &MFI = MF.getFrameInfo();
5323 // There is a call to @llvm.returnaddress in this function
5324 MFI.setReturnAddressIsTaken(true);
5325
5326 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
5327 // Get the return address reg and mark it as an implicit live-in
5328 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
5329
5330 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
5331 }
5332
5333 SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
5334 SDValue Op,
5335 const SDLoc &DL,
5336 EVT VT) const {
5337 return Op.getValueType().bitsLE(VT) ?
5338 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
5339 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
5340 DAG.getTargetConstant(0, DL, MVT::i32));
5341 }
5342
5343 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
5344 assert(Op.getValueType() == MVT::f16 &&
5345 "Do not know how to custom lower FP_ROUND for non-f16 type");
5346
5347 SDValue Src = Op.getOperand(0);
5348 EVT SrcVT = Src.getValueType();
5349 if (SrcVT != MVT::f64)
5350 return Op;
5351
5352 SDLoc DL(Op);
5353
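  // Round f64 to f16 by first producing the half bit pattern in an i32
  // (FP_TO_FP16), then truncating to i16 and bitcasting to f16.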
5354 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
5355 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
5356 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
5357 }
5358
5359 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
5360 SelectionDAG &DAG) const {
5361 EVT VT = Op.getValueType();
5362 const MachineFunction &MF = DAG.getMachineFunction();
5363 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5364 bool IsIEEEMode = Info->getMode().IEEE;
5365
5366 // FIXME: Assert during selection that this is only selected for
5367 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
5368 // mode functions, but this happens to be OK since it's only done in cases
5369 // where there is known no sNaN.
5370 if (IsIEEEMode)
5371 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
5372
5373 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
5374 return splitBinaryVectorOp(Op, DAG);
5375 return Op;
5376 }
5377
5378 SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
5379 EVT VT = Op.getValueType();
5380 SDLoc SL(Op);
5381 SDValue LHS = Op.getOperand(0);
5382 SDValue RHS = Op.getOperand(1);
5383 bool isSigned = Op.getOpcode() == ISD::SMULO;
5384
5385 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
5386 const APInt &C = RHSC->getAPIntValue();
5387 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
5388 if (C.isPowerOf2()) {
5389 // smulo(x, signed_min) is same as umulo(x, signed_min).
5390 bool UseArithShift = isSigned && !C.isMinSignedValue();
5391 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
5392 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
5393 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
5394 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
5395 SL, VT, Result, ShiftAmt),
5396 LHS, ISD::SETNE);
5397 return DAG.getMergeValues({ Result, Overflow }, SL);
5398 }
5399 }
5400
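  // General case: the product overflows iff the high half of the full-width
  // multiply differs from the sign bits of the low half (or from zero for an
  // unsigned multiply).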
5401 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
5402 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
5403 SL, VT, LHS, RHS);
5404
5405 SDValue Sign = isSigned
5406 ? DAG.getNode(ISD::SRA, SL, VT, Result,
5407 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
5408 : DAG.getConstant(0, SL, VT);
5409 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
5410
5411 return DAG.getMergeValues({ Result, Overflow }, SL);
5412 }
5413
5414 SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
5415 if (Op->isDivergent()) {
5416 // Select to V_MAD_[IU]64_[IU]32.
5417 return Op;
5418 }
5419 if (Subtarget->hasSMulHi()) {
5420 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
5421 return SDValue();
5422 }
5423 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
5424 // calculate the high part, so we might as well do the whole thing with
5425 // V_MAD_[IU]64_[IU]32.
5426 return Op;
5427 }
5428
5429 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
5430 if (!Subtarget->isTrapHandlerEnabled() ||
5431 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5432 return lowerTrapEndpgm(Op, DAG);
5433
5434 if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) {
5435 switch (*HsaAbiVer) {
5436 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5437 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5438 return lowerTrapHsaQueuePtr(Op, DAG);
5439 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5440 case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5441 return Subtarget->supportsGetDoorbellID() ?
5442 lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG);
5443 }
5444 }
5445
5446 llvm_unreachable("Unknown trap handler");
5447 }
5448
5449 SDValue SITargetLowering::lowerTrapEndpgm(
5450 SDValue Op, SelectionDAG &DAG) const {
5451 SDLoc SL(Op);
5452 SDValue Chain = Op.getOperand(0);
5453 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
5454 }
5455
5456 SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
5457 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
5458 MachineFunction &MF = DAG.getMachineFunction();
5459 uint64_t Offset = getImplicitParameterOffset(MF, Param);
5460 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
5461 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5462 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
5463 MachineMemOperand::MODereferenceable |
5464 MachineMemOperand::MOInvariant);
5465 }
5466
5467 SDValue SITargetLowering::lowerTrapHsaQueuePtr(
5468 SDValue Op, SelectionDAG &DAG) const {
5469 SDLoc SL(Op);
5470 SDValue Chain = Op.getOperand(0);
5471
5472 SDValue QueuePtr;
5473 // For code object version 5, QueuePtr is passed through implicit kernarg.
5474 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5475 QueuePtr =
5476 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
5477 } else {
5478 MachineFunction &MF = DAG.getMachineFunction();
5479 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5480 Register UserSGPR = Info->getQueuePtrUserSGPR();
5481
5482 if (UserSGPR == AMDGPU::NoRegister) {
5483 // We probably are in a function incorrectly marked with
5484 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
5485 // trap, so just use a null pointer.
5486 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
5487 } else {
5488 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
5489 MVT::i64);
5490 }
5491 }
5492
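  // Hand the queue pointer to the trap handler in s[0:1]: copy it into
  // SGPR0_SGPR1 and pass the register to the TRAP node so it stays live.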
5493 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
5494 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
5495 QueuePtr, SDValue());
5496
5497 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
5498 SDValue Ops[] = {
5499 ToReg,
5500 DAG.getTargetConstant(TrapID, SL, MVT::i16),
5501 SGPR01,
5502 ToReg.getValue(1)
5503 };
5504 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5505 }
5506
5507 SDValue SITargetLowering::lowerTrapHsa(
5508 SDValue Op, SelectionDAG &DAG) const {
5509 SDLoc SL(Op);
5510 SDValue Chain = Op.getOperand(0);
5511
5512 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
5513 SDValue Ops[] = {
5514 Chain,
5515 DAG.getTargetConstant(TrapID, SL, MVT::i16)
5516 };
5517 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5518 }
5519
5520 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
5521 SDLoc SL(Op);
5522 SDValue Chain = Op.getOperand(0);
5523 MachineFunction &MF = DAG.getMachineFunction();
5524
5525 if (!Subtarget->isTrapHandlerEnabled() ||
5526 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5527 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
5528 "debugtrap handler not supported",
5529 Op.getDebugLoc(),
5530 DS_Warning);
5531 LLVMContext &Ctx = MF.getFunction().getContext();
5532 Ctx.diagnose(NoTrap);
5533 return Chain;
5534 }
5535
5536 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
5537 SDValue Ops[] = {
5538 Chain,
5539 DAG.getTargetConstant(TrapID, SL, MVT::i16)
5540 };
5541 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5542 }
5543
5544 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
5545 SelectionDAG &DAG) const {
5546 if (Subtarget->hasApertureRegs()) {
5547 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
5548 ? AMDGPU::SRC_SHARED_BASE
5549 : AMDGPU::SRC_PRIVATE_BASE;
5550 // Note: this feature (register) is broken. When used as a 32-bit operand,
5551 // it returns a wrong value (all zeroes?). The real value is in the upper 32
5552 // bits.
5553 //
5554 // To work around the issue, directly emit a 64 bit mov from this register
5555 // then extract the high bits. Note that this shouldn't even result in a
5556 // shift being emitted and simply become a pair of registers (e.g.):
5557 // s_mov_b64 s[6:7], src_shared_base
5558 // v_mov_b32_e32 v1, s7
5559 //
5560 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
5561 // coalescing would kick in and it would think it's okay to use the "HI"
5562 // subregister directly (instead of extracting the HI 32 bits) which is an
5563 // artificial (unusable) register.
5564 // Register TableGen definitions would need an overhaul to get rid of the
5565 // artificial "HI" aperture registers and prevent this kind of issue from
5566 // happening.
5567 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
5568 DAG.getRegister(ApertureRegNo, MVT::i64));
5569 return DAG.getNode(
5570 ISD::TRUNCATE, DL, MVT::i32,
5571 DAG.getNode(ISD::SRL, DL, MVT::i64,
5572 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
5573 }
5574
5575 // For code object version 5, private_base and shared_base are passed through
5576 // implicit kernargs.
5577 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5578 ImplicitParameter Param =
5579 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
5580 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
5581 }
5582
5583 MachineFunction &MF = DAG.getMachineFunction();
5584 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5585 Register UserSGPR = Info->getQueuePtrUserSGPR();
5586 if (UserSGPR == AMDGPU::NoRegister) {
5587 // We probably are in a function incorrectly marked with
5588 // amdgpu-no-queue-ptr. This is undefined.
5589 return DAG.getUNDEF(MVT::i32);
5590 }
5591
5592 SDValue QueuePtr = CreateLiveInRegister(
5593 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
5594
5595 // Offset into amd_queue_t for group_segment_aperture_base_hi /
5596 // private_segment_aperture_base_hi.
5597 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
5598
5599 SDValue Ptr =
5600 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset));
5601
5602 // TODO: Use custom target PseudoSourceValue.
5603 // TODO: We should use the value from the IR intrinsic call, but it might not
5604 // be available and how do we get it?
5605 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5606 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
5607 commonAlignment(Align(64), StructOffset),
5608 MachineMemOperand::MODereferenceable |
5609 MachineMemOperand::MOInvariant);
5610 }
5611
5612 /// Return true if the value is a known valid address, such that a null check is
5613 /// not necessary.
5614 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
5615 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
5616 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
5617 isa<BasicBlockSDNode>(Val))
5618 return true;
5619
5620 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
5621 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
5622
5623 // TODO: Search through arithmetic, handle arguments and loads
5624 // marked nonnull.
5625 return false;
5626 }
5627
5628 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
5629 SelectionDAG &DAG) const {
5630 SDLoc SL(Op);
5631 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
5632
5633 SDValue Src = ASC->getOperand(0);
5634 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
5635 unsigned SrcAS = ASC->getSrcAddressSpace();
5636
5637 const AMDGPUTargetMachine &TM =
5638 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
5639
5640 // flat -> local/private
5641 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
5642 unsigned DestAS = ASC->getDestAddressSpace();
5643
5644 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
5645 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
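      // A flat pointer into LDS or private memory carries the 32-bit segment
      // offset in its low bits, so the cast is a truncate plus (unless the
      // source is known non-null) a null-pointer check.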
5646 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
5647
5648 if (isKnownNonNull(Src, DAG, TM, SrcAS))
5649 return Ptr;
5650
5651 unsigned NullVal = TM.getNullPointerValue(DestAS);
5652 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
5653 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
5654
5655 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
5656 SegmentNullPtr);
5657 }
5658 }
5659
5660 // local/private -> flat
5661 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
5662 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
5663 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
5664
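      // Build the 64-bit flat pointer from the 32-bit segment offset and the
      // aperture base in the high half, again with a null check unless the
      // source is known non-null.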
5665 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
5666 SDValue CvtPtr =
5667 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
5668 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
5669
5670 if (isKnownNonNull(Src, DAG, TM, SrcAS))
5671 return CvtPtr;
5672
5673 unsigned NullVal = TM.getNullPointerValue(SrcAS);
5674 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
5675
5676 SDValue NonNull
5677 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
5678
5679 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
5680 FlatNullPtr);
5681 }
5682 }
5683
5684 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
5685 Op.getValueType() == MVT::i64) {
5686 const SIMachineFunctionInfo *Info =
5687 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
5688 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
5689 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
5690 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
5691 }
5692
5693 if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
5694 Src.getValueType() == MVT::i64)
5695 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
5696
5697 // global <-> flat are no-ops and never emitted.
5698
5699 const MachineFunction &MF = DAG.getMachineFunction();
5700 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
5701 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
5702 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
5703
5704 return DAG.getUNDEF(ASC->getValueType(0));
5705 }
5706
5707 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
5708 // the small vector and inserting them into the big vector. That is better than
5709 // the default expansion of doing it via a stack slot. Even though the use of
5710 // the stack slot would be optimized away afterwards, the stack slot itself
5711 // remains.
5712 SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
5713 SelectionDAG &DAG) const {
5714 SDValue Vec = Op.getOperand(0);
5715 SDValue Ins = Op.getOperand(1);
5716 SDValue Idx = Op.getOperand(2);
5717 EVT VecVT = Vec.getValueType();
5718 EVT InsVT = Ins.getValueType();
5719 EVT EltVT = VecVT.getVectorElementType();
5720 unsigned InsNumElts = InsVT.getVectorNumElements();
5721 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5722 SDLoc SL(Op);
5723
5724 for (unsigned I = 0; I != InsNumElts; ++I) {
5725 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
5726 DAG.getConstant(I, SL, MVT::i32));
5727 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
5728 DAG.getConstant(IdxVal + I, SL, MVT::i32));
5729 }
5730 return Vec;
5731 }
5732
5733 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
5734 SelectionDAG &DAG) const {
5735 SDValue Vec = Op.getOperand(0);
5736 SDValue InsVal = Op.getOperand(1);
5737 SDValue Idx = Op.getOperand(2);
5738 EVT VecVT = Vec.getValueType();
5739 EVT EltVT = VecVT.getVectorElementType();
5740 unsigned VecSize = VecVT.getSizeInBits();
5741 unsigned EltSize = EltVT.getSizeInBits();
5742 SDLoc SL(Op);
5743
5744 // Specially handle the case of v4i16 with static indexing.
5745 unsigned NumElts = VecVT.getVectorNumElements();
5746 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
5747 if (NumElts == 4 && EltSize == 16 && KIdx) {
5748 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
5749
5750 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
5751 DAG.getConstant(0, SL, MVT::i32));
5752 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
5753 DAG.getConstant(1, SL, MVT::i32));
5754
5755 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
5756 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
5757
5758 unsigned Idx = KIdx->getZExtValue();
5759 bool InsertLo = Idx < 2;
5760 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
5761 InsertLo ? LoVec : HiVec,
5762 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
5763 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
5764
5765 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
5766
5767 SDValue Concat = InsertLo ?
5768 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
5769 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
5770
5771 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
5772 }
5773
5774 // Static indexing does not lower to stack access, and hence there is no need
5775 // for special custom lowering to avoid stack access.
5776 if (isa<ConstantSDNode>(Idx))
5777 return SDValue();
5778
5779 // Avoid stack access for dynamic indexing by custom lowering to
5780 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
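  // e.g. inserting into element 2 of a v4i16: the bit index is 2 * 16 = 32 and
  // the mask covers bits [47:32] of the 64-bit integer view of the vector.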
5781
5782 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
5783
5784 MVT IntVT = MVT::getIntegerVT(VecSize);
5785
5786 // Convert vector index to bit-index and get the required bit mask.
5787 assert(isPowerOf2_32(EltSize));
5788 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
5789 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
5790 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
5791 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
5792 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
5793
5794 // 1. Create a congruent vector with the target value in each element.
5795 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
5796 DAG.getSplatBuildVector(VecVT, SL, InsVal));
5797
5798 // 2. Mask off all other indices except the required index within (1).
5799 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
5800
5801 // 3. Mask off the required index within the target vector.
5802 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
5803 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
5804 DAG.getNOT(SL, BFM, IntVT), BCVec);
5805
5806 // 4. Get (2) and (3) ORed into the target vector.
5807 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
5808
5809 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
5810 }
5811
5812 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
5813 SelectionDAG &DAG) const {
5814 SDLoc SL(Op);
5815
5816 EVT ResultVT = Op.getValueType();
5817 SDValue Vec = Op.getOperand(0);
5818 SDValue Idx = Op.getOperand(1);
5819 EVT VecVT = Vec.getValueType();
5820 unsigned VecSize = VecVT.getSizeInBits();
5821 EVT EltVT = VecVT.getVectorElementType();
5822
5823 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
5824
5825 // Make sure we do any optimizations that will make it easier to fold
5826 // source modifiers before obscuring it with bit operations.
5827
5828 // XXX - Why doesn't this get called when vector_shuffle is expanded?
5829 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
5830 return Combined;
5831
5832 if (VecSize == 128 || VecSize == 256) {
5833 SDValue Lo, Hi;
5834 EVT LoVT, HiVT;
5835 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
5836
5837 if (VecSize == 128) {
5838 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
5839 Lo = DAG.getBitcast(LoVT,
5840 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
5841 DAG.getConstant(0, SL, MVT::i32)));
5842 Hi = DAG.getBitcast(HiVT,
5843 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
5844 DAG.getConstant(1, SL, MVT::i32)));
5845 } else {
5846 assert(VecSize == 256);
5847
5848 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
5849 SDValue Parts[4];
5850 for (unsigned P = 0; P < 4; ++P) {
5851 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
5852 DAG.getConstant(P, SL, MVT::i32));
5853 }
5854
5855 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
5856 Parts[0], Parts[1]));
5857 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
5858 Parts[2], Parts[3]));
5859 }
5860
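    // For example, with a v8i32 source IdxMask == 3, so extracting element 5
    // selects the Hi half (5 > 3) and uses NewIdx == (5 & 3) == 1 within it.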
5861 EVT IdxVT = Idx.getValueType();
5862 unsigned NElem = VecVT.getVectorNumElements();
5863 assert(isPowerOf2_32(NElem));
5864 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
5865 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
5866 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
5867 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
5868 }
5869
5870 assert(VecSize <= 64);
5871
5872 MVT IntVT = MVT::getIntegerVT(VecSize);
5873
5874 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
5875 SDValue VecBC = peekThroughBitcasts(Vec);
5876 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5877 SDValue Src = VecBC.getOperand(0);
5878 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
5879 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
5880 }
5881
5882 unsigned EltSize = EltVT.getSizeInBits();
5883 assert(isPowerOf2_32(EltSize));
5884
5885 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
5886
5887 // Convert vector index to bit-index (* EltSize)
5888 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
5889
5890 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
5891 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
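  // Worked example (sketch): extracting element 1 from a v4i16 vector gives
  // ScaledIdx = 1 << 4 = 16, so Elt is the 64-bit integer shifted right by 16
  // and the truncate/any-extend below keeps only the low 16 bits.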
5892
5893 if (ResultVT == MVT::f16) {
5894 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
5895 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
5896 }
5897
5898 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
5899 }
5900
5901 static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
5902 assert(Elt % 2 == 0);
5903 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
5904 }
5905
5906 SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
5907 SelectionDAG &DAG) const {
5908 SDLoc SL(Op);
5909 EVT ResultVT = Op.getValueType();
5910 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
5911
5912 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
5913 EVT EltVT = PackVT.getVectorElementType();
5914 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
5915
5916 // vector_shuffle <0,1,6,7> lhs, rhs
5917 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
5918 //
5919 // vector_shuffle <6,7,2,3> lhs, rhs
5920 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
5921 //
5922 // vector_shuffle <6,7,0,1> lhs, rhs
5923 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
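//
// A pair that is not contiguous, e.g. a mask starting <1,2,...>, cannot use
// extract_subvector (it starts at an odd element), so the loop below falls
// back to two scalar extracts combined with a build_vector for that pair.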
5924
5925 // Avoid scalarizing when both halves are reading from consecutive elements.
5926 SmallVector<SDValue, 4> Pieces;
5927 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
5928 if (elementPairIsContiguous(SVN->getMask(), I)) {
5929 const int Idx = SVN->getMaskElt(I);
5930 int VecIdx = Idx < SrcNumElts ? 0 : 1;
5931 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
5932 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
5933 PackVT, SVN->getOperand(VecIdx),
5934 DAG.getConstant(EltIdx, SL, MVT::i32));
5935 Pieces.push_back(SubVec);
5936 } else {
5937 const int Idx0 = SVN->getMaskElt(I);
5938 const int Idx1 = SVN->getMaskElt(I + 1);
5939 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
5940 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
5941 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
5942 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
5943
5944 SDValue Vec0 = SVN->getOperand(VecIdx0);
5945 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
5946 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
5947
5948 SDValue Vec1 = SVN->getOperand(VecIdx1);
5949 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
5950 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
5951 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
5952 }
5953 }
5954
5955 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
5956 }
5957
5958 SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
5959 SelectionDAG &DAG) const {
5960 SDValue SVal = Op.getOperand(0);
5961 EVT ResultVT = Op.getValueType();
5962 EVT SValVT = SVal.getValueType();
5963 SDValue UndefVal = DAG.getUNDEF(SValVT);
5964 SDLoc SL(Op);
5965
5966 SmallVector<SDValue, 8> VElts;
5967 VElts.push_back(SVal);
5968 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
5969 VElts.push_back(UndefVal);
5970
5971 return DAG.getBuildVector(ResultVT, SL, VElts);
5972 }
5973
5974 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
5975 SelectionDAG &DAG) const {
5976 SDLoc SL(Op);
5977 EVT VT = Op.getValueType();
5978
5979 if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
5980 VT == MVT::v8i16 || VT == MVT::v8f16) {
5981 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
5982 VT.getVectorNumElements() / 2);
5983 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
5984
5985 // Turn into pair of packed build_vectors.
5986 // TODO: Special case for constants that can be materialized with s_mov_b64.
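  // For instance, a v4f16 build_vector (a, b, c, d) becomes v2f16 (a, b) and
  // v2f16 (c, d), each bitcast to i32, re-assembled as a v2i32, and finally
  // bitcast back to v4f16.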
5987 SmallVector<SDValue, 4> LoOps, HiOps;
5988 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
5989 LoOps.push_back(Op.getOperand(I));
5990 HiOps.push_back(Op.getOperand(I + E));
5991 }
5992 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
5993 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
5994
5995 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
5996 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
5997
5998 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
5999 { CastLo, CastHi });
6000 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
6001 }
6002
6003 if (VT == MVT::v16i16 || VT == MVT::v16f16) {
6004 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
6005 VT.getVectorNumElements() / 4);
6006 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
6007
6008 SmallVector<SDValue, 4> Parts[4];
6009 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
6010 for (unsigned P = 0; P < 4; ++P)
6011 Parts[P].push_back(Op.getOperand(I + P * E));
6012 }
6013 SDValue Casts[4];
6014 for (unsigned P = 0; P < 4; ++P) {
6015 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
6016 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
6017 }
6018
6019 SDValue Blend =
6020 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
6021 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
6022 }
6023
6024 assert(VT == MVT::v2f16 || VT == MVT::v2i16);
6025 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
6026
6027 SDValue Lo = Op.getOperand(0);
6028 SDValue Hi = Op.getOperand(1);
6029
6030 // Avoid adding defined bits with the zero_extend.
6031 if (Hi.isUndef()) {
6032 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
6033 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
6034 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
6035 }
6036
6037 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
6038 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
6039
6040 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
6041 DAG.getConstant(16, SL, MVT::i32));
6042 if (Lo.isUndef())
6043 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
6044
6045 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
6046 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
6047
6048 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
6049 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
6050 }
6051
6052 bool
6053 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
6054 // We can fold offsets for anything that doesn't require a GOT relocation.
6055 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
6056 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6057 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6058 !shouldEmitGOTReloc(GA->getGlobal());
6059 }
6060
6061 static SDValue
6062 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
6063 const SDLoc &DL, int64_t Offset, EVT PtrVT,
6064 unsigned GAFlags = SIInstrInfo::MO_NONE) {
6065 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
6066 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
6067 // lowered to the following code sequence:
6068 //
6069 // For constant address space:
6070 // s_getpc_b64 s[0:1]
6071 // s_add_u32 s0, s0, $symbol
6072 // s_addc_u32 s1, s1, 0
6073 //
6074 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
6075 // a fixup or relocation is emitted to replace $symbol with a literal
6076 // constant, which is a pc-relative offset from the encoding of the $symbol
6077 // operand to the global variable.
6078 //
6079 // For global address space:
6080 // s_getpc_b64 s[0:1]
6081 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
6082 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
6083 //
6084 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
6085 // fixups or relocations are emitted to replace $symbol@*@lo and
6086 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
6087 // which is a 64-bit pc-relative offset from the encoding of the $symbol
6088 // operand to the global variable.
6089 //
6090 // What we want here is an offset from the value returned by s_getpc
6091 // (which is the address of the s_add_u32 instruction) to the global
6092 // variable, but since the encoding of $symbol starts 4 bytes after the start
6093 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
6094 // small. This requires us to add 4 to the global variable offset in order to
6095 // compute the correct address. Similarly for the s_addc_u32 instruction, the
6096 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
6097 // instruction.
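  //
  // Illustrative byte layout (sketch), with offsets relative to the value
  // returned by s_getpc_b64 (the address of the s_add_u32):
  //   +0: s_add_u32  s0, s0, <lo literal>   ; literal encoded at bytes +4..+7
  //   +8: s_addc_u32 s1, s1, <hi literal>   ; literal encoded at bytes +12..+15
  // which is why Offset + 4 and Offset + 12 are used below.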
6098 SDValue PtrLo =
6099 DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
6100 SDValue PtrHi;
6101 if (GAFlags == SIInstrInfo::MO_NONE) {
6102 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
6103 } else {
6104 PtrHi =
6105 DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1);
6106 }
6107 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
6108 }
6109
6110 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
6111 SDValue Op,
6112 SelectionDAG &DAG) const {
6113 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
6114 SDLoc DL(GSD);
6115 EVT PtrVT = Op.getValueType();
6116
6117 const GlobalValue *GV = GSD->getGlobal();
6118 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
6119 shouldUseLDSConstAddress(GV)) ||
6120 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
6121 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
6122 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
6123 GV->hasExternalLinkage()) {
6124 Type *Ty = GV->getValueType();
6125 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
6126 // zero-sized type in other languages to declare dynamic shared
6127 // memory whose size is not known at compile time. Such arrays are
6128 // allocated by the runtime and placed directly after the statically
6129 // allocated ones, and they all share the same offset.
6130 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
6131 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
6132 // Adjust alignment for that dynamic shared memory array.
6133 MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
6134 return SDValue(
6135 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
6136 }
6137 }
6138 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
6139 }
6140
6141 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
6142 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
6143 SIInstrInfo::MO_ABS32_LO);
6144 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
6145 }
6146
6147 if (shouldEmitFixup(GV))
6148 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
6149 else if (shouldEmitPCReloc(GV))
6150 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
6151 SIInstrInfo::MO_REL32);
6152
6153 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
6154 SIInstrInfo::MO_GOTPCREL32);
6155
6156 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
6157 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
6158 const DataLayout &DataLayout = DAG.getDataLayout();
6159 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
6160 MachinePointerInfo PtrInfo
6161 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
6162
6163 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
6164 MachineMemOperand::MODereferenceable |
6165 MachineMemOperand::MOInvariant);
6166 }
6167
6168 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
6169 const SDLoc &DL, SDValue V) const {
6170 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
6171 // the destination register.
6172 //
6173 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
6174 // so we will end up with redundant moves to m0.
6175 //
6176 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
6177
6178 // A Null SDValue creates a glue result.
6179 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
6180 V, Chain);
6181 return SDValue(M0, 0);
6182 }
6183
6184 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
6185 SDValue Op,
6186 MVT VT,
6187 unsigned Offset) const {
6188 SDLoc SL(Op);
6189 SDValue Param = lowerKernargMemParameter(
6190 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
6191 // The local size values will have the hi 16-bits as zero.
6192 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
6193 DAG.getValueType(VT));
6194 }
6195
6196 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
6197 EVT VT) {
6198 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
6199 "non-hsa intrinsic with hsa target",
6200 DL.getDebugLoc());
6201 DAG.getContext()->diagnose(BadIntrin);
6202 return DAG.getUNDEF(VT);
6203 }
6204
6205 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
6206 EVT VT) {
6207 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
6208 "intrinsic not supported on subtarget",
6209 DL.getDebugLoc());
6210 DAG.getContext()->diagnose(BadIntrin);
6211 return DAG.getUNDEF(VT);
6212 }
6213
6214 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
6215 ArrayRef<SDValue> Elts) {
6216 assert(!Elts.empty());
6217 MVT Type;
6218 unsigned NumElts = Elts.size();
6219
6220 if (NumElts <= 12) {
6221 Type = MVT::getVectorVT(MVT::f32, NumElts);
6222 } else {
6223 assert(Elts.size() <= 16);
6224 Type = MVT::v16f32;
6225 NumElts = 16;
6226 }
6227
6228 SmallVector<SDValue, 16> VecElts(NumElts);
6229 for (unsigned i = 0; i < Elts.size(); ++i) {
6230 SDValue Elt = Elts[i];
6231 if (Elt.getValueType() != MVT::f32)
6232 Elt = DAG.getBitcast(MVT::f32, Elt);
6233 VecElts[i] = Elt;
6234 }
6235 for (unsigned i = Elts.size(); i < NumElts; ++i)
6236 VecElts[i] = DAG.getUNDEF(MVT::f32);
6237
6238 if (NumElts == 1)
6239 return VecElts[0];
6240 return DAG.getBuildVector(Type, DL, VecElts);
6241 }
6242
6243 static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
6244 SDValue Src, int ExtraElts) {
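  // Example (sketch): with a v2f32 source, CastVT == v4f32 and ExtraElts == 2,
  // this produces build_vector(src0, src1, undef, undef).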
6245 EVT SrcVT = Src.getValueType();
6246
6247 SmallVector<SDValue, 8> Elts;
6248
6249 if (SrcVT.isVector())
6250 DAG.ExtractVectorElements(Src, Elts);
6251 else
6252 Elts.push_back(Src);
6253
6254 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
6255 while (ExtraElts--)
6256 Elts.push_back(Undef);
6257
6258 return DAG.getBuildVector(CastVT, DL, Elts);
6259 }
6260
6261 // Re-construct the required return value for an image load intrinsic.
6262 // This is more complicated due to the optional use of TexFailCtrl, which means
6263 // the required return type is an aggregate.
6264 static SDValue constructRetValue(SelectionDAG &DAG,
6265 MachineSDNode *Result,
6266 ArrayRef<EVT> ResultTypes,
6267 bool IsTexFail, bool Unpacked, bool IsD16,
6268 int DMaskPop, int NumVDataDwords,
6269 const SDLoc &DL) {
6270 // Determine the required return type. This is the same regardless of the IsTexFail flag.
6271 EVT ReqRetVT = ResultTypes[0];
6272 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
6273 int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
6274 ReqRetNumElts : (ReqRetNumElts + 1) / 2;
6275
6276 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
6277 DMaskPop : (DMaskPop + 1) / 2;
6278
6279 MVT DataDwordVT = NumDataDwords == 1 ?
6280 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
6281
6282 MVT MaskPopVT = MaskPopDwords == 1 ?
6283 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
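  // Worked example (sketch): a packed D16 load (IsD16 && !Unpacked) returning
  // v3f16 with DMaskPop == 3 gives NumDataDwords == (3 + 1) / 2 == 2 and
  // MaskPopDwords == 2.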
6284
6285 SDValue Data(Result, 0);
6286 SDValue TexFail;
6287
6288 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
6289 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
6290 if (MaskPopVT.isVector()) {
6291 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
6292 SDValue(Result, 0), ZeroIdx);
6293 } else {
6294 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
6295 SDValue(Result, 0), ZeroIdx);
6296 }
6297 }
6298
6299 if (DataDwordVT.isVector())
6300 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
6301 NumDataDwords - MaskPopDwords);
6302
6303 if (IsD16)
6304 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
6305
6306 EVT LegalReqRetVT = ReqRetVT;
6307 if (!ReqRetVT.isVector()) {
6308 if (!Data.getValueType().isInteger())
6309 Data = DAG.getNode(ISD::BITCAST, DL,
6310 Data.getValueType().changeTypeToInteger(), Data);
6311 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
6312 } else {
6313 // We need to widen the return vector to a legal type
6314 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
6315 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
6316 LegalReqRetVT =
6317 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
6318 ReqRetVT.getVectorNumElements() + 1);
6319 }
6320 }
6321 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
6322
6323 if (IsTexFail) {
6324 TexFail =
6325 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
6326 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
6327
6328 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
6329 }
6330
6331 if (Result->getNumValues() == 1)
6332 return Data;
6333
6334 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
6335 }
6336
6337 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
6338 SDValue *LWE, bool &IsTexFail) {
6339 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
6340
6341 uint64_t Value = TexFailCtrlConst->getZExtValue();
6342 if (Value) {
6343 IsTexFail = true;
6344 }
6345
6346 SDLoc DL(TexFailCtrlConst);
6347 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
6348 Value &= ~(uint64_t)0x1;
6349 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
6350 Value &= ~(uint64_t)0x2;
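  // Bit 0 of TexFailCtrl selects TFE and bit 1 selects LWE; any other set bit
  // makes the 'Value == 0' check below fail, and the caller gives up on
  // custom-lowering the intrinsic.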
6351
6352 return Value == 0;
6353 }
6354
6355 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
6356 MVT PackVectorVT,
6357 SmallVectorImpl<SDValue> &PackedAddrs,
6358 unsigned DimIdx, unsigned EndIdx,
6359 unsigned NumGradients) {
6360 SDLoc DL(Op);
6361 for (unsigned I = DimIdx; I < EndIdx; I++) {
6362 SDValue Addr = Op.getOperand(I);
6363
6364 // Gradients are packed with undef for each coordinate.
6365 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
6366 // 1D: undef,dx/dh; undef,dx/dv
6367 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
6368 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
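    // Worked example (sketch): for 3D gradients NumGradients == 6, so
    // NumGradients / 2 == 3 is odd and the two dz operands
    // (I == DimIdx + 2 and I == DimIdx + 5) take the scalar any-extend path
    // below, while the dx/dy operands are packed in pairs.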
6369 if (((I + 1) >= EndIdx) ||
6370 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
6371 I == DimIdx + NumGradients - 1))) {
6372 if (Addr.getValueType() != MVT::i16)
6373 Addr = DAG.getBitcast(MVT::i16, Addr);
6374 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
6375 } else {
6376 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
6377 I++;
6378 }
6379 Addr = DAG.getBitcast(MVT::f32, Addr);
6380 PackedAddrs.push_back(Addr);
6381 }
6382 }
6383
6384 SDValue SITargetLowering::lowerImage(SDValue Op,
6385 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6386 SelectionDAG &DAG, bool WithChain) const {
6387 SDLoc DL(Op);
6388 MachineFunction &MF = DAG.getMachineFunction();
6389 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
6390 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6391 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6392 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
6393 unsigned IntrOpcode = Intr->BaseOpcode;
6394 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
6395 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
6396
6397 SmallVector<EVT, 3> ResultTypes(Op->values());
6398 SmallVector<EVT, 3> OrigResultTypes(Op->values());
6399 bool IsD16 = false;
6400 bool IsG16 = false;
6401 bool IsA16 = false;
6402 SDValue VData;
6403 int NumVDataDwords;
6404 bool AdjustRetType = false;
6405
6406 // Offset of intrinsic arguments
6407 const unsigned ArgOffset = WithChain ? 2 : 1;
6408
6409 unsigned DMask;
6410 unsigned DMaskLanes = 0;
6411
6412 if (BaseOpcode->Atomic) {
6413 VData = Op.getOperand(2);
6414
6415 bool Is64Bit = VData.getValueType() == MVT::i64;
6416 if (BaseOpcode->AtomicX2) {
6417 SDValue VData2 = Op.getOperand(3);
6418 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
6419 {VData, VData2});
6420 if (Is64Bit)
6421 VData = DAG.getBitcast(MVT::v4i32, VData);
6422
6423 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
6424 DMask = Is64Bit ? 0xf : 0x3;
6425 NumVDataDwords = Is64Bit ? 4 : 2;
6426 } else {
6427 DMask = Is64Bit ? 0x3 : 0x1;
6428 NumVDataDwords = Is64Bit ? 2 : 1;
6429 }
6430 } else {
6431 auto *DMaskConst =
6432 cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->DMaskIndex));
6433 DMask = DMaskConst->getZExtValue();
6434 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
6435
6436 if (BaseOpcode->Store) {
6437 VData = Op.getOperand(2);
6438
6439 MVT StoreVT = VData.getSimpleValueType();
6440 if (StoreVT.getScalarType() == MVT::f16) {
6441 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
6442 return Op; // D16 is unsupported for this instruction
6443
6444 IsD16 = true;
6445 VData = handleD16VData(VData, DAG, true);
6446 }
6447
6448 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
6449 } else {
6450 // Work out the num dwords based on the dmask popcount and underlying type
6451 // and whether packing is supported.
6452 MVT LoadVT = ResultTypes[0].getSimpleVT();
6453 if (LoadVT.getScalarType() == MVT::f16) {
6454 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
6455 return Op; // D16 is unsupported for this instruction
6456
6457 IsD16 = true;
6458 }
6459
6460 // Confirm that the return type is large enough for the dmask specified
6461 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
6462 (!LoadVT.isVector() && DMaskLanes > 1))
6463 return Op;
6464
6465 // The sq block of gfx8 and gfx9 does not estimate register use correctly
6466 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
6467 // instructions.
6468 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
6469 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
6470 NumVDataDwords = (DMaskLanes + 1) / 2;
6471 else
6472 NumVDataDwords = DMaskLanes;
6473
6474 AdjustRetType = true;
6475 }
6476 }
6477
6478 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
6479 SmallVector<SDValue, 4> VAddrs;
6480
6481 // Check for 16 bit addresses or derivatives and pack if true.
6482 MVT VAddrVT =
6483 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
6484 MVT VAddrScalarVT = VAddrVT.getScalarType();
6485 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
6486 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
6487
6488 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
6489 VAddrScalarVT = VAddrVT.getScalarType();
6490 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
6491 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
6492
6493 // Push back extra arguments.
6494 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
6495 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
6496 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6497 // Special handling of bias when A16 is on. Bias is of type half but
6498 // occupies a full 32-bit dword.
6499 SDValue Bias = DAG.getBuildVector(
6500 MVT::v2f16, DL,
6501 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
6502 VAddrs.push_back(Bias);
6503 } else {
6504 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6505 "Bias needs to be converted to 16 bit in A16 mode");
6506 VAddrs.push_back(Op.getOperand(ArgOffset + I));
6507 }
6508 }
6509
6510 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
6511 // 16 bit gradients are supported, but are tied to the A16 control,
6512 // so both gradients and addresses must be 16 bit.
6513 LLVM_DEBUG(
6514 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
6515 "require 16 bit args for both gradients and addresses");
6516 return Op;
6517 }
6518
6519 if (IsA16) {
6520 if (!ST->hasA16()) {
6521 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
6522 "support 16 bit addresses\n");
6523 return Op;
6524 }
6525 }
6526
6527 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
6528 // is set then we have to compress/pack operands (either address,
6529 // gradient or both).
6530 // In the case where a16 and gradients are tied (no G16 support), we
6531 // have already verified that both IsA16 and IsG16 are true.
6532 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
6533 // Activate g16
6534 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
6535 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
6536 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
6537 }
6538
6539 // Add gradients (packed or unpacked)
6540 if (IsG16) {
6541 // Pack the gradients
6542 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
6543 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
6544 ArgOffset + Intr->GradientStart,
6545 ArgOffset + Intr->CoordStart, Intr->NumGradients);
6546 } else {
6547 for (unsigned I = ArgOffset + Intr->GradientStart;
6548 I < ArgOffset + Intr->CoordStart; I++)
6549 VAddrs.push_back(Op.getOperand(I));
6550 }
6551
6552 // Add addresses (packed or unpacked)
6553 if (IsA16) {
6554 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
6555 ArgOffset + Intr->CoordStart, VAddrEnd,
6556 0 /* No gradients */);
6557 } else {
6558 // Add uncompressed address
6559 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
6560 VAddrs.push_back(Op.getOperand(I));
6561 }
6562
6563 // If the register allocator cannot place the address registers contiguously
6564 // without introducing moves, then using the non-sequential address encoding
6565 // is always preferable, since it saves VALU instructions and is usually a
6566 // wash in terms of code size or even better.
6567 //
6568 // However, we currently have no way of hinting to the register allocator that
6569 // MIMG addresses should be placed contiguously when it is possible to do so,
6570 // so force non-NSA for the common 2-address case as a heuristic.
6571 //
6572 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6573 // allocation when possible.
6574 //
6575 // TODO: we can actually allow partial NSA where the final register is a
6576 // contiguous set of the remaining addresses.
6577 // This could help where there are more addresses than supported.
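  //
  // Illustrative example (sketch, assuming a target with NSA encoding and,
  // say, an NSA threshold of 3): with 3 address dwords UseNSA is true and
  // each address stays in its own VGPR, while with only 2 address dwords the
  // addresses are merged into one contiguous vector by getBuildDwordsVector.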
6578 bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
6579 VAddrs.size() >= (unsigned)ST->getNSAThreshold(MF) &&
6580 VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
6581 SDValue VAddr;
6582 if (!UseNSA)
6583 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
6584
6585 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
6586 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
6587 SDValue Unorm;
6588 if (!BaseOpcode->Sampler) {
6589 Unorm = True;
6590 } else {
6591 auto UnormConst =
6592 cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->UnormIndex));
6593
6594 Unorm = UnormConst->getZExtValue() ? True : False;
6595 }
6596
6597 SDValue TFE;
6598 SDValue LWE;
6599 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
6600 bool IsTexFail = false;
6601 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
6602 return Op;
6603
6604 if (IsTexFail) {
6605 if (!DMaskLanes) {
6606 // Expecting to get an error flag since TFC is on and dmask is 0.
6607 // Force dmask to be at least 1, otherwise the instruction will fail.
6608 DMask = 0x1;
6609 DMaskLanes = 1;
6610 NumVDataDwords = 1;
6611 }
6612 NumVDataDwords += 1;
6613 AdjustRetType = true;
6614 }
6615
6616 // Something earlier may have tagged the return type as needing adjustment.
6617 // This happens if the instruction is a load or has TexFailCtrl flags set.
6618 if (AdjustRetType) {
6619 // NumVDataDwords reflects the true number of dwords required in the return type
6620 if (DMaskLanes == 0 && !BaseOpcode->Store) {
6621 // This is a no-op load. This can be eliminated
6622 SDValue Undef = DAG.getUNDEF(Op.getValueType());
6623 if (isa<MemSDNode>(Op))
6624 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
6625 return Undef;
6626 }
6627
6628 EVT NewVT = NumVDataDwords > 1 ?
6629 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
6630 : MVT::i32;
6631
6632 ResultTypes[0] = NewVT;
6633 if (ResultTypes.size() == 3) {
6634 // The original result was an aggregate type used for TexFailCtrl results.
6635 // The actual instruction returns as a vector type which has now been
6636 // created. Remove the aggregate result.
6637 ResultTypes.erase(&ResultTypes[1]);
6638 }
6639 }
6640
6641 unsigned CPol = cast<ConstantSDNode>(
6642 Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
6643 if (BaseOpcode->Atomic)
6644 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
6645 if (CPol & ~AMDGPU::CPol::ALL)
6646 return Op;
6647
6648 SmallVector<SDValue, 26> Ops;
6649 if (BaseOpcode->Store || BaseOpcode->Atomic)
6650 Ops.push_back(VData); // vdata
6651 if (UseNSA)
6652 append_range(Ops, VAddrs);
6653 else
6654 Ops.push_back(VAddr);
6655 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
6656 if (BaseOpcode->Sampler)
6657 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
6658 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
6659 if (IsGFX10Plus)
6660 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
6661 Ops.push_back(Unorm);
6662 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
6663 Ops.push_back(IsA16 && // r128, a16 for gfx9
6664 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
6665 if (IsGFX10Plus)
6666 Ops.push_back(IsA16 ? True : False);
6667 if (!Subtarget->hasGFX90AInsts()) {
6668 Ops.push_back(TFE); //tfe
6669 } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
6670 report_fatal_error("TFE is not supported on this GPU");
6671 }
6672 Ops.push_back(LWE); // lwe
6673 if (!IsGFX10Plus)
6674 Ops.push_back(DimInfo->DA ? True : False);
6675 if (BaseOpcode->HasD16)
6676 Ops.push_back(IsD16 ? True : False);
6677 if (isa<MemSDNode>(Op))
6678 Ops.push_back(Op.getOperand(0)); // chain
6679
6680 int NumVAddrDwords =
6681 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
6682 int Opcode = -1;
6683
6684 if (IsGFX11Plus) {
6685 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
6686 UseNSA ? AMDGPU::MIMGEncGfx11NSA
6687 : AMDGPU::MIMGEncGfx11Default,
6688 NumVDataDwords, NumVAddrDwords);
6689 } else if (IsGFX10Plus) {
6690 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
6691 UseNSA ? AMDGPU::MIMGEncGfx10NSA
6692 : AMDGPU::MIMGEncGfx10Default,
6693 NumVDataDwords, NumVAddrDwords);
6694 } else {
6695 if (Subtarget->hasGFX90AInsts()) {
6696 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
6697 NumVDataDwords, NumVAddrDwords);
6698 if (Opcode == -1)
6699 return makeV_ILLEGAL(Op, DAG);
6700 }
6701 if (Opcode == -1 &&
6702 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
6703 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
6704 NumVDataDwords, NumVAddrDwords);
6705 if (Opcode == -1)
6706 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
6707 NumVDataDwords, NumVAddrDwords);
6708 }
6709 assert(Opcode != -1);
6710
6711 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
6712 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
6713 MachineMemOperand *MemRef = MemOp->getMemOperand();
6714 DAG.setNodeMemRefs(NewNode, {MemRef});
6715 }
6716
6717 if (BaseOpcode->AtomicX2) {
6718 SmallVector<SDValue, 1> Elt;
6719 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
6720 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
6721 }
6722 if (BaseOpcode->Store)
6723 return SDValue(NewNode, 0);
6724 return constructRetValue(DAG, NewNode,
6725 OrigResultTypes, IsTexFail,
6726 Subtarget->hasUnpackedD16VMem(), IsD16,
6727 DMaskLanes, NumVDataDwords, DL);
6728 }
6729
6730 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
6731 SDValue Offset, SDValue CachePolicy,
6732 SelectionDAG &DAG) const {
6733 MachineFunction &MF = DAG.getMachineFunction();
6734
6735 const DataLayout &DataLayout = DAG.getDataLayout();
6736 Align Alignment =
6737 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6738
6739 MachineMemOperand *MMO = MF.getMachineMemOperand(
6740 MachinePointerInfo(),
6741 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6742 MachineMemOperand::MOInvariant,
6743 VT.getStoreSize(), Alignment);
6744
6745 if (!Offset->isDivergent()) {
6746 SDValue Ops[] = {
6747 Rsrc,
6748 Offset, // Offset
6749 CachePolicy
6750 };
6751
6752 // Widen vec3 load to vec4.
6753 if (VT.isVector() && VT.getVectorNumElements() == 3) {
6754 EVT WidenedVT =
6755 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
6756 auto WidenedOp = DAG.getMemIntrinsicNode(
6757 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
6758 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
6759 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
6760 DAG.getVectorIdxConstant(0, DL));
6761 return Subvector;
6762 }
6763
6764 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
6765 DAG.getVTList(VT), Ops, VT, MMO);
6766 }
6767
6768 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
6769 // assume that the buffer is unswizzled.
6770 SmallVector<SDValue, 4> Loads;
6771 unsigned NumLoads = 1;
6772 MVT LoadVT = VT.getSimpleVT();
6773 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
6774 assert((LoadVT.getScalarType() == MVT::i32 ||
6775 LoadVT.getScalarType() == MVT::f32));
6776
6777 if (NumElts == 8 || NumElts == 16) {
6778 NumLoads = NumElts / 4;
6779 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
6780 }
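  // Illustrative example (sketch): a divergent-offset v8f32 load is split
  // here into NumLoads == 2 v4f32 buffer loads, issued at immediate offsets
  // InstOffset and InstOffset + 16 below and re-joined with CONCAT_VECTORS.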
6781
6782 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
6783 SDValue Ops[] = {
6784 DAG.getEntryNode(), // Chain
6785 Rsrc, // rsrc
6786 DAG.getConstant(0, DL, MVT::i32), // vindex
6787 {}, // voffset
6788 {}, // soffset
6789 {}, // offset
6790 CachePolicy, // cachepolicy
6791 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6792 };
6793
6794 // Use the alignment to ensure that the required offsets will fit into the
6795 // immediate offsets.
6796 setBufferOffsets(Offset, DAG, &Ops[3],
6797 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
6798
6799 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
6800 for (unsigned i = 0; i < NumLoads; ++i) {
6801 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
6802 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
6803 LoadVT, MMO, DAG));
6804 }
6805
6806 if (NumElts == 8 || NumElts == 16)
6807 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
6808
6809 return Loads[0];
6810 }
6811
6812 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
6813 unsigned Dim,
6814 const ArgDescriptor &Arg) const {
6815 SDLoc SL(Op);
6816 MachineFunction &MF = DAG.getMachineFunction();
6817 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
6818 if (MaxID == 0)
6819 return DAG.getConstant(0, SL, MVT::i32);
6820
6821 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
6822 SDLoc(DAG.getEntryNode()), Arg);
6823
6824 // Don't bother inserting AssertZext for packed IDs since we're emitting the
6825 // masking operations anyway.
6826 //
6827 // TODO: We could assert the top bit is 0 for the source copy.
6828 if (Arg.isMasked())
6829 return Val;
6830
6831 // Preserve the known bits after expansion to a copy.
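  // For example, MaxID == 1023 gives llvm::bit_width(MaxID) == 10, so the
  // AssertZext advertises that bits [31:10] of the workitem ID are zero.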
6832 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
6833 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
6834 DAG.getValueType(SmallVT));
6835 }
6836
6837 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6838 SelectionDAG &DAG) const {
6839 MachineFunction &MF = DAG.getMachineFunction();
6840 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
6841
6842 EVT VT = Op.getValueType();
6843 SDLoc DL(Op);
6844 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6845
6846 // TODO: Should this propagate fast-math-flags?
6847
6848 switch (IntrinsicID) {
6849 case Intrinsic::amdgcn_implicit_buffer_ptr: {
6850 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
6851 return emitNonHSAIntrinsicError(DAG, DL, VT);
6852 return getPreloadedValue(DAG, *MFI, VT,
6853 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
6854 }
6855 case Intrinsic::amdgcn_dispatch_ptr:
6856 case Intrinsic::amdgcn_queue_ptr: {
6857 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
6858 DiagnosticInfoUnsupported BadIntrin(
6859 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
6860 DL.getDebugLoc());
6861 DAG.getContext()->diagnose(BadIntrin);
6862 return DAG.getUNDEF(VT);
6863 }
6864
6865 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
6866 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
6867 return getPreloadedValue(DAG, *MFI, VT, RegID);
6868 }
6869 case Intrinsic::amdgcn_implicitarg_ptr: {
6870 if (MFI->isEntryFunction())
6871 return getImplicitArgPtr(DAG, DL);
6872 return getPreloadedValue(DAG, *MFI, VT,
6873 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6874 }
6875 case Intrinsic::amdgcn_kernarg_segment_ptr: {
6876 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
6877 // This only makes sense to call in a kernel, so just lower to null.
6878 return DAG.getConstant(0, DL, VT);
6879 }
6880
6881 return getPreloadedValue(DAG, *MFI, VT,
6882 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
6883 }
6884 case Intrinsic::amdgcn_dispatch_id: {
6885 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
6886 }
6887 case Intrinsic::amdgcn_rcp:
6888 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
6889 case Intrinsic::amdgcn_rsq:
6890 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
6891 case Intrinsic::amdgcn_rsq_legacy:
6892 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
6893 return emitRemovedIntrinsicError(DAG, DL, VT);
6894 return SDValue();
6895 case Intrinsic::amdgcn_rcp_legacy:
6896 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
6897 return emitRemovedIntrinsicError(DAG, DL, VT);
6898 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
6899 case Intrinsic::amdgcn_rsq_clamp: {
6900 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6901 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
6902
6903 Type *Type = VT.getTypeForEVT(*DAG.getContext());
6904 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
6905 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
6906
6907 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
6908 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
6909 DAG.getConstantFP(Max, DL, VT));
6910 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
6911 DAG.getConstantFP(Min, DL, VT));
6912 }
6913 case Intrinsic::r600_read_ngroups_x:
6914 if (Subtarget->isAmdHsaOS())
6915 return emitNonHSAIntrinsicError(DAG, DL, VT);
6916
6917 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6918 SI::KernelInputOffsets::NGROUPS_X, Align(4),
6919 false);
6920 case Intrinsic::r600_read_ngroups_y:
6921 if (Subtarget->isAmdHsaOS())
6922 return emitNonHSAIntrinsicError(DAG, DL, VT);
6923
6924 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6925 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
6926 false);
6927 case Intrinsic::r600_read_ngroups_z:
6928 if (Subtarget->isAmdHsaOS())
6929 return emitNonHSAIntrinsicError(DAG, DL, VT);
6930
6931 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6932 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
6933 false);
6934 case Intrinsic::r600_read_global_size_x:
6935 if (Subtarget->isAmdHsaOS())
6936 return emitNonHSAIntrinsicError(DAG, DL, VT);
6937
6938 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6939 SI::KernelInputOffsets::GLOBAL_SIZE_X,
6940 Align(4), false);
6941 case Intrinsic::r600_read_global_size_y:
6942 if (Subtarget->isAmdHsaOS())
6943 return emitNonHSAIntrinsicError(DAG, DL, VT);
6944
6945 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6946 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
6947 Align(4), false);
6948 case Intrinsic::r600_read_global_size_z:
6949 if (Subtarget->isAmdHsaOS())
6950 return emitNonHSAIntrinsicError(DAG, DL, VT);
6951
6952 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6953 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
6954 Align(4), false);
6955 case Intrinsic::r600_read_local_size_x:
6956 if (Subtarget->isAmdHsaOS())
6957 return emitNonHSAIntrinsicError(DAG, DL, VT);
6958
6959 return lowerImplicitZextParam(DAG, Op, MVT::i16,
6960 SI::KernelInputOffsets::LOCAL_SIZE_X);
6961 case Intrinsic::r600_read_local_size_y:
6962 if (Subtarget->isAmdHsaOS())
6963 return emitNonHSAIntrinsicError(DAG, DL, VT);
6964
6965 return lowerImplicitZextParam(DAG, Op, MVT::i16,
6966 SI::KernelInputOffsets::LOCAL_SIZE_Y);
6967 case Intrinsic::r600_read_local_size_z:
6968 if (Subtarget->isAmdHsaOS())
6969 return emitNonHSAIntrinsicError(DAG, DL, VT);
6970
6971 return lowerImplicitZextParam(DAG, Op, MVT::i16,
6972 SI::KernelInputOffsets::LOCAL_SIZE_Z);
6973 case Intrinsic::amdgcn_workgroup_id_x:
6974 return getPreloadedValue(DAG, *MFI, VT,
6975 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
6976 case Intrinsic::amdgcn_workgroup_id_y:
6977 return getPreloadedValue(DAG, *MFI, VT,
6978 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
6979 case Intrinsic::amdgcn_workgroup_id_z:
6980 return getPreloadedValue(DAG, *MFI, VT,
6981 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
6982 case Intrinsic::amdgcn_lds_kernel_id: {
6983 if (MFI->isEntryFunction())
6984 return getLDSKernelId(DAG, DL);
6985 return getPreloadedValue(DAG, *MFI, VT,
6986 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6987 }
6988 case Intrinsic::amdgcn_workitem_id_x:
6989 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
6990 case Intrinsic::amdgcn_workitem_id_y:
6991 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
6992 case Intrinsic::amdgcn_workitem_id_z:
6993 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
6994 case Intrinsic::amdgcn_wavefrontsize:
6995 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
6996 SDLoc(Op), MVT::i32);
6997 case Intrinsic::amdgcn_s_buffer_load: {
6998 unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
6999 if (CPol & ~AMDGPU::CPol::ALL)
7000 return Op;
7001 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
7002 DAG);
7003 }
7004 case Intrinsic::amdgcn_fdiv_fast:
7005 return lowerFDIV_FAST(Op, DAG);
7006 case Intrinsic::amdgcn_sin:
7007 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
7008
7009 case Intrinsic::amdgcn_cos:
7010 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
7011
7012 case Intrinsic::amdgcn_mul_u24:
7013 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
7014 case Intrinsic::amdgcn_mul_i24:
7015 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
7016
7017 case Intrinsic::amdgcn_log_clamp: {
7018 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
7019 return SDValue();
7020
7021 return emitRemovedIntrinsicError(DAG, DL, VT);
7022 }
7023 case Intrinsic::amdgcn_ldexp:
7024 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
7025 Op.getOperand(1), Op.getOperand(2));
7026
7027 case Intrinsic::amdgcn_fract:
7028 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
7029
7030 case Intrinsic::amdgcn_class:
7031 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
7032 Op.getOperand(1), Op.getOperand(2));
7033 case Intrinsic::amdgcn_div_fmas:
7034 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
7035 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
7036 Op.getOperand(4));
7037
7038 case Intrinsic::amdgcn_div_fixup:
7039 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
7040 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7041
7042 case Intrinsic::amdgcn_div_scale: {
7043 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
7044
7045 // Translate to the operands expected by the machine instruction. The
7046 // first source operand must be the same as one of the division's source
7047 // operands; which one is selected by the constant third operand below.
7047 SDValue Numerator = Op.getOperand(1);
7048 SDValue Denominator = Op.getOperand(2);
7049
7050 // Note this order is opposite of the machine instruction's operations,
7051 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
7052 // intrinsic has the numerator as the first operand to match a normal
7053 // division operation.
7054
7055 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
7056
7057 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
7058 Denominator, Numerator);
7059 }
7060 case Intrinsic::amdgcn_icmp: {
7061 // There is a Pat that handles this variant, so return it as-is.
7062 if (Op.getOperand(1).getValueType() == MVT::i1 &&
7063 Op.getConstantOperandVal(2) == 0 &&
7064 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
7065 return Op;
7066 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
7067 }
7068 case Intrinsic::amdgcn_fcmp: {
7069 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
7070 }
7071 case Intrinsic::amdgcn_ballot:
7072 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
7073 case Intrinsic::amdgcn_fmed3:
7074 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
7075 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7076 case Intrinsic::amdgcn_fdot2:
7077 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
7078 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
7079 Op.getOperand(4));
7080 case Intrinsic::amdgcn_fmul_legacy:
7081 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
7082 Op.getOperand(1), Op.getOperand(2));
7083 case Intrinsic::amdgcn_sffbh:
7084 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
7085 case Intrinsic::amdgcn_sbfe:
7086 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
7087 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7088 case Intrinsic::amdgcn_ubfe:
7089 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
7090 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7091 case Intrinsic::amdgcn_cvt_pkrtz:
7092 case Intrinsic::amdgcn_cvt_pknorm_i16:
7093 case Intrinsic::amdgcn_cvt_pknorm_u16:
7094 case Intrinsic::amdgcn_cvt_pk_i16:
7095 case Intrinsic::amdgcn_cvt_pk_u16: {
7096 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
7097 EVT VT = Op.getValueType();
7098 unsigned Opcode;
7099
7100 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
7101 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
7102 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
7103 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7104 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
7105 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7106 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
7107 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7108 else
7109 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7110
7111 if (isTypeLegal(VT))
7112 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
7113
7114 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
7115 Op.getOperand(1), Op.getOperand(2));
7116 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
7117 }
7118 case Intrinsic::amdgcn_fmad_ftz:
7119 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
7120 Op.getOperand(2), Op.getOperand(3));
7121
7122 case Intrinsic::amdgcn_if_break:
7123 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
7124 Op->getOperand(1), Op->getOperand(2)), 0);
7125
7126 case Intrinsic::amdgcn_groupstaticsize: {
7127 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
7128 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
7129 return Op;
7130
7131 const Module *M = MF.getFunction().getParent();
7132 const GlobalValue *GV =
7133 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
7134 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
7135 SIInstrInfo::MO_ABS32_LO);
7136 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
7137 }
7138 case Intrinsic::amdgcn_is_shared:
7139 case Intrinsic::amdgcn_is_private: {
7140 SDLoc SL(Op);
7141 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
7142 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
7143 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
7144 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
7145 Op.getOperand(1));
7146
7147 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
7148 DAG.getConstant(1, SL, MVT::i32));
7149 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
7150 }
7151 case Intrinsic::amdgcn_perm:
7152 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
7153 Op.getOperand(2), Op.getOperand(3));
7154 case Intrinsic::amdgcn_reloc_constant: {
7155 Module *M = const_cast<Module *>(MF.getFunction().getParent());
7156 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
7157 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
7158 auto RelocSymbol = cast<GlobalVariable>(
7159 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
7160 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
7161 SIInstrInfo::MO_ABS32_LO);
7162 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
7163 }
7164 default:
7165 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7166 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
7167 return lowerImage(Op, ImageDimIntr, DAG, false);
7168
7169 return Op;
7170 }
7171 }
7172
7173 /// Update \p MMO based on the offset inputs to an intrinsic.
7174 static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset,
7175 SDValue SOffset, SDValue Offset,
7176 SDValue VIndex = SDValue()) {
7177 if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
7178 !isa<ConstantSDNode>(Offset)) {
7179 // The combined offset is not known to be constant, so we cannot represent
7180 // it in the MMO. Give up.
7181 MMO->setValue((Value *)nullptr);
7182 return;
7183 }
7184
7185 if (VIndex && (!isa<ConstantSDNode>(VIndex) ||
7186 !cast<ConstantSDNode>(VIndex)->isZero())) {
7187 // The strided index component of the address is not known to be zero, so we
7188 // cannot represent it in the MMO. Give up.
7189 MMO->setValue((Value *)nullptr);
7190 return;
7191 }
7192
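  // Example (sketch): voffset == 16, soffset == 0 and an immediate offset of
  // 4 fold to an MMO offset of 20.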
7193 MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() +
7194 cast<ConstantSDNode>(SOffset)->getSExtValue() +
7195 cast<ConstantSDNode>(Offset)->getSExtValue());
7196 }
7197
7198 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
7199 SelectionDAG &DAG,
7200 unsigned NewOpcode) const {
7201 SDLoc DL(Op);
7202
7203 SDValue VData = Op.getOperand(2);
7204 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7205 SDValue Ops[] = {
7206 Op.getOperand(0), // Chain
7207 VData, // vdata
7208 Op.getOperand(3), // rsrc
7209 DAG.getConstant(0, DL, MVT::i32), // vindex
7210 Offsets.first, // voffset
7211 Op.getOperand(5), // soffset
7212 Offsets.second, // offset
7213 Op.getOperand(6), // cachepolicy
7214 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7215 };
7216
7217 auto *M = cast<MemSDNode>(Op);
7218 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
7219
7220 EVT MemVT = VData.getValueType();
7221 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
7222 M->getMemOperand());
7223 }
7224
7225 // Return a value to use for the idxen operand by examining the vindex operand.
7226 static unsigned getIdxEn(SDValue VIndex) {
7227 if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex))
7228 // No need to set idxen if vindex is known to be zero.
7229 return VIndexC->getZExtValue() != 0;
7230 return 1;
7231 }
7232
7233 SDValue
7234 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
7235 unsigned NewOpcode) const {
7236 SDLoc DL(Op);
7237
7238 SDValue VData = Op.getOperand(2);
7239 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7240 SDValue Ops[] = {
7241 Op.getOperand(0), // Chain
7242 VData, // vdata
7243 Op.getOperand(3), // rsrc
7244 Op.getOperand(4), // vindex
7245 Offsets.first, // voffset
7246 Op.getOperand(6), // soffset
7247 Offsets.second, // offset
7248 Op.getOperand(7), // cachepolicy
7249 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7250 };
7251
7252 auto *M = cast<MemSDNode>(Op);
7253 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
7254
7255 EVT MemVT = VData.getValueType();
7256 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
7257 M->getMemOperand());
7258 }
7259
7260 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
7261 SelectionDAG &DAG) const {
7262 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7263 SDLoc DL(Op);
7264
7265 switch (IntrID) {
7266 case Intrinsic::amdgcn_ds_ordered_add:
7267 case Intrinsic::amdgcn_ds_ordered_swap: {
7268 MemSDNode *M = cast<MemSDNode>(Op);
7269 SDValue Chain = M->getOperand(0);
7270 SDValue M0 = M->getOperand(2);
7271 SDValue Value = M->getOperand(3);
7272 unsigned IndexOperand = M->getConstantOperandVal(7);
7273 unsigned WaveRelease = M->getConstantOperandVal(8);
7274 unsigned WaveDone = M->getConstantOperandVal(9);
7275
7276 unsigned OrderedCountIndex = IndexOperand & 0x3f;
7277 IndexOperand &= ~0x3f;
7278 unsigned CountDw = 0;
7279
7280 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
7281 CountDw = (IndexOperand >> 24) & 0xf;
7282 IndexOperand &= ~(0xf << 24);
7283
7284 if (CountDw < 1 || CountDw > 4) {
7285 report_fatal_error(
7286 "ds_ordered_count: dword count must be between 1 and 4");
7287 }
7288 }
7289
7290 if (IndexOperand)
7291 report_fatal_error("ds_ordered_count: bad index operand");
7292
7293 if (WaveDone && !WaveRelease)
7294 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
7295
7296 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
7297 unsigned ShaderType =
7298 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
7299 unsigned Offset0 = OrderedCountIndex << 2;
7300 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
7301
7302 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
7303 Offset1 |= (CountDw - 1) << 6;
7304
7305 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
7306 Offset1 |= ShaderType << 2;
7307
7308 unsigned Offset = Offset0 | (Offset1 << 8);
7309
7310 SDValue Ops[] = {
7311 Chain,
7312 Value,
7313 DAG.getTargetConstant(Offset, DL, MVT::i16),
7314 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
7315 };
7316 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
7317 M->getVTList(), Ops, M->getMemoryVT(),
7318 M->getMemOperand());
7319 }
7320 case Intrinsic::amdgcn_ds_fadd: {
7321 MemSDNode *M = cast<MemSDNode>(Op);
7322 unsigned Opc;
7323 switch (IntrID) {
7324 case Intrinsic::amdgcn_ds_fadd:
7325 Opc = ISD::ATOMIC_LOAD_FADD;
7326 break;
7327 }
7328
7329 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
7330 M->getOperand(0), M->getOperand(2), M->getOperand(3),
7331 M->getMemOperand());
7332 }
7333 case Intrinsic::amdgcn_atomic_inc:
7334 case Intrinsic::amdgcn_atomic_dec:
7335 case Intrinsic::amdgcn_ds_fmin:
7336 case Intrinsic::amdgcn_ds_fmax: {
7337 MemSDNode *M = cast<MemSDNode>(Op);
7338 unsigned Opc;
7339 switch (IntrID) {
7340 case Intrinsic::amdgcn_atomic_inc:
7341 Opc = AMDGPUISD::ATOMIC_INC;
7342 break;
7343 case Intrinsic::amdgcn_atomic_dec:
7344 Opc = AMDGPUISD::ATOMIC_DEC;
7345 break;
7346 case Intrinsic::amdgcn_ds_fmin:
7347 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
7348 break;
7349 case Intrinsic::amdgcn_ds_fmax:
7350 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
7351 break;
7352 default:
7353 llvm_unreachable("Unknown intrinsic!");
7354 }
7355 SDValue Ops[] = {
7356 M->getOperand(0), // Chain
7357 M->getOperand(2), // Ptr
7358 M->getOperand(3) // Value
7359 };
7360
7361 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
7362 M->getMemoryVT(), M->getMemOperand());
7363 }
7364 case Intrinsic::amdgcn_buffer_load:
7365 case Intrinsic::amdgcn_buffer_load_format: {
7366 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
7367 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
7368 unsigned IdxEn = getIdxEn(Op.getOperand(3));
7369 SDValue Ops[] = {
7370 Op.getOperand(0), // Chain
7371 Op.getOperand(2), // rsrc
7372 Op.getOperand(3), // vindex
7373 SDValue(), // voffset -- will be set by setBufferOffsets
7374 SDValue(), // soffset -- will be set by setBufferOffsets
7375 SDValue(), // offset -- will be set by setBufferOffsets
7376 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
7377 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7378 };
7379 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
7380
7381 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
7382 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
7383
7384 EVT VT = Op.getValueType();
7385 EVT IntVT = VT.changeTypeToInteger();
7386 auto *M = cast<MemSDNode>(Op);
7387 updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
7388 EVT LoadVT = Op.getValueType();
7389
7390 if (LoadVT.getScalarType() == MVT::f16)
7391 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
7392 M, DAG, Ops);
7393
7394 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7395 if (LoadVT.getScalarType() == MVT::i8 ||
7396 LoadVT.getScalarType() == MVT::i16)
7397 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
7398
7399 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
7400 M->getMemOperand(), DAG);
7401 }
7402 case Intrinsic::amdgcn_raw_buffer_load:
7403 case Intrinsic::amdgcn_raw_buffer_load_format: {
7404 const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
7405
7406 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
7407 SDValue Ops[] = {
7408 Op.getOperand(0), // Chain
7409 Op.getOperand(2), // rsrc
7410 DAG.getConstant(0, DL, MVT::i32), // vindex
7411 Offsets.first, // voffset
7412 Op.getOperand(4), // soffset
7413 Offsets.second, // offset
7414 Op.getOperand(5), // cachepolicy, swizzled buffer
7415 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7416 };
7417
7418 auto *M = cast<MemSDNode>(Op);
7419 updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]);
7420 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
7421 }
7422 case Intrinsic::amdgcn_struct_buffer_load:
7423 case Intrinsic::amdgcn_struct_buffer_load_format: {
7424 const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format;
7425
7426 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7427 SDValue Ops[] = {
7428 Op.getOperand(0), // Chain
7429 Op.getOperand(2), // rsrc
7430 Op.getOperand(3), // vindex
7431 Offsets.first, // voffset
7432 Op.getOperand(5), // soffset
7433 Offsets.second, // offset
7434 Op.getOperand(6), // cachepolicy, swizzled buffer
7435 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7436 };
7437
7438 auto *M = cast<MemSDNode>(Op);
7439 updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
7440 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
7441 }
7442 case Intrinsic::amdgcn_tbuffer_load: {
7443 MemSDNode *M = cast<MemSDNode>(Op);
7444 EVT LoadVT = Op.getValueType();
7445
7446 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
7447 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
7448 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
7449 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
7450 unsigned IdxEn = getIdxEn(Op.getOperand(3));
7451 SDValue Ops[] = {
7452 Op.getOperand(0), // Chain
7453 Op.getOperand(2), // rsrc
7454 Op.getOperand(3), // vindex
7455 Op.getOperand(4), // voffset
7456 Op.getOperand(5), // soffset
7457 Op.getOperand(6), // offset
7458 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
7459 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
7460 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
7461 };
7462
7463 if (LoadVT.getScalarType() == MVT::f16)
7464 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
7465 M, DAG, Ops);
7466 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
7467 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
7468 DAG);
7469 }
7470 case Intrinsic::amdgcn_raw_tbuffer_load: {
7471 MemSDNode *M = cast<MemSDNode>(Op);
7472 EVT LoadVT = Op.getValueType();
7473 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
7474
7475 SDValue Ops[] = {
7476 Op.getOperand(0), // Chain
7477 Op.getOperand(2), // rsrc
7478 DAG.getConstant(0, DL, MVT::i32), // vindex
7479 Offsets.first, // voffset
7480 Op.getOperand(4), // soffset
7481 Offsets.second, // offset
7482 Op.getOperand(5), // format
7483 Op.getOperand(6), // cachepolicy, swizzled buffer
7484 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7485 };
7486
7487 if (LoadVT.getScalarType() == MVT::f16)
7488 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
7489 M, DAG, Ops);
7490 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
7491 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
7492 DAG);
7493 }
7494 case Intrinsic::amdgcn_struct_tbuffer_load: {
7495 MemSDNode *M = cast<MemSDNode>(Op);
7496 EVT LoadVT = Op.getValueType();
7497 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7498
7499 SDValue Ops[] = {
7500 Op.getOperand(0), // Chain
7501 Op.getOperand(2), // rsrc
7502 Op.getOperand(3), // vindex
7503 Offsets.first, // voffset
7504 Op.getOperand(5), // soffset
7505 Offsets.second, // offset
7506 Op.getOperand(6), // format
7507 Op.getOperand(7), // cachepolicy, swizzled buffer
7508 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7509 };
7510
7511 if (LoadVT.getScalarType() == MVT::f16)
7512 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
7513 M, DAG, Ops);
7514 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
7515 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
7516 DAG);
7517 }
7518 case Intrinsic::amdgcn_buffer_atomic_swap:
7519 case Intrinsic::amdgcn_buffer_atomic_add:
7520 case Intrinsic::amdgcn_buffer_atomic_sub:
7521 case Intrinsic::amdgcn_buffer_atomic_csub:
7522 case Intrinsic::amdgcn_buffer_atomic_smin:
7523 case Intrinsic::amdgcn_buffer_atomic_umin:
7524 case Intrinsic::amdgcn_buffer_atomic_smax:
7525 case Intrinsic::amdgcn_buffer_atomic_umax:
7526 case Intrinsic::amdgcn_buffer_atomic_and:
7527 case Intrinsic::amdgcn_buffer_atomic_or:
7528 case Intrinsic::amdgcn_buffer_atomic_xor:
7529 case Intrinsic::amdgcn_buffer_atomic_fadd: {
7530 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
7531 unsigned IdxEn = getIdxEn(Op.getOperand(4));
7532 SDValue Ops[] = {
7533 Op.getOperand(0), // Chain
7534 Op.getOperand(2), // vdata
7535 Op.getOperand(3), // rsrc
7536 Op.getOperand(4), // vindex
7537 SDValue(), // voffset -- will be set by setBufferOffsets
7538 SDValue(), // soffset -- will be set by setBufferOffsets
7539 SDValue(), // offset -- will be set by setBufferOffsets
7540 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
7541 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7542 };
7543 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
7544
7545 EVT VT = Op.getValueType();
7546
7547 auto *M = cast<MemSDNode>(Op);
7548 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
7549 unsigned Opcode = 0;
7550
7551 switch (IntrID) {
7552 case Intrinsic::amdgcn_buffer_atomic_swap:
7553 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
7554 break;
7555 case Intrinsic::amdgcn_buffer_atomic_add:
7556 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
7557 break;
7558 case Intrinsic::amdgcn_buffer_atomic_sub:
7559 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
7560 break;
7561 case Intrinsic::amdgcn_buffer_atomic_csub:
7562 Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
7563 break;
7564 case Intrinsic::amdgcn_buffer_atomic_smin:
7565 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
7566 break;
7567 case Intrinsic::amdgcn_buffer_atomic_umin:
7568 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
7569 break;
7570 case Intrinsic::amdgcn_buffer_atomic_smax:
7571 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
7572 break;
7573 case Intrinsic::amdgcn_buffer_atomic_umax:
7574 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
7575 break;
7576 case Intrinsic::amdgcn_buffer_atomic_and:
7577 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
7578 break;
7579 case Intrinsic::amdgcn_buffer_atomic_or:
7580 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
7581 break;
7582 case Intrinsic::amdgcn_buffer_atomic_xor:
7583 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
7584 break;
7585 case Intrinsic::amdgcn_buffer_atomic_fadd:
7586 Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
7587 break;
7588 default:
7589 llvm_unreachable("unhandled atomic opcode");
7590 }
7591
7592 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
7593 M->getMemOperand());
7594 }
7595 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7596 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
7597 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7598 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
7599 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7600 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
7601 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7602 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
7603 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7604 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
7605 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7606 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
7607 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7608 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
7609 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7610 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
7611 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7612 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
7613 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7614 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
7615 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7616 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
7617 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7618 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
7619 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7620 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
7621 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7622 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
7623 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7624 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
7625 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7626 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
7627 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7628 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
7629 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7630 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
7631 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7632 return lowerStructBufferAtomicIntrin(Op, DAG,
7633 AMDGPUISD::BUFFER_ATOMIC_SWAP);
7634 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7635 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
7636 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7637 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
7638 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7639 return lowerStructBufferAtomicIntrin(Op, DAG,
7640 AMDGPUISD::BUFFER_ATOMIC_SMIN);
7641 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7642 return lowerStructBufferAtomicIntrin(Op, DAG,
7643 AMDGPUISD::BUFFER_ATOMIC_UMIN);
7644 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7645 return lowerStructBufferAtomicIntrin(Op, DAG,
7646 AMDGPUISD::BUFFER_ATOMIC_SMAX);
7647 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7648 return lowerStructBufferAtomicIntrin(Op, DAG,
7649 AMDGPUISD::BUFFER_ATOMIC_UMAX);
7650 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7651 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
7652 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7653 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
7654 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7655 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
7656 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7657 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
7658 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7659 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
7660
7661 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
7662 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
7663 unsigned IdxEn = getIdxEn(Op.getOperand(5));
7664 SDValue Ops[] = {
7665 Op.getOperand(0), // Chain
7666 Op.getOperand(2), // src
7667 Op.getOperand(3), // cmp
7668 Op.getOperand(4), // rsrc
7669 Op.getOperand(5), // vindex
7670 SDValue(), // voffset -- will be set by setBufferOffsets
7671 SDValue(), // soffset -- will be set by setBufferOffsets
7672 SDValue(), // offset -- will be set by setBufferOffsets
7673 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
7674 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7675 };
7676 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
7677
7678 EVT VT = Op.getValueType();
7679 auto *M = cast<MemSDNode>(Op);
7680 updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
7681
7682 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
7683 Op->getVTList(), Ops, VT, M->getMemOperand());
7684 }
7685 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
7686 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7687 SDValue Ops[] = {
7688 Op.getOperand(0), // Chain
7689 Op.getOperand(2), // src
7690 Op.getOperand(3), // cmp
7691 Op.getOperand(4), // rsrc
7692 DAG.getConstant(0, DL, MVT::i32), // vindex
7693 Offsets.first, // voffset
7694 Op.getOperand(6), // soffset
7695 Offsets.second, // offset
7696 Op.getOperand(7), // cachepolicy
7697 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7698 };
7699 EVT VT = Op.getValueType();
7700 auto *M = cast<MemSDNode>(Op);
7701 updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]);
7702
7703 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
7704 Op->getVTList(), Ops, VT, M->getMemOperand());
7705 }
7706 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
7707 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
7708 SDValue Ops[] = {
7709 Op.getOperand(0), // Chain
7710 Op.getOperand(2), // src
7711 Op.getOperand(3), // cmp
7712 Op.getOperand(4), // rsrc
7713 Op.getOperand(5), // vindex
7714 Offsets.first, // voffset
7715 Op.getOperand(7), // soffset
7716 Offsets.second, // offset
7717 Op.getOperand(8), // cachepolicy
7718 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7719 };
7720 EVT VT = Op.getValueType();
7721 auto *M = cast<MemSDNode>(Op);
7722 updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
7723
7724 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
7725 Op->getVTList(), Ops, VT, M->getMemOperand());
7726 }
7727 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
7728 MemSDNode *M = cast<MemSDNode>(Op);
7729 SDValue NodePtr = M->getOperand(2);
7730 SDValue RayExtent = M->getOperand(3);
7731 SDValue RayOrigin = M->getOperand(4);
7732 SDValue RayDir = M->getOperand(5);
7733 SDValue RayInvDir = M->getOperand(6);
7734 SDValue TDescr = M->getOperand(7);
7735
7736 assert(NodePtr.getValueType() == MVT::i32 ||
7737 NodePtr.getValueType() == MVT::i64);
7738 assert(RayDir.getValueType() == MVT::v3f16 ||
7739 RayDir.getValueType() == MVT::v3f32);
7740
7741 if (!Subtarget->hasGFX10_AEncoding()) {
7742 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
7743 return SDValue();
7744 }
7745
7746 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7747 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
7748 const bool Is64 = NodePtr.getValueType() == MVT::i64;
7749 const unsigned NumVDataDwords = 4;
7750 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7751 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7752 const bool UseNSA =
7753 Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
7754 const unsigned BaseOpcodes[2][2] = {
7755 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7756 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7757 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7758 int Opcode;
7759 if (UseNSA) {
7760 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7761 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
7762 : AMDGPU::MIMGEncGfx10NSA,
7763 NumVDataDwords, NumVAddrDwords);
7764 } else {
7765 Opcode =
7766 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7767 IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default
7768 : AMDGPU::MIMGEncGfx10Default,
7769 NumVDataDwords, NumVAddrDwords);
7770 }
7771 assert(Opcode != -1);
7772
7773 SmallVector<SDValue, 16> Ops;
7774
7775 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
7776 SmallVector<SDValue, 3> Lanes;
7777 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
7778 if (Lanes[0].getValueSizeInBits() == 32) {
7779 for (unsigned I = 0; I < 3; ++I)
7780 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
7781 } else {
7782 if (IsAligned) {
7783 Ops.push_back(
7784 DAG.getBitcast(MVT::i32,
7785 DAG.getBuildVector(MVT::v2f16, DL,
7786 { Lanes[0], Lanes[1] })));
7787 Ops.push_back(Lanes[2]);
7788 } else {
7789 SDValue Elt0 = Ops.pop_back_val();
7790 Ops.push_back(
7791 DAG.getBitcast(MVT::i32,
7792 DAG.getBuildVector(MVT::v2f16, DL,
7793 { Elt0, Lanes[0] })));
7794 Ops.push_back(
7795 DAG.getBitcast(MVT::i32,
7796 DAG.getBuildVector(MVT::v2f16, DL,
7797 { Lanes[1], Lanes[2] })));
7798 }
7799 }
7800 };
7801
7802 if (UseNSA && IsGFX11Plus) {
7803 Ops.push_back(NodePtr);
7804 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
7805 Ops.push_back(RayOrigin);
7806 if (IsA16) {
7807 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
7808 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
7809 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
7810 for (unsigned I = 0; I < 3; ++I) {
7811 MergedLanes.push_back(DAG.getBitcast(
7812 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
7813 {DirLanes[I], InvDirLanes[I]})));
7814 }
7815 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
7816 } else {
7817 Ops.push_back(RayDir);
7818 Ops.push_back(RayInvDir);
7819 }
7820 } else {
7821 if (Is64)
7822 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
7823 2);
7824 else
7825 Ops.push_back(NodePtr);
7826
7827 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
7828 packLanes(RayOrigin, true);
7829 packLanes(RayDir, true);
7830 packLanes(RayInvDir, false);
7831 }
7832
7833 if (!UseNSA) {
7834 // Build a single vector containing all the operands so far prepared.
7835 if (NumVAddrDwords > 12) {
7836 SDValue Undef = DAG.getUNDEF(MVT::i32);
7837 Ops.append(16 - Ops.size(), Undef);
7838 }
7839 assert(Ops.size() >= 8 && Ops.size() <= 12);
7840 SDValue MergedOps = DAG.getBuildVector(
7841 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
7842 Ops.clear();
7843 Ops.push_back(MergedOps);
7844 }
7845
7846 Ops.push_back(TDescr);
7847 if (IsA16)
7848 Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
7849 Ops.push_back(M->getChain());
7850
7851 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
7852 MachineMemOperand *MemRef = M->getMemOperand();
7853 DAG.setNodeMemRefs(NewNode, {MemRef});
7854 return SDValue(NewNode, 0);
7855 }
7856 case Intrinsic::amdgcn_global_atomic_fadd: {
7857 if (!Subtarget->hasAtomicFaddNoRtnInsts())
7858 return makeV_ILLEGAL(Op, DAG);
7859 return SDValue();
7860 }
7861 case Intrinsic::amdgcn_global_atomic_fmin:
7862 case Intrinsic::amdgcn_global_atomic_fmax:
7863 case Intrinsic::amdgcn_flat_atomic_fmin:
7864 case Intrinsic::amdgcn_flat_atomic_fmax: {
7865 MemSDNode *M = cast<MemSDNode>(Op);
7866 SDValue Ops[] = {
7867 M->getOperand(0), // Chain
7868 M->getOperand(2), // Ptr
7869 M->getOperand(3) // Value
7870 };
7871 unsigned Opcode = 0;
7872 switch (IntrID) {
7873 case Intrinsic::amdgcn_global_atomic_fmin:
7874 case Intrinsic::amdgcn_flat_atomic_fmin: {
7875 Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
7876 break;
7877 }
7878 case Intrinsic::amdgcn_global_atomic_fmax:
7879 case Intrinsic::amdgcn_flat_atomic_fmax: {
7880 Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
7881 break;
7882 }
7883 default:
7884 llvm_unreachable("unhandled atomic opcode");
7885 }
7886 return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
7887 M->getVTList(), Ops, M->getMemoryVT(),
7888 M->getMemOperand());
7889 }
7890 default:
7891
7892 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7893 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7894 return lowerImage(Op, ImageDimIntr, DAG, true);
7895
7896 return SDValue();
7897 }
7898 }
7899
7900 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
7901 // dwordx4 if on SI and handle TFE loads.
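//
// Illustrative TFE example (types hypothetical): a v2f32 load with a status
// result is widened to a v3i32 load; dwords 0-1 are bitcast back to v2f32 and
// dword 2 is returned as the TFE status value.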
7902 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
7903 SDVTList VTList,
7904 ArrayRef<SDValue> Ops, EVT MemVT,
7905 MachineMemOperand *MMO,
7906 SelectionDAG &DAG) const {
7907 LLVMContext &C = *DAG.getContext();
7908 MachineFunction &MF = DAG.getMachineFunction();
7909 EVT VT = VTList.VTs[0];
7910
7911 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
7912 bool IsTFE = VTList.NumVTs == 3;
7913 if (IsTFE) {
7914 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
7915 unsigned NumOpDWords = NumValueDWords + 1;
7916 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
7917 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
7918 MachineMemOperand *OpDWordsMMO =
7919 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
7920 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
7921 OpDWordsVT, OpDWordsMMO, DAG);
7922 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
7923 DAG.getVectorIdxConstant(NumValueDWords, DL));
7924 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
7925 SDValue ValueDWords =
7926 NumValueDWords == 1
7927 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
7928 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
7929 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
7930 ZeroIdx);
7931 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
7932 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
7933 }
7934
7935 if (!Subtarget->hasDwordx3LoadStores() &&
7936 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
7937 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
7938 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
7939 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
7940 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
7941 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
7942 WidenedMemVT, WidenedMMO);
7943 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
7944 DAG.getVectorIdxConstant(0, DL));
7945 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
7946 }
7947
7948 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
7949 }
7950
7951 SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
7952 bool ImageStore) const {
7953 EVT StoreVT = VData.getValueType();
7954
7955 // No change for f16 and legal vector D16 types.
7956 if (!StoreVT.isVector())
7957 return VData;
7958
7959 SDLoc DL(VData);
7960 unsigned NumElements = StoreVT.getVectorNumElements();
7961
7962 if (Subtarget->hasUnpackedD16VMem()) {
7963 // We need to unpack the packed data to store.
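    // Illustrative example (type hypothetical): a v2f16 value is stored as a
    // v2i32 whose lanes each hold one zero-extended 16-bit element.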
7964 EVT IntStoreVT = StoreVT.changeTypeToInteger();
7965 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
7966
7967 EVT EquivStoreVT =
7968 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
7969 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
7970 return DAG.UnrollVectorOp(ZExt.getNode());
7971 }
7972
7973 // The sq block of gfx8.1 does not estimate register use correctly for d16
7974 // image store instructions. The data operand is computed as if it were not a
7975 // d16 image instruction.
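  // Illustrative example (type hypothetical): a v3f16 store value is repacked
  // as three i32 dwords {pack(e0,e1), pack(e2,undef), undef} before the store.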
7976 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
7977 // Bitcast to i16
7978 EVT IntStoreVT = StoreVT.changeTypeToInteger();
7979 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
7980
7981 // Decompose into scalars
7982 SmallVector<SDValue, 4> Elts;
7983 DAG.ExtractVectorElements(IntVData, Elts);
7984
7985 // Group pairs of i16 into v2i16 and bitcast to i32
7986 SmallVector<SDValue, 4> PackedElts;
7987 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
7988 SDValue Pair =
7989 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
7990 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
7991 PackedElts.push_back(IntPair);
7992 }
7993 if ((NumElements % 2) == 1) {
7994 // Handle v3i16
7995 unsigned I = Elts.size() / 2;
7996 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
7997 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
7998 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
7999 PackedElts.push_back(IntPair);
8000 }
8001
8002 // Pad using UNDEF
8003 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
8004
8005 // Build final vector
8006 EVT VecVT =
8007 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
8008 return DAG.getBuildVector(VecVT, DL, PackedElts);
8009 }
8010
8011 if (NumElements == 3) {
8012 EVT IntStoreVT =
8013 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
8014 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
8015
8016 EVT WidenedStoreVT = EVT::getVectorVT(
8017 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
8018 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
8019 WidenedStoreVT.getStoreSizeInBits());
8020 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
8021 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
8022 }
8023
8024 assert(isTypeLegal(StoreVT));
8025 return VData;
8026 }
8027
8028 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
8029 SelectionDAG &DAG) const {
8030 SDLoc DL(Op);
8031 SDValue Chain = Op.getOperand(0);
8032 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
8033 MachineFunction &MF = DAG.getMachineFunction();
8034
8035 switch (IntrinsicID) {
8036 case Intrinsic::amdgcn_exp_compr: {
8037 if (!Subtarget->hasCompressedExport()) {
8038 DiagnosticInfoUnsupported BadIntrin(
8039 DAG.getMachineFunction().getFunction(),
8040 "intrinsic not supported on subtarget", DL.getDebugLoc());
8041 DAG.getContext()->diagnose(BadIntrin);
8042 }
8043 SDValue Src0 = Op.getOperand(4);
8044 SDValue Src1 = Op.getOperand(5);
8045 // Hack around illegal type on SI by directly selecting it.
8046 if (isTypeLegal(Src0.getValueType()))
8047 return SDValue();
8048
8049 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
8050 SDValue Undef = DAG.getUNDEF(MVT::f32);
8051 const SDValue Ops[] = {
8052 Op.getOperand(2), // tgt
8053 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
8054 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
8055 Undef, // src2
8056 Undef, // src3
8057 Op.getOperand(7), // vm
8058 DAG.getTargetConstant(1, DL, MVT::i1), // compr
8059 Op.getOperand(3), // en
8060 Op.getOperand(0) // Chain
8061 };
8062
8063 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
8064 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
8065 }
8066 case Intrinsic::amdgcn_s_barrier: {
8067 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
8068 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
8069 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
8070 if (WGSize <= ST.getWavefrontSize())
8071 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
8072 Op.getOperand(0)), 0);
8073 }
8074 return SDValue();
8075   }
8076 case Intrinsic::amdgcn_tbuffer_store: {
8077 SDValue VData = Op.getOperand(2);
8078 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8079 if (IsD16)
8080 VData = handleD16VData(VData, DAG);
8081 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
8082 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
8083 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
8084 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
8085 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8086 SDValue Ops[] = {
8087 Chain,
8088 VData, // vdata
8089 Op.getOperand(3), // rsrc
8090 Op.getOperand(4), // vindex
8091 Op.getOperand(5), // voffset
8092 Op.getOperand(6), // soffset
8093 Op.getOperand(7), // offset
8094 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8095 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8096 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8097 };
8098 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
8099 AMDGPUISD::TBUFFER_STORE_FORMAT;
8100 MemSDNode *M = cast<MemSDNode>(Op);
8101 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8102 M->getMemoryVT(), M->getMemOperand());
8103 }
8104
8105 case Intrinsic::amdgcn_struct_tbuffer_store: {
8106 SDValue VData = Op.getOperand(2);
8107 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8108 if (IsD16)
8109 VData = handleD16VData(VData, DAG);
8110 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8111 SDValue Ops[] = {
8112 Chain,
8113 VData, // vdata
8114 Op.getOperand(3), // rsrc
8115 Op.getOperand(4), // vindex
8116 Offsets.first, // voffset
8117 Op.getOperand(6), // soffset
8118 Offsets.second, // offset
8119 Op.getOperand(7), // format
8120 Op.getOperand(8), // cachepolicy, swizzled buffer
8121 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8122 };
8123 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
8124 AMDGPUISD::TBUFFER_STORE_FORMAT;
8125 MemSDNode *M = cast<MemSDNode>(Op);
8126 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8127 M->getMemoryVT(), M->getMemOperand());
8128 }
8129
8130 case Intrinsic::amdgcn_raw_tbuffer_store: {
8131 SDValue VData = Op.getOperand(2);
8132 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8133 if (IsD16)
8134 VData = handleD16VData(VData, DAG);
8135 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8136 SDValue Ops[] = {
8137 Chain,
8138 VData, // vdata
8139 Op.getOperand(3), // rsrc
8140 DAG.getConstant(0, DL, MVT::i32), // vindex
8141 Offsets.first, // voffset
8142 Op.getOperand(5), // soffset
8143 Offsets.second, // offset
8144 Op.getOperand(6), // format
8145 Op.getOperand(7), // cachepolicy, swizzled buffer
8146 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8147 };
8148 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
8149 AMDGPUISD::TBUFFER_STORE_FORMAT;
8150 MemSDNode *M = cast<MemSDNode>(Op);
8151 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8152 M->getMemoryVT(), M->getMemOperand());
8153 }
8154
8155 case Intrinsic::amdgcn_buffer_store:
8156 case Intrinsic::amdgcn_buffer_store_format: {
8157 SDValue VData = Op.getOperand(2);
8158 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8159 if (IsD16)
8160 VData = handleD16VData(VData, DAG);
8161 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
8162 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
8163 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8164 SDValue Ops[] = {
8165 Chain,
8166 VData,
8167 Op.getOperand(3), // rsrc
8168 Op.getOperand(4), // vindex
8169 SDValue(), // voffset -- will be set by setBufferOffsets
8170 SDValue(), // soffset -- will be set by setBufferOffsets
8171 SDValue(), // offset -- will be set by setBufferOffsets
8172 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8173 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8174 };
8175 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
8176
8177 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
8178 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
8179 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
8180 MemSDNode *M = cast<MemSDNode>(Op);
8181 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
8182
8183 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
8184 EVT VDataType = VData.getValueType().getScalarType();
8185 if (VDataType == MVT::i8 || VDataType == MVT::i16)
8186 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
8187
8188 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8189 M->getMemoryVT(), M->getMemOperand());
8190 }
8191
8192 case Intrinsic::amdgcn_raw_buffer_store:
8193 case Intrinsic::amdgcn_raw_buffer_store_format: {
8194 const bool IsFormat =
8195 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;
8196
8197 SDValue VData = Op.getOperand(2);
8198 EVT VDataVT = VData.getValueType();
8199 EVT EltType = VDataVT.getScalarType();
8200 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
8201 if (IsD16) {
8202 VData = handleD16VData(VData, DAG);
8203 VDataVT = VData.getValueType();
8204 }
8205
8206 if (!isTypeLegal(VDataVT)) {
8207 VData =
8208 DAG.getNode(ISD::BITCAST, DL,
8209 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
8210 }
8211
8212 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8213 SDValue Ops[] = {
8214 Chain,
8215 VData,
8216 Op.getOperand(3), // rsrc
8217 DAG.getConstant(0, DL, MVT::i32), // vindex
8218 Offsets.first, // voffset
8219 Op.getOperand(5), // soffset
8220 Offsets.second, // offset
8221 Op.getOperand(6), // cachepolicy, swizzled buffer
8222 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8223 };
8224 unsigned Opc =
8225 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
8226 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
8227 MemSDNode *M = cast<MemSDNode>(Op);
8228 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
8229
8230 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
8231 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
8232 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
8233
8234 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8235 M->getMemoryVT(), M->getMemOperand());
8236 }
8237
8238 case Intrinsic::amdgcn_struct_buffer_store:
8239 case Intrinsic::amdgcn_struct_buffer_store_format: {
8240 const bool IsFormat =
8241 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;
8242
8243 SDValue VData = Op.getOperand(2);
8244 EVT VDataVT = VData.getValueType();
8245 EVT EltType = VDataVT.getScalarType();
8246 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
8247
8248 if (IsD16) {
8249 VData = handleD16VData(VData, DAG);
8250 VDataVT = VData.getValueType();
8251 }
8252
8253 if (!isTypeLegal(VDataVT)) {
8254 VData =
8255 DAG.getNode(ISD::BITCAST, DL,
8256 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
8257 }
8258
8259 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8260 SDValue Ops[] = {
8261 Chain,
8262 VData,
8263 Op.getOperand(3), // rsrc
8264 Op.getOperand(4), // vindex
8265 Offsets.first, // voffset
8266 Op.getOperand(6), // soffset
8267 Offsets.second, // offset
8268 Op.getOperand(7), // cachepolicy, swizzled buffer
8269 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8270 };
8271 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
8272 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
8273 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
8274 MemSDNode *M = cast<MemSDNode>(Op);
8275 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
8276
8277 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
8278 EVT VDataType = VData.getValueType().getScalarType();
8279 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
8280 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
8281
8282 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8283 M->getMemoryVT(), M->getMemOperand());
8284 }
8285 case Intrinsic::amdgcn_raw_buffer_load_lds:
8286 case Intrinsic::amdgcn_struct_buffer_load_lds: {
8287 unsigned Opc;
8288 bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds;
8289 unsigned OpOffset = HasVIndex ? 1 : 0;
8290 SDValue VOffset = Op.getOperand(5 + OpOffset);
8291 auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
8292 bool HasVOffset = !CVOffset || !CVOffset->isZero();
8293 unsigned Size = Op->getConstantOperandVal(4);
8294
8295 switch (Size) {
8296 default:
8297 return SDValue();
8298 case 1:
8299 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
8300 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
8301 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
8302 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
8303 break;
8304 case 2:
8305 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
8306 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
8307 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
8308 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
8309 break;
8310 case 4:
8311 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
8312 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
8313 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
8314 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
8315 break;
8316 }
8317
8318 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
8319
8320 SmallVector<SDValue, 8> Ops;
8321
8322 if (HasVIndex && HasVOffset)
8323 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
8324 { Op.getOperand(5), // VIndex
8325 VOffset }));
8326 else if (HasVIndex)
8327 Ops.push_back(Op.getOperand(5));
8328 else if (HasVOffset)
8329 Ops.push_back(VOffset);
8330
8331 Ops.push_back(Op.getOperand(2)); // rsrc
8332 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
8333 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
8334 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
8335 Ops.push_back(
8336 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
8337 Ops.push_back(
8338 DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz
8339 Ops.push_back(M0Val.getValue(0)); // Chain
8340 Ops.push_back(M0Val.getValue(1)); // Glue
8341
8342 auto *M = cast<MemSDNode>(Op);
8343 MachineMemOperand *LoadMMO = M->getMemOperand();
8344 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
8345 LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
8346 MachinePointerInfo StorePtrI = LoadPtrI;
8347 StorePtrI.V = nullptr;
8348 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
8349
8350 auto F = LoadMMO->getFlags() &
8351 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
8352 LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
8353 Size, LoadMMO->getBaseAlign());
8354
8355 MachineMemOperand *StoreMMO =
8356 MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
8357 sizeof(int32_t), LoadMMO->getBaseAlign());
8358
8359 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
8360 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
8361
8362 return SDValue(Load, 0);
8363 }
8364 case Intrinsic::amdgcn_global_load_lds: {
8365 unsigned Opc;
8366 unsigned Size = Op->getConstantOperandVal(4);
8367 switch (Size) {
8368 default:
8369 return SDValue();
8370 case 1:
8371 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
8372 break;
8373 case 2:
8374 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
8375 break;
8376 case 4:
8377 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
8378 break;
8379 }
8380
8381 auto *M = cast<MemSDNode>(Op);
8382 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
8383
8384 SmallVector<SDValue, 6> Ops;
8385
8386 SDValue Addr = Op.getOperand(2); // Global ptr
8387 SDValue VOffset;
8388 // Try to split SAddr and VOffset. Global and LDS pointers share the same
8389 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
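    // Illustrative example: for addr = (uniform i64 base) + zext(divergent
    // i32 off), the uniform base is used as the SGPR address and the
    // divergent 32-bit part becomes the VGPR offset.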
8390 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
8391 SDValue LHS = Addr.getOperand(0);
8392 SDValue RHS = Addr.getOperand(1);
8393
8394 if (LHS->isDivergent())
8395 std::swap(LHS, RHS);
8396
8397 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
8398 RHS.getOperand(0).getValueType() == MVT::i32) {
8399 // add (i64 sgpr), (zero_extend (i32 vgpr))
8400 Addr = LHS;
8401 VOffset = RHS.getOperand(0);
8402 }
8403 }
8404
8405 Ops.push_back(Addr);
8406 if (!Addr->isDivergent()) {
8407 Opc = AMDGPU::getGlobalSaddrOp(Opc);
8408 if (!VOffset)
8409 VOffset = SDValue(
8410 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
8411 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
8412 Ops.push_back(VOffset);
8413 }
8414
8415 Ops.push_back(Op.getOperand(5)); // Offset
8416 Ops.push_back(Op.getOperand(6)); // CPol
8417 Ops.push_back(M0Val.getValue(0)); // Chain
8418 Ops.push_back(M0Val.getValue(1)); // Glue
8419
8420 MachineMemOperand *LoadMMO = M->getMemOperand();
8421 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
8422 LoadPtrI.Offset = Op->getConstantOperandVal(5);
8423 MachinePointerInfo StorePtrI = LoadPtrI;
8424 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
8425 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
8426 auto F = LoadMMO->getFlags() &
8427 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
8428 LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
8429 Size, LoadMMO->getBaseAlign());
8430 MachineMemOperand *StoreMMO =
8431 MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
8432 sizeof(int32_t), Align(4));
8433
8434 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
8435 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
8436
8437 return SDValue(Load, 0);
8438 }
8439 case Intrinsic::amdgcn_end_cf:
8440 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
8441 Op->getOperand(2), Chain), 0);
8442
8443 default: {
8444 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8445 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8446 return lowerImage(Op, ImageDimIntr, DAG, true);
8447
8448 return Op;
8449 }
8450 }
8451 }
8452
8453 SDValue SITargetLowering::makeV_ILLEGAL(SDValue Op, SelectionDAG &DAG) const {
8454 // Create the V_ILLEGAL node.
8455 SDLoc DL(Op);
8456 auto Opcode = Subtarget->getGeneration() < AMDGPUSubtarget::GFX10 ?
8457 AMDGPU::V_ILLEGAL_gfx6_gfx7_gfx8_gfx9 : AMDGPU::V_ILLEGAL;
8458 auto EntryNode = DAG.getEntryNode();
8459 auto IllegalNode = DAG.getMachineNode(Opcode, DL, MVT::Other, EntryNode);
8460 auto IllegalVal = SDValue(IllegalNode, 0u);
8461
8462 // Add the V_ILLEGAL node to the root chain to prevent its removal.
8463 auto Chains = SmallVector<SDValue, 2u>();
8464 Chains.push_back(IllegalVal);
8465 Chains.push_back(DAG.getRoot());
8466 auto Root = DAG.getTokenFactor(SDLoc(Chains.back()), Chains);
8467 DAG.setRoot(Root);
8468
8469 // Merge with UNDEF to satisfy return value requirements.
8470 auto UndefVal = DAG.getUNDEF(Op.getValueType());
8471 return DAG.getMergeValues({UndefVal, IllegalVal}, DL);
8472 }
8473
8474 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
8475 // offset (the offset that is included in bounds checking and swizzling, to be
8476 // split between the instruction's voffset and immoffset fields) and soffset
8477 // (the offset that is excluded from bounds checking and swizzling, to go in
8478 // the instruction's soffset field). This function takes the first kind of
8479 // offset and figures out how to split it between voffset and immoffset.
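//
// Illustrative example (values hypothetical): a combined offset of 4104 is
// split into voffset 4096 and immoffset 8, since 4095 is the largest value
// the immoffset field can hold here.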
8480 std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
8481 SDValue Offset, SelectionDAG &DAG) const {
8482 SDLoc DL(Offset);
8483 const unsigned MaxImm = 4095;
8484 SDValue N0 = Offset;
8485 ConstantSDNode *C1 = nullptr;
8486
8487 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
8488 N0 = SDValue();
8489 else if (DAG.isBaseWithConstantOffset(N0)) {
8490 C1 = cast<ConstantSDNode>(N0.getOperand(1));
8491 N0 = N0.getOperand(0);
8492 }
8493
8494 if (C1) {
8495 unsigned ImmOffset = C1->getZExtValue();
8496     // If the immediate value is too big for the immoffset field, keep only its
8497     // low 12 bits (value & 4095) there, so that the value copied/added into the
8498     // voffset field is a multiple of 4096 and stands more chance of being CSEd
8499     // with the copy/add for another similar load/store.
8500 // However, do not do that rounding down to a multiple of 4096 if that is a
8501 // negative number, as it appears to be illegal to have a negative offset
8502 // in the vgpr, even if adding the immediate offset makes it positive.
8503 unsigned Overflow = ImmOffset & ~MaxImm;
8504 ImmOffset -= Overflow;
8505 if ((int32_t)Overflow < 0) {
8506 Overflow += ImmOffset;
8507 ImmOffset = 0;
8508 }
8509 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
8510 if (Overflow) {
8511 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
8512 if (!N0)
8513 N0 = OverflowVal;
8514 else {
8515 SDValue Ops[] = { N0, OverflowVal };
8516 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
8517 }
8518 }
8519 }
8520 if (!N0)
8521 N0 = DAG.getConstant(0, DL, MVT::i32);
8522 if (!C1)
8523 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
8524 return {N0, SDValue(C1, 0)};
8525 }
8526
8527 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
8528 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
8529 // pointed to by Offsets.
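//
// Illustrative example (values hypothetical): a small constant combined
// offset such as 16 may become {voffset 0, soffset 0, instoffset 16}; larger
// constants may be split between soffset and instoffset, depending on what
// AMDGPU::splitMUBUFOffset accepts for the subtarget.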
8530 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
8531 SelectionDAG &DAG, SDValue *Offsets,
8532 Align Alignment) const {
8533 SDLoc DL(CombinedOffset);
8534 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
8535 uint32_t Imm = C->getZExtValue();
8536 uint32_t SOffset, ImmOffset;
8537 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget,
8538 Alignment)) {
8539 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
8540 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
8541 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
8542 return;
8543 }
8544 }
8545 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
8546 SDValue N0 = CombinedOffset.getOperand(0);
8547 SDValue N1 = CombinedOffset.getOperand(1);
8548 uint32_t SOffset, ImmOffset;
8549 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
8550 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
8551 Subtarget, Alignment)) {
8552 Offsets[0] = N0;
8553 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
8554 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
8555 return;
8556 }
8557 }
8558 Offsets[0] = CombinedOffset;
8559 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
8560 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
8561 }
8562
8563 // Handle 8 bit and 16 bit buffer loads
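//
// Illustrative example (types hypothetical): an i8 result is loaded with
// BUFFER_LOAD_UBYTE producing an i32, which is truncated back to i8; i16
// results use BUFFER_LOAD_USHORT the same way.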
8564 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
8565 EVT LoadVT, SDLoc DL,
8566 ArrayRef<SDValue> Ops,
8567 MemSDNode *M) const {
8568 EVT IntVT = LoadVT.changeTypeToInteger();
8569 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
8570 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
8571
8572 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
8573 SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
8574 Ops, IntVT,
8575 M->getMemOperand());
8576 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
8577 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
8578
8579 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
8580 }
8581
8582 // Handle 8 bit and 16 bit buffer stores
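//
// Illustrative example (type hypothetical): an f16 store value is bitcast to
// i16, any-extended to i32, and emitted as BUFFER_STORE_SHORT.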
8583 SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
8584 EVT VDataType, SDLoc DL,
8585 SDValue Ops[],
8586 MemSDNode *M) const {
8587 if (VDataType == MVT::f16)
8588 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
8589
8590 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
8591 Ops[1] = BufferStoreExt;
8592 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
8593 AMDGPUISD::BUFFER_STORE_SHORT;
8594 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
8595 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
8596 M->getMemOperand());
8597 }
8598
8599 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
8600 ISD::LoadExtType ExtType, SDValue Op,
8601 const SDLoc &SL, EVT VT) {
8602 if (VT.bitsLT(Op.getValueType()))
8603 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
8604
8605 switch (ExtType) {
8606 case ISD::SEXTLOAD:
8607 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
8608 case ISD::ZEXTLOAD:
8609 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
8610 case ISD::EXTLOAD:
8611 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
8612 case ISD::NON_EXTLOAD:
8613 return Op;
8614 }
8615
8616 llvm_unreachable("invalid ext type");
8617 }
8618
8619 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
8620 SelectionDAG &DAG = DCI.DAG;
8621 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
8622 return SDValue();
8623
8624 // FIXME: Constant loads should all be marked invariant.
8625 unsigned AS = Ld->getAddressSpace();
8626 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
8627 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8628 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
8629 return SDValue();
8630
8631 // Don't do this early, since it may interfere with adjacent load merging for
8632 // illegal types. We can avoid losing alignment information for exotic types
8633 // pre-legalize.
8634 EVT MemVT = Ld->getMemoryVT();
8635 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
8636 MemVT.getSizeInBits() >= 32)
8637 return SDValue();
8638
8639 SDLoc SL(Ld);
8640
8641 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
8642 "unexpected vector extload");
8643
8644 // TODO: Drop only high part of range.
8645 SDValue Ptr = Ld->getBasePtr();
8646 SDValue NewLoad = DAG.getLoad(
8647 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
8648 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
8649 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
8650 nullptr); // Drop ranges
8651
8652 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
8653 if (MemVT.isFloatingPoint()) {
8654 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
8655 "unexpected fp extload");
8656 TruncVT = MemVT.changeTypeToInteger();
8657 }
8658
8659 SDValue Cvt = NewLoad;
8660 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
8661 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
8662 DAG.getValueType(TruncVT));
8663 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
8664 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
8665 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
8666 } else {
8667 assert(Ld->getExtensionType() == ISD::EXTLOAD);
8668 }
8669
8670 EVT VT = Ld->getValueType(0);
8671 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
8672
8673 DCI.AddToWorklist(Cvt.getNode());
8674
8675 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
8676 // the appropriate extension from the 32-bit load.
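  // Illustrative example (types hypothetical): an i16->i64 sextload becomes a
  // 32-bit load, a sign-extend-in-reg from i16 within i32, and a final
  // sign-extend to i64.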
8677 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
8678 DCI.AddToWorklist(Cvt.getNode());
8679
8680 // Handle conversion back to floating point if necessary.
8681 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
8682
8683 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
8684 }
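// Illustrative sketch of the widening above (hypothetical helper, not part of
// this file): a uniform sub-dword extending load is modelled as a full 32-bit
// load followed by an in-register extend of the low MemBits bits, matching
// the SIGN_EXTEND_INREG / getZeroExtendInReg nodes emitted here.
#if 0 // Example only; not compiled.
static unsigned widenedLoadModel(unsigned LoadedDword, unsigned MemBits,
                                 bool IsSignExt) {
  unsigned Mask = MemBits >= 32 ? ~0u : ((1u << MemBits) - 1);
  unsigned Lo = LoadedDword & Mask;        // zero-extend-in-reg
  if (!IsSignExt)
    return Lo;
  unsigned SignBit = 1u << (MemBits - 1);  // sign-extend-in-reg
  return (Lo ^ SignBit) - SignBit;
}
#endif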
8685
8686 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
8687 SDLoc DL(Op);
8688 LoadSDNode *Load = cast<LoadSDNode>(Op);
8689 ISD::LoadExtType ExtType = Load->getExtensionType();
8690 EVT MemVT = Load->getMemoryVT();
8691
8692 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
8693 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
8694 return SDValue();
8695
8696 // FIXME: Copied from PPC
8697 // First, load into 32 bits, then truncate to 1 bit.
8698
8699 SDValue Chain = Load->getChain();
8700 SDValue BasePtr = Load->getBasePtr();
8701 MachineMemOperand *MMO = Load->getMemOperand();
8702
8703 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
8704
8705 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
8706 BasePtr, RealMemVT, MMO);
8707
8708 if (!MemVT.isVector()) {
8709 SDValue Ops[] = {
8710 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
8711 NewLD.getValue(1)
8712 };
8713
8714 return DAG.getMergeValues(Ops, DL);
8715 }
8716
8717 SmallVector<SDValue, 3> Elts;
8718 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
8719 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
8720 DAG.getConstant(I, DL, MVT::i32));
8721
8722 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
8723 }
8724
8725 SDValue Ops[] = {
8726 DAG.getBuildVector(MemVT, DL, Elts),
8727 NewLD.getValue(1)
8728 };
8729
8730 return DAG.getMergeValues(Ops, DL);
8731 }
8732
8733 if (!MemVT.isVector())
8734 return SDValue();
8735
8736 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
8737 "Custom lowering for non-i32 vectors hasn't been implemented.");
8738
8739 Align Alignment = Load->getAlign();
8740 unsigned AS = Load->getAddressSpace();
8741 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
8742 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
8743 return SplitVectorLoad(Op, DAG);
8744 }
8745
8746 MachineFunction &MF = DAG.getMachineFunction();
8747 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8748 // If there is a possibility that a flat instruction may access scratch memory,
8749 // then we need to use the same legalization rules we use for private.
8750 if (AS == AMDGPUAS::FLAT_ADDRESS &&
8751 !Subtarget->hasMultiDwordFlatScratchAddressing())
8752 AS = MFI->hasFlatScratchInit() ?
8753 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
8754
8755 unsigned NumElements = MemVT.getVectorNumElements();
8756
8757 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
8758 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
8759 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
8760 if (MemVT.isPow2VectorType())
8761 return SDValue();
8762 return WidenOrSplitVectorLoad(Op, DAG);
8763 }
8764 // Non-uniform loads will be selected to MUBUF instructions, so they
8765 // have the same legalization requirements as global and private
8766 // loads.
8767 //
8768 }
8769
8770 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
8771 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
8772 AS == AMDGPUAS::GLOBAL_ADDRESS) {
8773 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
8774 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
8775 Alignment >= Align(4) && NumElements < 32) {
8776 if (MemVT.isPow2VectorType())
8777 return SDValue();
8778 return WidenOrSplitVectorLoad(Op, DAG);
8779 }
8780 // Non-uniform loads will be selected to MUBUF instructions, so they
8781 // have the same legalization requirements as global and private
8782 // loads.
8783 //
8784 }
8785 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
8786 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
8787 AS == AMDGPUAS::GLOBAL_ADDRESS ||
8788 AS == AMDGPUAS::FLAT_ADDRESS) {
8789 if (NumElements > 4)
8790 return SplitVectorLoad(Op, DAG);
8791 // v3 loads not supported on SI.
8792 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
8793 return WidenOrSplitVectorLoad(Op, DAG);
8794
8795 // v3 and v4 loads are supported for private and global memory.
8796 return SDValue();
8797 }
8798 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
8799 // Depending on the setting of the private_element_size field in the
8800 // resource descriptor, we can only make private accesses up to a certain
8801 // size.
8802 switch (Subtarget->getMaxPrivateElementSize()) {
8803 case 4: {
8804 SDValue Ops[2];
8805 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
8806 return DAG.getMergeValues(Ops, DL);
8807 }
8808 case 8:
8809 if (NumElements > 2)
8810 return SplitVectorLoad(Op, DAG);
8811 return SDValue();
8812 case 16:
8813 // Same as global/flat
8814 if (NumElements > 4)
8815 return SplitVectorLoad(Op, DAG);
8816 // v3 loads not supported on SI.
8817 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
8818 return WidenOrSplitVectorLoad(Op, DAG);
8819
8820 return SDValue();
8821 default:
8822 llvm_unreachable("unsupported private_element_size");
8823 }
8824 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
8825 unsigned Fast = 0;
8826 auto Flags = Load->getMemOperand()->getFlags();
8827 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
8828 Load->getAlign(), Flags, &Fast) &&
8829 Fast > 1)
8830 return SDValue();
8831
8832 if (MemVT.isVector())
8833 return SplitVectorLoad(Op, DAG);
8834 }
8835
8836 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
8837 MemVT, *Load->getMemOperand())) {
8838 SDValue Ops[2];
8839 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
8840 return DAG.getMergeValues(Ops, DL);
8841 }
8842
8843 return SDValue();
8844 }
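// Illustrative sketch of the sub-dword path above (hypothetical helper, not
// part of this file): an <N x i1> value loaded as a single byte is unpacked
// by shifting each bit into position and truncating, mirroring the
// per-element SRL + TRUNCATE sequence built for vector i1 loads.
#if 0 // Example only; not compiled.
static void unpackI1VectorModel(unsigned LoadedByte, bool *Out, unsigned N) {
  for (unsigned I = 0; I != N; ++I)
    Out[I] = (LoadedByte >> I) & 1; // element I comes from bit I
}
#endif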
8845
8846 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
8847 EVT VT = Op.getValueType();
8848 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
8849 return splitTernaryVectorOp(Op, DAG);
8850
8851 assert(VT.getSizeInBits() == 64);
8852
8853 SDLoc DL(Op);
8854 SDValue Cond = Op.getOperand(0);
8855
8856 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
8857 SDValue One = DAG.getConstant(1, DL, MVT::i32);
8858
8859 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8860 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
8861
8862 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
8863 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
8864
8865 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
8866
8867 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
8868 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
8869
8870 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
8871
8872 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
8873 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
8874 }
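// Illustrative sketch (hypothetical helper, not part of this file): the
// 64-bit select above is decomposed into two 32-bit selects on the low and
// high halves, which is what the bitcast-to-v2i32 plus per-element select
// sequence builds.
#if 0 // Example only; not compiled.
static unsigned long long select64Model(bool Cond, unsigned long long A,
                                        unsigned long long B) {
  unsigned Lo = Cond ? (unsigned)A : (unsigned)B;
  unsigned Hi = Cond ? (unsigned)(A >> 32) : (unsigned)(B >> 32);
  return ((unsigned long long)Hi << 32) | Lo;
}
#endif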
8875
8876 // Catch division cases where we can use shortcuts with rcp and rsq
8877 // instructions.
8878 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
8879 SelectionDAG &DAG) const {
8880 SDLoc SL(Op);
8881 SDValue LHS = Op.getOperand(0);
8882 SDValue RHS = Op.getOperand(1);
8883 EVT VT = Op.getValueType();
8884 const SDNodeFlags Flags = Op->getFlags();
8885
8886 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
8887
8888 // Without !fpmath accuracy information, we can't do more because we don't
8889 // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
8890 if (!AllowInaccurateRcp)
8891 return SDValue();
8892
8893 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
8894 if (CLHS->isExactlyValue(1.0)) {
8895 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
8896 // the CI documentation have a worst case error of 1 ulp.
8897 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
8898 // use it as long as we aren't trying to use denormals.
8899 //
8900 // v_rcp_f16 and v_rsq_f16 DO support denormals.
8901
8902 // 1.0 / sqrt(x) -> rsq(x)
8903
8904 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
8905 // error seems really high at 2^29 ULP.
8906 if (RHS.getOpcode() == ISD::FSQRT)
8907 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
8908
8909 // 1.0 / x -> rcp(x)
8910 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
8911 }
8912
8913 // Same as for 1.0, but expand the sign out of the constant.
8914 if (CLHS->isExactlyValue(-1.0)) {
8915 // -1.0 / x -> rcp (fneg x)
8916 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8917 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
8918 }
8919 }
8920
8921 // Turn into multiply by the reciprocal.
8922 // x / y -> x * (1.0 / y)
8923 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
8924 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
8925 }
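// Illustrative sketch (hypothetical helper, not part of this file): with
// approximate-function flags the division above is rewritten in terms of the
// hardware reciprocal; plain 1.0f/x stands in for AMDGPUISD::RCP, and the
// 1.0/sqrt(x) -> rsq(x) case is omitted for brevity.
#if 0 // Example only; not compiled.
static float fastFDivModel(float LHS, float RHS) {
  if (LHS == 1.0f)             // 1.0 / x  -> rcp(x)
    return 1.0f / RHS;
  if (LHS == -1.0f)            // -1.0 / x -> rcp(fneg x)
    return 1.0f / -RHS;
  return LHS * (1.0f / RHS);   // x / y    -> x * rcp(y)
}
#endif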
8926
8927 SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
8928 SelectionDAG &DAG) const {
8929 SDLoc SL(Op);
8930 SDValue X = Op.getOperand(0);
8931 SDValue Y = Op.getOperand(1);
8932 EVT VT = Op.getValueType();
8933 const SDNodeFlags Flags = Op->getFlags();
8934
8935 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
8936 DAG.getTarget().Options.UnsafeFPMath;
8937 if (!AllowInaccurateDiv)
8938 return SDValue();
8939
8940 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
8941 SDValue One = DAG.getConstantFP(1.0, SL, VT);
8942
8943 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
8944 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
8945
8946 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
8947 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
8948 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
8949 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
8950 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
8951 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
8952 }
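// Illustrative sketch (hypothetical helper, not part of this file): the FMA
// chain above is two Newton-Raphson refinements of the reciprocal followed by
// one refinement of the quotient. Each a*b+c below stands in for a fused FMA
// and 1.0/Y stands in for AMDGPUISD::RCP.
#if 0 // Example only; not compiled.
static double fastFDiv64Model(double X, double Y) {
  double R = 1.0 / Y;             // initial reciprocal estimate
  R = (1.0 - Y * R) * R + R;      // Newton-Raphson step 1
  R = (1.0 - Y * R) * R + R;      // Newton-Raphson step 2
  double Q = X * R;               // initial quotient
  return (X - Y * Q) * R + Q;     // refine the quotient with the residual
}
#endif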
8953
8954 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
8955 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
8956 SDNodeFlags Flags) {
8957 if (GlueChain->getNumValues() <= 1) {
8958 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
8959 }
8960
8961 assert(GlueChain->getNumValues() == 3);
8962
8963 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
8964 switch (Opcode) {
8965 default: llvm_unreachable("no chain equivalent for opcode");
8966 case ISD::FMUL:
8967 Opcode = AMDGPUISD::FMUL_W_CHAIN;
8968 break;
8969 }
8970
8971 return DAG.getNode(Opcode, SL, VTList,
8972 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
8973 Flags);
8974 }
8975
8976 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
8977 EVT VT, SDValue A, SDValue B, SDValue C,
8978 SDValue GlueChain, SDNodeFlags Flags) {
8979 if (GlueChain->getNumValues() <= 1) {
8980 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
8981 }
8982
8983 assert(GlueChain->getNumValues() == 3);
8984
8985 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
8986 switch (Opcode) {
8987 default: llvm_unreachable("no chain equivalent for opcode");
8988 case ISD::FMA:
8989 Opcode = AMDGPUISD::FMA_W_CHAIN;
8990 break;
8991 }
8992
8993 return DAG.getNode(Opcode, SL, VTList,
8994 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
8995 Flags);
8996 }
8997
8998 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
8999 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
9000 return FastLowered;
9001
9002 SDLoc SL(Op);
9003 SDValue Src0 = Op.getOperand(0);
9004 SDValue Src1 = Op.getOperand(1);
9005
9006 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
9007 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
9008
9009 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
9010 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
9011
9012 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
9013 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
9014
9015 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
9016 }
9017
9018 // Faster 2.5 ULP division that does not support denormals.
9019 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
9020 SDLoc SL(Op);
9021 SDValue LHS = Op.getOperand(1);
9022 SDValue RHS = Op.getOperand(2);
9023
9024 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
9025
9026 const APFloat K0Val(BitsToFloat(0x6f800000));
9027 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
9028
9029 const APFloat K1Val(BitsToFloat(0x2f800000));
9030 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
9031
9032 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
9033
9034 EVT SetCCVT =
9035 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
9036
9037 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
9038
9039 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
9040
9041 // TODO: Should this propagate fast-math-flags?
9042 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
9043
9044 // rcp does not support denormals.
9045 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
9046
9047 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
9048
9049 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
9050 }
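// Illustrative sketch (hypothetical helper, not part of this file): when
// |RHS| is larger than 0x6f800000 (2^96) the denominator is pre-scaled by
// 0x2f800000 (2^-32) so the reciprocal does not flush to zero, and the same
// scale is applied to the product to compensate:
//   result = s * (LHS * rcp(RHS * s)), with s in {1.0, 2^-32}.
#if 0 // Example only; not compiled.
static float fdivFastModel(float LHS, float RHS) {
  float AbsRHS = RHS < 0.0f ? -RHS : RHS;
  float S = AbsRHS > 0x1.0p96f ? 0x1.0p-32f : 1.0f; // select the scale
  float R = 1.0f / (RHS * S);                       // rcp of the scaled denominator
  return S * (LHS * R);
}
#endif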
9051
9052 // Returns immediate value for setting the F32 denorm mode when using the
9053 // S_DENORM_MODE instruction.
9054 static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
9055 const SDLoc &SL, const GCNSubtarget *ST) {
9056 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
9057 int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
9058 ? FP_DENORM_FLUSH_NONE
9059 : FP_DENORM_FLUSH_IN_FLUSH_OUT;
9060
9061 int Mode = SPDenormMode | (DPDenormModeDefault << 2);
9062 return DAG.getTargetConstant(Mode, SL, MVT::i32);
9063 }
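// Illustrative sketch (hypothetical helper, not part of this file): the
// S_DENORM_MODE immediate packs the f32 denorm mode into bits [1:0] and the
// f64/f16 mode into bits [3:2], which is the OR-with-shift computed above.
#if 0 // Example only; not compiled.
static int denormModeImmModel(int SPMode, int DPMode) {
  return (SPMode & 0x3) | ((DPMode & 0x3) << 2);
}
#endif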
9064
9065 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
9066 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
9067 return FastLowered;
9068
9069 // The selection matcher assumes anything with a chain selects to a
9070 // mayRaiseFPException machine instruction. Since we're introducing a chain
9071 // here, we need to explicitly report nofpexcept for the regular fdiv
9072 // lowering.
9073 SDNodeFlags Flags = Op->getFlags();
9074 Flags.setNoFPExcept(true);
9075
9076 SDLoc SL(Op);
9077 SDValue LHS = Op.getOperand(0);
9078 SDValue RHS = Op.getOperand(1);
9079
9080 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
9081
9082 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
9083
9084 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
9085 {RHS, RHS, LHS}, Flags);
9086 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
9087 {LHS, RHS, LHS}, Flags);
9088
9089 // Denominator is scaled to not be denormal, so using rcp is ok.
9090 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
9091 DenominatorScaled, Flags);
9092 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
9093 DenominatorScaled, Flags);
9094
9095 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
9096 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
9097 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
9098 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
9099
9100 const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
9101
9102 if (!HasFP32Denormals) {
9103 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
9104 // lowering. The chain dependence is insufficient, and we need glue. We do
9105 // not need the glue variants in a strictfp function.
9106
9107 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
9108
9109 SDNode *EnableDenorm;
9110 if (Subtarget->hasDenormModeInst()) {
9111 const SDValue EnableDenormValue =
9112 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
9113
9114 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
9115 DAG.getEntryNode(), EnableDenormValue).getNode();
9116 } else {
9117 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
9118 SL, MVT::i32);
9119 EnableDenorm =
9120 DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
9121 {EnableDenormValue, BitField, DAG.getEntryNode()});
9122 }
9123
9124 SDValue Ops[3] = {
9125 NegDivScale0,
9126 SDValue(EnableDenorm, 0),
9127 SDValue(EnableDenorm, 1)
9128 };
9129
9130 NegDivScale0 = DAG.getMergeValues(Ops, SL);
9131 }
9132
9133 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
9134 ApproxRcp, One, NegDivScale0, Flags);
9135
9136 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
9137 ApproxRcp, Fma0, Flags);
9138
9139 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
9140 Fma1, Fma1, Flags);
9141
9142 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
9143 NumeratorScaled, Mul, Flags);
9144
9145 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
9146 Fma2, Fma1, Mul, Fma2, Flags);
9147
9148 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
9149 NumeratorScaled, Fma3, Flags);
9150
9151 if (!HasFP32Denormals) {
9152 SDNode *DisableDenorm;
9153 if (Subtarget->hasDenormModeInst()) {
9154 const SDValue DisableDenormValue =
9155 getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
9156
9157 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
9158 Fma4.getValue(1), DisableDenormValue,
9159 Fma4.getValue(2)).getNode();
9160 } else {
9161 const SDValue DisableDenormValue =
9162 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
9163
9164 DisableDenorm = DAG.getMachineNode(
9165 AMDGPU::S_SETREG_B32, SL, MVT::Other,
9166 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
9167 }
9168
9169 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
9170 SDValue(DisableDenorm, 0), DAG.getRoot());
9171 DAG.setRoot(OutputChain);
9172 }
9173
9174 SDValue Scale = NumeratorScaled.getValue(1);
9175 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
9176 {Fma4, Fma1, Fma3, Scale}, Flags);
9177
9178 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
9179 }
9180
9181 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
9182 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
9183 return FastLowered;
9184
9185 SDLoc SL(Op);
9186 SDValue X = Op.getOperand(0);
9187 SDValue Y = Op.getOperand(1);
9188
9189 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
9190
9191 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
9192
9193 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
9194
9195 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
9196
9197 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
9198
9199 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
9200
9201 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
9202
9203 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
9204
9205 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
9206
9207 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
9208 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
9209
9210 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
9211 NegDivScale0, Mul, DivScale1);
9212
9213 SDValue Scale;
9214
9215 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
9216 // Workaround a hardware bug on SI where the condition output from div_scale
9217 // is not usable.
9218
9219 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
9220
9221 // Figure out the scale to use for div_fmas.
9222 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
9223 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
9224 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
9225 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
9226
9227 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
9228 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
9229
9230 SDValue Scale0Hi
9231 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
9232 SDValue Scale1Hi
9233 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
9234
9235 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
9236 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
9237 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
9238 } else {
9239 Scale = DivScale1.getValue(1);
9240 }
9241
9242 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
9243 Fma4, Fma3, Mul, Scale);
9244
9245 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
9246 }
9247
9248 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
9249 EVT VT = Op.getValueType();
9250
9251 if (VT == MVT::f32)
9252 return LowerFDIV32(Op, DAG);
9253
9254 if (VT == MVT::f64)
9255 return LowerFDIV64(Op, DAG);
9256
9257 if (VT == MVT::f16)
9258 return LowerFDIV16(Op, DAG);
9259
9260 llvm_unreachable("Unexpected type for fdiv");
9261 }
9262
9263 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
9264 SDLoc DL(Op);
9265 StoreSDNode *Store = cast<StoreSDNode>(Op);
9266 EVT VT = Store->getMemoryVT();
9267
9268 if (VT == MVT::i1) {
9269 return DAG.getTruncStore(Store->getChain(), DL,
9270 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
9271 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
9272 }
9273
9274 assert(VT.isVector() &&
9275 Store->getValue().getValueType().getScalarType() == MVT::i32);
9276
9277 unsigned AS = Store->getAddressSpace();
9278 if (Subtarget->hasLDSMisalignedBug() &&
9279 AS == AMDGPUAS::FLAT_ADDRESS &&
9280 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
9281 return SplitVectorStore(Op, DAG);
9282 }
9283
9284 MachineFunction &MF = DAG.getMachineFunction();
9285 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9286 // If there is a possibility that a flat instruction may access scratch memory,
9287 // then we need to use the same legalization rules we use for private.
9288 if (AS == AMDGPUAS::FLAT_ADDRESS &&
9289 !Subtarget->hasMultiDwordFlatScratchAddressing())
9290 AS = MFI->hasFlatScratchInit() ?
9291 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
9292
9293 unsigned NumElements = VT.getVectorNumElements();
9294 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
9295 AS == AMDGPUAS::FLAT_ADDRESS) {
9296 if (NumElements > 4)
9297 return SplitVectorStore(Op, DAG);
9298 // v3 stores not supported on SI.
9299 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
9300 return SplitVectorStore(Op, DAG);
9301
9302 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
9303 VT, *Store->getMemOperand()))
9304 return expandUnalignedStore(Store, DAG);
9305
9306 return SDValue();
9307 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
9308 switch (Subtarget->getMaxPrivateElementSize()) {
9309 case 4:
9310 return scalarizeVectorStore(Store, DAG);
9311 case 8:
9312 if (NumElements > 2)
9313 return SplitVectorStore(Op, DAG);
9314 return SDValue();
9315 case 16:
9316 if (NumElements > 4 ||
9317 (NumElements == 3 && !Subtarget->enableFlatScratch()))
9318 return SplitVectorStore(Op, DAG);
9319 return SDValue();
9320 default:
9321 llvm_unreachable("unsupported private_element_size");
9322 }
9323 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
9324 unsigned Fast = 0;
9325 auto Flags = Store->getMemOperand()->getFlags();
9326 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
9327 Store->getAlign(), Flags, &Fast) &&
9328 Fast > 1)
9329 return SDValue();
9330
9331 if (VT.isVector())
9332 return SplitVectorStore(Op, DAG);
9333
9334 return expandUnalignedStore(Store, DAG);
9335 }
9336
9337 // Probably an invalid store. If so we'll end up emitting a selection error.
9338 return SDValue();
9339 }
9340
9341 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
9342 SDLoc DL(Op);
9343 EVT VT = Op.getValueType();
9344 SDValue Arg = Op.getOperand(0);
9345 SDValue TrigVal;
9346
9347 // Propagate fast-math flags so that the multiply we introduce can be folded
9348 // if Arg is already the result of a multiply by constant.
9349 auto Flags = Op->getFlags();
9350
9351 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
9352
9353 if (Subtarget->hasTrigReducedRange()) {
9354 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
9355 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
9356 } else {
9357 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
9358 }
9359
9360 switch (Op.getOpcode()) {
9361 case ISD::FCOS:
9362 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
9363 case ISD::FSIN:
9364 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
9365 default:
9366 llvm_unreachable("Wrong trig opcode");
9367 }
9368 }
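// Illustrative sketch (hypothetical helper, not part of this file): the
// hardware sin/cos take their argument in units of full turns, so the input
// is multiplied by 1/(2*pi) and, on subtargets with a reduced argument range,
// wrapped into [0, 1) with FRACT first. Libm-style builtins stand in for
// SIN_HW and FRACT here.
#if 0 // Example only; not compiled.
static float sinLoweringModel(float Arg, bool HasTrigReducedRange) {
  const float OneOver2Pi = 0.15915494f;             // 0.5 * numbers::inv_pi
  float Turns = Arg * OneOver2Pi;
  if (HasTrigReducedRange)
    Turns = Turns - __builtin_floorf(Turns);        // FRACT
  return __builtin_sinf(Turns * 6.2831853f);        // SIN_HW expects turns
}
#endif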
9369
9370 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
9371 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
9372 assert(AtomicNode->isCompareAndSwap());
9373 unsigned AS = AtomicNode->getAddressSpace();
9374
9375 // No custom lowering required for local address space
9376 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
9377 return Op;
9378
9379 // Non-local address space requires custom lowering for atomic compare
9380 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
9381 SDLoc DL(Op);
9382 SDValue ChainIn = Op.getOperand(0);
9383 SDValue Addr = Op.getOperand(1);
9384 SDValue Old = Op.getOperand(2);
9385 SDValue New = Op.getOperand(3);
9386 EVT VT = Op.getValueType();
9387 MVT SimpleVT = VT.getSimpleVT();
9388 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
9389
9390 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
9391 SDValue Ops[] = { ChainIn, Addr, NewOld };
9392
9393 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
9394 Ops, VT, AtomicNode->getMemOperand());
9395 }
9396
9397 //===----------------------------------------------------------------------===//
9398 // Custom DAG optimizations
9399 //===----------------------------------------------------------------------===//
9400
9401 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
9402 DAGCombinerInfo &DCI) const {
9403 EVT VT = N->getValueType(0);
9404 EVT ScalarVT = VT.getScalarType();
9405 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
9406 return SDValue();
9407
9408 SelectionDAG &DAG = DCI.DAG;
9409 SDLoc DL(N);
9410
9411 SDValue Src = N->getOperand(0);
9412 EVT SrcVT = Src.getValueType();
9413
9414 // TODO: We could try to match extracting the higher bytes, which would be
9415 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
9416 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
9417 // about in practice.
9418 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
9419 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
9420 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
9421 DCI.AddToWorklist(Cvt.getNode());
9422
9423 // For the f16 case, fold to a cast to f32 and then cast back to f16.
9424 if (ScalarVT != MVT::f32) {
9425 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
9426 DAG.getTargetConstant(0, DL, MVT::i32));
9427 }
9428 return Cvt;
9429 }
9430 }
9431
9432 return SDValue();
9433 }
9434
9435 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
9436
9437 // This is a variant of
9438 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
9439 //
9440 // The normal DAG combiner will do this, but only if the add has one use since
9441 // that would increase the number of instructions.
9442 //
9443 // This prevents us from seeing a constant offset that can be folded into a
9444 // memory instruction's addressing mode. If we know the resulting add offset of
9445 // a pointer can be folded into an addressing offset, we can replace the pointer
9446 // operand with the add of the new constant offset. This eliminates one of the uses,
9447 // and may allow the remaining use to also be simplified.
9448 //
9449 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
9450 unsigned AddrSpace,
9451 EVT MemVT,
9452 DAGCombinerInfo &DCI) const {
9453 SDValue N0 = N->getOperand(0);
9454 SDValue N1 = N->getOperand(1);
9455
9456 // We only do this to handle cases where it's profitable when there are
9457 // multiple uses of the add, so defer to the standard combine.
9458 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
9459 N0->hasOneUse())
9460 return SDValue();
9461
9462 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
9463 if (!CN1)
9464 return SDValue();
9465
9466 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9467 if (!CAdd)
9468 return SDValue();
9469
9470 // If the resulting offset is too large, we can't fold it into the addressing
9471 // mode offset.
9472 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
9473 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
9474
9475 AddrMode AM;
9476 AM.HasBaseReg = true;
9477 AM.BaseOffs = Offset.getSExtValue();
9478 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
9479 return SDValue();
9480
9481 SelectionDAG &DAG = DCI.DAG;
9482 SDLoc SL(N);
9483 EVT VT = N->getValueType(0);
9484
9485 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
9486 SDValue COffset = DAG.getConstant(Offset, SL, VT);
9487
9488 SDNodeFlags Flags;
9489 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
9490 (N0.getOpcode() == ISD::OR ||
9491 N0->getFlags().hasNoUnsignedWrap()));
9492
9493 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
9494 }
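// Illustrative sketch (hypothetical helper, not part of this file): the
// combine above relies on the identity (x + c1) << c2 == (x << c2) + (c1 << c2)
// in modular arithmetic, which moves the constant to where it can become an
// addressing-mode offset.
#if 0 // Example only; not compiled.
static unsigned long long shlOfAddModel(unsigned long long X,
                                        unsigned long long C1, unsigned C2) {
  // (X + C1) << C2 is rebuilt as (X << C2) + (C1 << C2); the shifted constant
  // can then be folded into the memory instruction's immediate offset.
  return (X << C2) + (C1 << C2);
}
#endif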
9495
9496 /// MemSDNode::getBasePtr() does not work for intrinsics, whose base pointer
9497 /// operand needs to be offset past the chain and intrinsic ID. Theoretically
9498 /// we would also need to check the specific intrinsic, but they all place the pointer operand first.
9499 static unsigned getBasePtrIndex(const MemSDNode *N) {
9500 switch (N->getOpcode()) {
9501 case ISD::STORE:
9502 case ISD::INTRINSIC_W_CHAIN:
9503 case ISD::INTRINSIC_VOID:
9504 return 2;
9505 default:
9506 return 1;
9507 }
9508 }
9509
9510 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
9511 DAGCombinerInfo &DCI) const {
9512 SelectionDAG &DAG = DCI.DAG;
9513 SDLoc SL(N);
9514
9515 unsigned PtrIdx = getBasePtrIndex(N);
9516 SDValue Ptr = N->getOperand(PtrIdx);
9517
9518 // TODO: We could also do this for multiplies.
9519 if (Ptr.getOpcode() == ISD::SHL) {
9520 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
9521 N->getMemoryVT(), DCI);
9522 if (NewPtr) {
9523 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
9524
9525 NewOps[PtrIdx] = NewPtr;
9526 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
9527 }
9528 }
9529
9530 return SDValue();
9531 }
9532
9533 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
9534 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
9535 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
9536 (Opc == ISD::XOR && Val == 0);
9537 }
9538
9539 // Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor. This
9540 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
9541 // integer combine opportunities since most 64-bit operations are decomposed
9542 // this way. TODO: We won't want this for SALU especially if it is an inline
9543 // immediate.
9544 SDValue SITargetLowering::splitBinaryBitConstantOp(
9545 DAGCombinerInfo &DCI,
9546 const SDLoc &SL,
9547 unsigned Opc, SDValue LHS,
9548 const ConstantSDNode *CRHS) const {
9549 uint64_t Val = CRHS->getZExtValue();
9550 uint32_t ValLo = Lo_32(Val);
9551 uint32_t ValHi = Hi_32(Val);
9552 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9553
9554 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
9555 bitOpWithConstantIsReducible(Opc, ValHi)) ||
9556 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
9557 // If we need to materialize a 64-bit immediate, it will be split up later
9558 // anyway. Avoid creating the harder to understand 64-bit immediate
9559 // materialization.
9560 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
9561 }
9562
9563 return SDValue();
9564 }
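// Illustrative sketch (hypothetical helper, not part of this file): a 64-bit
// bitwise operation with a constant is equivalent to two independent 32-bit
// operations on the low and high halves, which is the form
// splitBinaryBitConstantOpImpl produces. Shown here for AND.
#if 0 // Example only; not compiled.
static unsigned long long splitAnd64Model(unsigned long long A,
                                          unsigned long long C) {
  unsigned Lo = (unsigned)A & (unsigned)C;                  // low 32-bit op
  unsigned Hi = (unsigned)(A >> 32) & (unsigned)(C >> 32);  // high 32-bit op
  return ((unsigned long long)Hi << 32) | Lo;
}
#endif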
9565
9566 // Returns true if the argument is a boolean value which is not serialized into
9567 // memory or an argument and does not require v_cndmask_b32 to be deserialized.
9568 static bool isBoolSGPR(SDValue V) {
9569 if (V.getValueType() != MVT::i1)
9570 return false;
9571 switch (V.getOpcode()) {
9572 default:
9573 break;
9574 case ISD::SETCC:
9575 case AMDGPUISD::FP_CLASS:
9576 return true;
9577 case ISD::AND:
9578 case ISD::OR:
9579 case ISD::XOR:
9580 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
9581 }
9582 return false;
9583 }
9584
9585 // If a constant has all zeroes or all ones within each byte return it.
9586 // Otherwise return 0.
9587 static uint32_t getConstantPermuteMask(uint32_t C) {
9588 // 0xff for any zero byte in the mask
9589 uint32_t ZeroByteMask = 0;
9590 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
9591 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
9592 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
9593 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
9594 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
9595 if ((NonZeroByteMask & C) != NonZeroByteMask)
9596 return 0; // Partial bytes selected.
9597 return C;
9598 }
9599
9600 // Check if a node selects whole bytes from its operand 0 starting at a byte
9601 // boundary while masking the rest. Returns the select mask as used by
9602 // v_perm_b32, or all-ones (~0) if the node does not match.
9603 // Note byte select encoding:
9604 // value 0-3 selects corresponding source byte;
9605 // value 0xc selects zero;
9606 // value 0xff selects 0xff.
9607 static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
9608 assert(V.getValueSizeInBits() == 32);
9609
9610 if (V.getNumOperands() != 2)
9611 return ~0;
9612
9613 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
9614 if (!N1)
9615 return ~0;
9616
9617 uint32_t C = N1->getZExtValue();
9618
9619 switch (V.getOpcode()) {
9620 default:
9621 break;
9622 case ISD::AND:
9623 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
9624 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
9625 }
9626 break;
9627
9628 case ISD::OR:
9629 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
9630 return (0x03020100 & ~ConstMask) | ConstMask;
9631 }
9632 break;
9633
9634 case ISD::SHL:
9635 if (C % 8)
9636 return ~0;
9637
9638 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
9639
9640 case ISD::SRL:
9641 if (C % 8)
9642 return ~0;
9643
9644 return uint32_t(0x0c0c0c0c03020100ull >> C);
9645 }
9646
9647 return ~0;
9648 }
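// Illustrative software model of the v_perm_b32 byte-select encoding used by
// the mask helpers above (hypothetical helper, not part of this file).
// Selector bytes 0-3 pick a byte of Src1, 4-7 pick a byte of Src0, 0x0c
// produces 0x00 and 0xff produces 0xff; other selector values are not used by
// these combines and are treated as zero here.
#if 0 // Example only; not compiled.
static unsigned permB32Model(unsigned Src0, unsigned Src1, unsigned Sel) {
  unsigned long long Data = ((unsigned long long)Src0 << 32) | Src1;
  unsigned Result = 0;
  for (unsigned I = 0; I != 4; ++I) {
    unsigned S = (Sel >> (8 * I)) & 0xff;
    unsigned Byte;
    if (S <= 7)
      Byte = (unsigned)(Data >> (8 * S)) & 0xff; // byte S of {Src0, Src1}
    else if (S == 0xff)
      Byte = 0xff;                               // 0xff selects 0xff
    else
      Byte = 0x00;                               // 0x0c (and the rest) select zero
    Result |= Byte << (8 * I);
  }
  return Result;
}
#endif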
9649
9650 SDValue SITargetLowering::performAndCombine(SDNode *N,
9651 DAGCombinerInfo &DCI) const {
9652 if (DCI.isBeforeLegalize())
9653 return SDValue();
9654
9655 SelectionDAG &DAG = DCI.DAG;
9656 EVT VT = N->getValueType(0);
9657 SDValue LHS = N->getOperand(0);
9658 SDValue RHS = N->getOperand(1);
9659
9660
9661 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
9662 if (VT == MVT::i64 && CRHS) {
9663 if (SDValue Split
9664 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
9665 return Split;
9666 }
9667
9668 if (CRHS && VT == MVT::i32) {
9669 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
9670 // nb = number of trailing zeroes in mask
9671 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
9672 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
9673 uint64_t Mask = CRHS->getZExtValue();
9674 unsigned Bits = llvm::popcount(Mask);
9675 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
9676 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
9677 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
9678 unsigned Shift = CShift->getZExtValue();
9679 unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
9680 unsigned Offset = NB + Shift;
9681 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
9682 SDLoc SL(N);
9683 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
9684 LHS->getOperand(0),
9685 DAG.getConstant(Offset, SL, MVT::i32),
9686 DAG.getConstant(Bits, SL, MVT::i32));
9687 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
9688 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
9689 DAG.getValueType(NarrowVT));
9690 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
9691 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
9692 return Shl;
9693 }
9694 }
9695 }
9696
9697 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
9698 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
9699 isa<ConstantSDNode>(LHS.getOperand(2))) {
9700 uint32_t Sel = getConstantPermuteMask(Mask);
9701 if (!Sel)
9702 return SDValue();
9703
9704 // Select 0xc for all zero bytes
9705 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
9706 SDLoc DL(N);
9707 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
9708 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
9709 }
9710 }
9711
9712 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
9713 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
9714 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
9715 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
9716 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
9717
9718 SDValue X = LHS.getOperand(0);
9719 SDValue Y = RHS.getOperand(0);
9720 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
9721 !isTypeLegal(X.getValueType()))
9722 return SDValue();
9723
9724 if (LCC == ISD::SETO) {
9725 if (X != LHS.getOperand(1))
9726 return SDValue();
9727
9728 if (RCC == ISD::SETUNE) {
9729 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
9730 if (!C1 || !C1->isInfinity() || C1->isNegative())
9731 return SDValue();
9732
9733 const uint32_t Mask = SIInstrFlags::N_NORMAL |
9734 SIInstrFlags::N_SUBNORMAL |
9735 SIInstrFlags::N_ZERO |
9736 SIInstrFlags::P_ZERO |
9737 SIInstrFlags::P_SUBNORMAL |
9738 SIInstrFlags::P_NORMAL;
9739
9740 static_assert(((~(SIInstrFlags::S_NAN |
9741 SIInstrFlags::Q_NAN |
9742 SIInstrFlags::N_INFINITY |
9743 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
9744 "mask not equal");
9745
9746 SDLoc DL(N);
9747 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
9748 X, DAG.getConstant(Mask, DL, MVT::i32));
9749 }
9750 }
9751 }
9752
9753 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
9754 std::swap(LHS, RHS);
9755
9756 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
9757 RHS.hasOneUse()) {
9758 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
9759 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
9760 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
9761 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
9762 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
9763 (RHS.getOperand(0) == LHS.getOperand(0) &&
9764 LHS.getOperand(0) == LHS.getOperand(1))) {
9765 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
9766 unsigned NewMask = LCC == ISD::SETO ?
9767 Mask->getZExtValue() & ~OrdMask :
9768 Mask->getZExtValue() & OrdMask;
9769
9770 SDLoc DL(N);
9771 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
9772 DAG.getConstant(NewMask, DL, MVT::i32));
9773 }
9774 }
9775
9776 if (VT == MVT::i32 &&
9777 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
9778 // and x, (sext cc from i1) => select cc, x, 0
9779 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
9780 std::swap(LHS, RHS);
9781 if (isBoolSGPR(RHS.getOperand(0)))
9782 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
9783 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
9784 }
9785
9786 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
9787 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9788 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
9789 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
9790 uint32_t LHSMask = getPermuteMask(DAG, LHS);
9791 uint32_t RHSMask = getPermuteMask(DAG, RHS);
9792 if (LHSMask != ~0u && RHSMask != ~0u) {
9793 // Canonicalize the expression in an attempt to have fewer unique masks
9794 // and therefore fewer registers used to hold the masks.
9795 if (LHSMask > RHSMask) {
9796 std::swap(LHSMask, RHSMask);
9797 std::swap(LHS, RHS);
9798 }
9799
9800 // Compute a 0xc marker for each lane actually used from a source operand: in
9801 // the mask, zero bytes have 0xc set, 0xff bytes are 0xff, and actual lanes are in the 0-3 range.
9802 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
9803 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
9804
9805 // Check if we need to combine values from two sources within a byte.
9806 if (!(LHSUsedLanes & RHSUsedLanes) &&
9807 // If we select the high word from one source and the low word from the other, keep it for SDWA.
9808 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
9809 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
9810 // Each byte in each mask is either a selector in the 0-3 range, or has higher
9811 // bits set: 0xff for a 0xff byte or 0x0c for a zero byte. If 0x0c appears in
9812 // either mask the result byte must be 0x0c; otherwise the mask byte that is
9813 // not 0xff wins. ANDing both masks gives a correct result, except that a
9814 // 0x0c byte must then be corrected to be exactly 0x0c.
9815 uint32_t Mask = LHSMask & RHSMask;
9816 for (unsigned I = 0; I < 32; I += 8) {
9817 uint32_t ByteSel = 0xff << I;
9818 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
9819 Mask &= (0x0c << I) & 0xffffffff;
9820 }
9821
9822 // Add 4 to each active LHS lane. It will not affect any existing 0xff
9823 // or 0x0c.
9824 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
9825 SDLoc DL(N);
9826
9827 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
9828 LHS.getOperand(0), RHS.getOperand(0),
9829 DAG.getConstant(Sel, DL, MVT::i32));
9830 }
9831 }
9832 }
9833
9834 return SDValue();
9835 }
9836
9837 SDValue SITargetLowering::performOrCombine(SDNode *N,
9838 DAGCombinerInfo &DCI) const {
9839 SelectionDAG &DAG = DCI.DAG;
9840 SDValue LHS = N->getOperand(0);
9841 SDValue RHS = N->getOperand(1);
9842
9843 EVT VT = N->getValueType(0);
9844 if (VT == MVT::i1) {
9845 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
9846 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
9847 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
9848 SDValue Src = LHS.getOperand(0);
9849 if (Src != RHS.getOperand(0))
9850 return SDValue();
9851
9852 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
9853 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
9854 if (!CLHS || !CRHS)
9855 return SDValue();
9856
9857 // Only 10 bits are used.
9858 static const uint32_t MaxMask = 0x3ff;
9859
9860 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
9861 SDLoc DL(N);
9862 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
9863 Src, DAG.getConstant(NewMask, DL, MVT::i32));
9864 }
9865
9866 return SDValue();
9867 }
9868
9869 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
9870 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
9871 LHS.getOpcode() == AMDGPUISD::PERM &&
9872 isa<ConstantSDNode>(LHS.getOperand(2))) {
9873 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
9874 if (!Sel)
9875 return SDValue();
9876
9877 Sel |= LHS.getConstantOperandVal(2);
9878 SDLoc DL(N);
9879 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
9880 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
9881 }
9882
9883 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
9884 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9885 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
9886 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
9887 uint32_t LHSMask = getPermuteMask(DAG, LHS);
9888 uint32_t RHSMask = getPermuteMask(DAG, RHS);
9889 if (LHSMask != ~0u && RHSMask != ~0u) {
9890 // Canonicalize the expression in an attempt to have fewer unique masks
9891 // and therefore fewer registers used to hold the masks.
9892 if (LHSMask > RHSMask) {
9893 std::swap(LHSMask, RHSMask);
9894 std::swap(LHS, RHS);
9895 }
9896
9897 // Compute a 0xc marker for each lane actually used from a source operand: in
9898 // the mask, zero bytes have 0xc set, 0xff bytes are 0xff, and actual lanes are in the 0-3 range.
9899 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
9900 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
9901
9902 // Check if we need to combine values from two sources within a byte.
9903 if (!(LHSUsedLanes & RHSUsedLanes) &&
9904 // If we select the high word from one source and the low word from the other, keep it for SDWA.
9905 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
9906 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
9907 // Kill zero bytes selected by other mask. Zero value is 0xc.
9908 LHSMask &= ~RHSUsedLanes;
9909 RHSMask &= ~LHSUsedLanes;
9910 // Add 4 to each active LHS lane
9911 LHSMask |= LHSUsedLanes & 0x04040404;
9912 // Combine masks
9913 uint32_t Sel = LHSMask | RHSMask;
9914 SDLoc DL(N);
9915
9916 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
9917 LHS.getOperand(0), RHS.getOperand(0),
9918 DAG.getConstant(Sel, DL, MVT::i32));
9919 }
9920 }
9921 }
9922
9923 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
9924 return SDValue();
9925
9926 // TODO: This could be a generic combine with a predicate for extracting the
9927 // high half of an integer being free.
9928
9929 // (or i64:x, (zero_extend i32:y)) ->
9930 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
9931 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
9932 RHS.getOpcode() != ISD::ZERO_EXTEND)
9933 std::swap(LHS, RHS);
9934
9935 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
9936 SDValue ExtSrc = RHS.getOperand(0);
9937 EVT SrcVT = ExtSrc.getValueType();
9938 if (SrcVT == MVT::i32) {
9939 SDLoc SL(N);
9940 SDValue LowLHS, HiBits;
9941 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
9942 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
9943
9944 DCI.AddToWorklist(LowOr.getNode());
9945 DCI.AddToWorklist(HiBits.getNode());
9946
9947 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
9948 LowOr, HiBits);
9949 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
9950 }
9951 }
9952
9953 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
9954 if (CRHS) {
9955 if (SDValue Split
9956 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
9957 N->getOperand(0), CRHS))
9958 return Split;
9959 }
9960
9961 return SDValue();
9962 }
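// Illustrative sketch (hypothetical helper, not part of this file): OR-ing a
// 64-bit value with a zero-extended 32-bit value only affects the low half,
// so the combine above rebuilds it as (or lo(x), y) in the low lane with
// hi(x) passed through unchanged in the high lane.
#if 0 // Example only; not compiled.
static unsigned long long orZext32Model(unsigned long long X, unsigned Y) {
  unsigned Lo = (unsigned)X | Y;      // low half gets the OR
  unsigned Hi = (unsigned)(X >> 32);  // high half is unchanged
  return ((unsigned long long)Hi << 32) | Lo;
}
#endif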
9963
9964 SDValue SITargetLowering::performXorCombine(SDNode *N,
9965 DAGCombinerInfo &DCI) const {
9966 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
9967 return RV;
9968
9969 EVT VT = N->getValueType(0);
9970 if (VT != MVT::i64)
9971 return SDValue();
9972
9973 SDValue LHS = N->getOperand(0);
9974 SDValue RHS = N->getOperand(1);
9975
9976 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
9977 if (CRHS) {
9978 if (SDValue Split
9979 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
9980 return Split;
9981 }
9982
9983 return SDValue();
9984 }
9985
9986 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
9987 DAGCombinerInfo &DCI) const {
9988 if (!Subtarget->has16BitInsts() ||
9989 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
9990 return SDValue();
9991
9992 EVT VT = N->getValueType(0);
9993 if (VT != MVT::i32)
9994 return SDValue();
9995
9996 SDValue Src = N->getOperand(0);
9997 if (Src.getValueType() != MVT::i16)
9998 return SDValue();
9999
10000 return SDValue();
10001 }
10002
10003 SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
10004 DAGCombinerInfo &DCI)
10005 const {
10006 SDValue Src = N->getOperand(0);
10007 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
10008
10009 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
10010 VTSign->getVT() == MVT::i8) ||
10011 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
10012 VTSign->getVT() == MVT::i16)) &&
10013 Src.hasOneUse()) {
10014 auto *M = cast<MemSDNode>(Src);
10015 SDValue Ops[] = {
10016 Src.getOperand(0), // Chain
10017 Src.getOperand(1), // rsrc
10018 Src.getOperand(2), // vindex
10019 Src.getOperand(3), // voffset
10020 Src.getOperand(4), // soffset
10021 Src.getOperand(5), // offset
10022 Src.getOperand(6),
10023 Src.getOperand(7)
10024 };
10025 // replace with BUFFER_LOAD_BYTE/SHORT
10026 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
10027 Src.getOperand(0).getValueType());
10028 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
10029 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
10030 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
10031 ResList,
10032 Ops, M->getMemoryVT(),
10033 M->getMemOperand());
10034 return DCI.DAG.getMergeValues({BufferLoadSignExt,
10035 BufferLoadSignExt.getValue(1)}, SDLoc(N));
10036 }
10037 return SDValue();
10038 }
10039
10040 SDValue SITargetLowering::performClassCombine(SDNode *N,
10041 DAGCombinerInfo &DCI) const {
10042 SelectionDAG &DAG = DCI.DAG;
10043 SDValue Mask = N->getOperand(1);
10044
10045 // fp_class x, 0 -> false
10046 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
10047 if (CMask->isZero())
10048 return DAG.getConstant(0, SDLoc(N), MVT::i1);
10049 }
10050
10051 if (N->getOperand(0).isUndef())
10052 return DAG.getUNDEF(MVT::i1);
10053
10054 return SDValue();
10055 }
10056
10057 SDValue SITargetLowering::performRcpCombine(SDNode *N,
10058 DAGCombinerInfo &DCI) const {
10059 EVT VT = N->getValueType(0);
10060 SDValue N0 = N->getOperand(0);
10061
10062 if (N0.isUndef()) {
10063 return DCI.DAG.getConstantFP(
10064 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
10065 VT);
10066 }
10067
10068 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
10069 N0.getOpcode() == ISD::SINT_TO_FP)) {
10070 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
10071 N->getFlags());
10072 }
10073
10074 if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
10075 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
10076 N0.getOperand(0), N->getFlags());
10077 }
10078
10079 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
10080 }
10081
10082 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
10083 unsigned MaxDepth) const {
10084 unsigned Opcode = Op.getOpcode();
10085 if (Opcode == ISD::FCANONICALIZE)
10086 return true;
10087
10088 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
10089 auto F = CFP->getValueAPF();
10090 if (F.isNaN() && F.isSignaling())
10091 return false;
10092 return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
10093 }
10094
10095 // If the source is the result of another standard FP operation it is already in
10096 // canonical form.
10097 if (MaxDepth == 0)
10098 return false;
10099
10100 switch (Opcode) {
10101 // These will flush denorms if required.
10102 case ISD::FADD:
10103 case ISD::FSUB:
10104 case ISD::FMUL:
10105 case ISD::FCEIL:
10106 case ISD::FFLOOR:
10107 case ISD::FMA:
10108 case ISD::FMAD:
10109 case ISD::FSQRT:
10110 case ISD::FDIV:
10111 case ISD::FREM:
10112 case ISD::FP_ROUND:
10113 case ISD::FP_EXTEND:
10114 case AMDGPUISD::FMUL_LEGACY:
10115 case AMDGPUISD::FMAD_FTZ:
10116 case AMDGPUISD::RCP:
10117 case AMDGPUISD::RSQ:
10118 case AMDGPUISD::RSQ_CLAMP:
10119 case AMDGPUISD::RCP_LEGACY:
10120 case AMDGPUISD::RCP_IFLAG:
10121 case AMDGPUISD::DIV_SCALE:
10122 case AMDGPUISD::DIV_FMAS:
10123 case AMDGPUISD::DIV_FIXUP:
10124 case AMDGPUISD::FRACT:
10125 case AMDGPUISD::LDEXP:
10126 case AMDGPUISD::CVT_PKRTZ_F16_F32:
10127 case AMDGPUISD::CVT_F32_UBYTE0:
10128 case AMDGPUISD::CVT_F32_UBYTE1:
10129 case AMDGPUISD::CVT_F32_UBYTE2:
10130 case AMDGPUISD::CVT_F32_UBYTE3:
10131 return true;
10132
10133 // These can/will be lowered or combined as bit operations, so their
10134 // inputs need to be checked recursively.
10135 case ISD::FNEG:
10136 case ISD::FABS:
10137 case ISD::FCOPYSIGN:
10138 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
10139
10140 case ISD::FSIN:
10141 case ISD::FCOS:
10142 case ISD::FSINCOS:
10143 return Op.getValueType().getScalarType() != MVT::f16;
10144
10145 case ISD::FMINNUM:
10146 case ISD::FMAXNUM:
10147 case ISD::FMINNUM_IEEE:
10148 case ISD::FMAXNUM_IEEE:
10149 case AMDGPUISD::CLAMP:
10150 case AMDGPUISD::FMED3:
10151 case AMDGPUISD::FMAX3:
10152 case AMDGPUISD::FMIN3: {
10153 // FIXME: Shouldn't treat the generic operations differently based on these.
10154 // However, we aren't really required to flush the result from
10155 // minnum/maxnum.
10156
10157 // snans will be quieted, so we only need to worry about denormals.
10158 if (Subtarget->supportsMinMaxDenormModes() ||
10159 denormalsEnabledForType(DAG, Op.getValueType()))
10160 return true;
10161
10162 // Flushing may be required.
10163 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
10164 // targets we need to check their inputs recursively.
10165
10166 // FIXME: Does this apply with clamp? It's implemented with max.
10167 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
10168 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
10169 return false;
10170 }
10171
10172 return true;
10173 }
10174 case ISD::SELECT: {
10175 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
10176 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
10177 }
10178 case ISD::BUILD_VECTOR: {
10179 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
10180 SDValue SrcOp = Op.getOperand(i);
10181 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
10182 return false;
10183 }
10184
10185 return true;
10186 }
10187 case ISD::EXTRACT_VECTOR_ELT:
10188 case ISD::EXTRACT_SUBVECTOR: {
10189 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
10190 }
10191 case ISD::INSERT_VECTOR_ELT: {
10192 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
10193 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
10194 }
10195 case ISD::UNDEF:
10196 // Could be anything.
10197 return false;
10198
10199 case ISD::BITCAST:
10200 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
10201 case ISD::TRUNCATE: {
10202 // Hack around the mess we make when legalizing extract_vector_elt.
10203 if (Op.getValueType() == MVT::i16) {
10204 SDValue TruncSrc = Op.getOperand(0);
10205 if (TruncSrc.getValueType() == MVT::i32 &&
10206 TruncSrc.getOpcode() == ISD::BITCAST &&
10207 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
10208 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
10209 }
10210 }
10211 return false;
10212 }
10213 case ISD::INTRINSIC_WO_CHAIN: {
10214 unsigned IntrinsicID
10215 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10216 // TODO: Handle more intrinsics
10217 switch (IntrinsicID) {
10218 case Intrinsic::amdgcn_cvt_pkrtz:
10219 case Intrinsic::amdgcn_cubeid:
10220 case Intrinsic::amdgcn_frexp_mant:
10221 case Intrinsic::amdgcn_fdot2:
10222 case Intrinsic::amdgcn_rcp:
10223 case Intrinsic::amdgcn_rsq:
10224 case Intrinsic::amdgcn_rsq_clamp:
10225 case Intrinsic::amdgcn_rcp_legacy:
10226 case Intrinsic::amdgcn_rsq_legacy:
10227 case Intrinsic::amdgcn_trig_preop:
10228 return true;
10229 default:
10230 break;
10231 }
10232
10233 [[fallthrough]];
10234 }
10235 default:
10236 return denormalsEnabledForType(DAG, Op.getValueType()) &&
10237 DAG.isKnownNeverSNaN(Op);
10238 }
10239
10240 llvm_unreachable("invalid operation");
10241 }
10242
10243 bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
10244 unsigned MaxDepth) const {
10245 MachineRegisterInfo &MRI = MF.getRegInfo();
10246 MachineInstr *MI = MRI.getVRegDef(Reg);
10247 unsigned Opcode = MI->getOpcode();
10248
10249 if (Opcode == AMDGPU::G_FCANONICALIZE)
10250 return true;
10251
10252 std::optional<FPValueAndVReg> FCR;
10253 // Constant splat (can be padded with undef) or scalar constant.
10254 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
10255 if (FCR->Value.isSignaling())
10256 return false;
10257 return !FCR->Value.isDenormal() ||
10258 denormalsEnabledForType(MRI.getType(FCR->VReg), MF);
10259 }
10260
10261 if (MaxDepth == 0)
10262 return false;
10263
10264 switch (Opcode) {
10265 case AMDGPU::G_FADD:
10266 case AMDGPU::G_FSUB:
10267 case AMDGPU::G_FMUL:
10268 case AMDGPU::G_FCEIL:
10269 case AMDGPU::G_FFLOOR:
10270 case AMDGPU::G_FRINT:
10271 case AMDGPU::G_FNEARBYINT:
10272 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
10273 case AMDGPU::G_INTRINSIC_TRUNC:
10274 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
10275 case AMDGPU::G_FMA:
10276 case AMDGPU::G_FMAD:
10277 case AMDGPU::G_FSQRT:
10278 case AMDGPU::G_FDIV:
10279 case AMDGPU::G_FREM:
10280 case AMDGPU::G_FPOW:
10281 case AMDGPU::G_FPEXT:
10282 case AMDGPU::G_FLOG:
10283 case AMDGPU::G_FLOG2:
10284 case AMDGPU::G_FLOG10:
10285 case AMDGPU::G_FPTRUNC:
10286 case AMDGPU::G_AMDGPU_RCP_IFLAG:
10287 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
10288 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
10289 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
10290 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
10291 return true;
10292 case AMDGPU::G_FNEG:
10293 case AMDGPU::G_FABS:
10294 case AMDGPU::G_FCOPYSIGN:
10295 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
10296 case AMDGPU::G_FMINNUM:
10297 case AMDGPU::G_FMAXNUM:
10298 case AMDGPU::G_FMINNUM_IEEE:
10299 case AMDGPU::G_FMAXNUM_IEEE: {
10300 if (Subtarget->supportsMinMaxDenormModes() ||
10301 denormalsEnabledForType(MRI.getType(Reg), MF))
10302 return true;
10303
10304 [[fallthrough]];
10305 }
10306 case AMDGPU::G_BUILD_VECTOR:
10307 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
10308 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
10309 return false;
10310 return true;
10311 case AMDGPU::G_INTRINSIC:
10312 switch (MI->getIntrinsicID()) {
10313 case Intrinsic::amdgcn_fmul_legacy:
10314 case Intrinsic::amdgcn_fmad_ftz:
10315 case Intrinsic::amdgcn_sqrt:
10316 case Intrinsic::amdgcn_fmed3:
10317 case Intrinsic::amdgcn_sin:
10318 case Intrinsic::amdgcn_cos:
10319 case Intrinsic::amdgcn_log_clamp:
10320 case Intrinsic::amdgcn_rcp:
10321 case Intrinsic::amdgcn_rcp_legacy:
10322 case Intrinsic::amdgcn_rsq:
10323 case Intrinsic::amdgcn_rsq_clamp:
10324 case Intrinsic::amdgcn_rsq_legacy:
10325 case Intrinsic::amdgcn_div_scale:
10326 case Intrinsic::amdgcn_div_fmas:
10327 case Intrinsic::amdgcn_div_fixup:
10328 case Intrinsic::amdgcn_fract:
10329 case Intrinsic::amdgcn_ldexp:
10330 case Intrinsic::amdgcn_cvt_pkrtz:
10331 case Intrinsic::amdgcn_cubeid:
10332 case Intrinsic::amdgcn_cubema:
10333 case Intrinsic::amdgcn_cubesc:
10334 case Intrinsic::amdgcn_cubetc:
10335 case Intrinsic::amdgcn_frexp_mant:
10336 case Intrinsic::amdgcn_fdot2:
10337 case Intrinsic::amdgcn_trig_preop:
10338 return true;
10339 default:
10340 break;
10341 }
10342
10343 [[fallthrough]];
10344 default:
10345 return false;
10346 }
10347
10348 llvm_unreachable("invalid operation");
10349 }
10350
10351 // Constant fold canonicalize.
10352 SDValue SITargetLowering::getCanonicalConstantFP(
10353 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
10354 // Flush denormals to 0 if not enabled.
10355 if (C.isDenormal() && !denormalsEnabledForType(DAG, VT)) {
10356 return DAG.getConstantFP(APFloat::getZero(C.getSemantics(),
10357 C.isNegative()), SL, VT);
10358 }
10359
10360 if (C.isNaN()) {
10361 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
10362 if (C.isSignaling()) {
10363 // Quiet a signaling NaN.
10364 // FIXME: Is this supposed to preserve payload bits?
10365 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
10366 }
10367
10368 // Make sure it is the canonical NaN bitpattern.
10369 //
10370 // TODO: Can we use -1 as the canonical NaN value since it's an inline
10371 // immediate?
10372 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
10373 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
10374 }
10375
10376 // Already canonical.
10377 return DAG.getConstantFP(C, SL, VT);
10378 }
10379
10380 static bool vectorEltWillFoldAway(SDValue Op) {
10381 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
10382 }
10383
10384 SDValue SITargetLowering::performFCanonicalizeCombine(
10385 SDNode *N,
10386 DAGCombinerInfo &DCI) const {
10387 SelectionDAG &DAG = DCI.DAG;
10388 SDValue N0 = N->getOperand(0);
10389 EVT VT = N->getValueType(0);
10390
10391 // fcanonicalize undef -> qnan
10392 if (N0.isUndef()) {
10393 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
10394 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
10395 }
10396
10397 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
10398 EVT VT = N->getValueType(0);
10399 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
10400 }
10401
10402 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
10403 // (fcanonicalize k)
10404 //
10405 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
10406
10407 // TODO: This could be better with wider vectors that will be split to v2f16,
10408 // and to consider uses since there aren't that many packed operations.
10409 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
10410 isTypeLegal(MVT::v2f16)) {
10411 SDLoc SL(N);
10412 SDValue NewElts[2];
10413 SDValue Lo = N0.getOperand(0);
10414 SDValue Hi = N0.getOperand(1);
10415 EVT EltVT = Lo.getValueType();
10416
10417 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
10418 for (unsigned I = 0; I != 2; ++I) {
10419 SDValue Op = N0.getOperand(I);
10420 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
10421 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
10422 CFP->getValueAPF());
10423 } else if (Op.isUndef()) {
10424 // Handled below based on what the other operand is.
10425 NewElts[I] = Op;
10426 } else {
10427 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
10428 }
10429 }
10430
10431 // If one half is undef, and one is constant, prefer a splat vector rather
10432 // than the normal qNaN. If it's a register, prefer 0.0 since that's
10433 // cheaper to use and may be free with a packed operation.
10434 if (NewElts[0].isUndef()) {
10435 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
10436 ? NewElts[1]
10437 : DAG.getConstantFP(0.0f, SL, EltVT);
10438 }
10439
10440 if (NewElts[1].isUndef()) {
10441 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
10442 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
10443 }
10444
10445 return DAG.getBuildVector(VT, SL, NewElts);
10446 }
10447 }
10448
10449 unsigned SrcOpc = N0.getOpcode();
10450
10451 // If it's free to do so, push canonicalizes further up the source, which may
10452 // find a canonical source.
10453 //
10454 // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
10455 // sNaNs.
10456 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
10457 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
10458 if (CRHS && N0.hasOneUse()) {
10459 SDLoc SL(N);
10460 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
10461 N0.getOperand(0));
10462 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
10463 DCI.AddToWorklist(Canon0.getNode());
10464
10465 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
10466 }
10467 }
10468
10469 return isCanonicalized(DAG, N0) ? N0 : SDValue();
10470 }
10471
10472 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
10473 switch (Opc) {
10474 case ISD::FMAXNUM:
10475 case ISD::FMAXNUM_IEEE:
10476 return AMDGPUISD::FMAX3;
10477 case ISD::SMAX:
10478 return AMDGPUISD::SMAX3;
10479 case ISD::UMAX:
10480 return AMDGPUISD::UMAX3;
10481 case ISD::FMINNUM:
10482 case ISD::FMINNUM_IEEE:
10483 return AMDGPUISD::FMIN3;
10484 case ISD::SMIN:
10485 return AMDGPUISD::SMIN3;
10486 case ISD::UMIN:
10487 return AMDGPUISD::UMIN3;
10488 default:
10489 llvm_unreachable("Not a min/max opcode");
10490 }
10491 }
10492
10493 SDValue SITargetLowering::performIntMed3ImmCombine(
10494 SelectionDAG &DAG, const SDLoc &SL,
10495 SDValue Op0, SDValue Op1, bool Signed) const {
10496 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
10497 if (!K1)
10498 return SDValue();
10499
10500 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
10501 if (!K0)
10502 return SDValue();
10503
10504 if (Signed) {
10505 if (K0->getAPIntValue().sge(K1->getAPIntValue()))
10506 return SDValue();
10507 } else {
10508 if (K0->getAPIntValue().uge(K1->getAPIntValue()))
10509 return SDValue();
10510 }
10511
10512 EVT VT = K0->getValueType(0);
10513 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
10514 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
10515 return DAG.getNode(Med3Opc, SL, VT,
10516 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
10517 }
10518
10519 // If there isn't a 16-bit med3 operation, convert to 32-bit.
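// e.g. an i16 med3 on such targets is widened: the three operands are
// sign- or zero-extended to i32, the i32 med3 is emitted, and the result is
// truncated back to i16.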
10520 if (VT == MVT::i16) {
10521 MVT NVT = MVT::i32;
10522 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
10523
10524 SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
10525 SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
10526 SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
10527
10528 SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
10529 return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
10530 }
10531
10532 return SDValue();
10533 }
10534
10535 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
10536 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
10537 return C;
10538
10539 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
10540 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
10541 return C;
10542 }
10543
10544 return nullptr;
10545 }
10546
10547 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
10548 const SDLoc &SL,
10549 SDValue Op0,
10550 SDValue Op1) const {
10551 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
10552 if (!K1)
10553 return SDValue();
10554
10555 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
10556 if (!K0)
10557 return SDValue();
10558
10559 // Ordered >= (although NaN inputs should have folded away by now).
10560 if (K0->getValueAPF() > K1->getValueAPF())
10561 return SDValue();
10562
10563 const MachineFunction &MF = DAG.getMachineFunction();
10564 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10565
10566 // TODO: Check IEEE bit enabled?
10567 EVT VT = Op0.getValueType();
10568 if (Info->getMode().DX10Clamp) {
10569 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
10570 // hardware fmed3 behavior converting to a min.
10571 // FIXME: Should this be allowing -0.0?
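// e.g. fminnum(fmaxnum(x, 0.0), 1.0) is emitted as AMDGPUISD::CLAMP on x.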
10572 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
10573 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
10574 }
10575
10576 // med3 for f16 is only available on gfx9+, and not available for v2f16.
10577 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
10578 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
10579 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
10580 // then give the other result, which is different from med3 with a NaN
10581 // input.
10582 SDValue Var = Op0.getOperand(0);
10583 if (!DAG.isKnownNeverSNaN(Var))
10584 return SDValue();
10585
10586 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10587
10588 if ((!K0->hasOneUse() ||
10589 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
10590 (!K1->hasOneUse() ||
10591 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
10592 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
10593 Var, SDValue(K0, 0), SDValue(K1, 0));
10594 }
10595 }
10596
10597 return SDValue();
10598 }
10599
10600 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
10601 DAGCombinerInfo &DCI) const {
10602 SelectionDAG &DAG = DCI.DAG;
10603
10604 EVT VT = N->getValueType(0);
10605 unsigned Opc = N->getOpcode();
10606 SDValue Op0 = N->getOperand(0);
10607 SDValue Op1 = N->getOperand(1);
10608
10609 // Only do this if the inner op has one use, since this will just increase
10610 // register pressure for no benefit.
10611
10612 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
10613 !VT.isVector() &&
10614 (VT == MVT::i32 || VT == MVT::f32 ||
10615 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
10616 // max(max(a, b), c) -> max3(a, b, c)
10617 // min(min(a, b), c) -> min3(a, b, c)
10618 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
10619 SDLoc DL(N);
10620 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
10621 DL,
10622 N->getValueType(0),
10623 Op0.getOperand(0),
10624 Op0.getOperand(1),
10625 Op1);
10626 }
10627
10628 // Try commuted.
10629 // max(a, max(b, c)) -> max3(a, b, c)
10630 // min(a, min(b, c)) -> min3(a, b, c)
10631 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
10632 SDLoc DL(N);
10633 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
10634 DL,
10635 N->getValueType(0),
10636 Op0,
10637 Op1.getOperand(0),
10638 Op1.getOperand(1));
10639 }
10640 }
10641
10642 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
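// e.g. (smin (smax x, -5), 17) -> (smed3 x, -5, 17), since -5 < 17 (signed).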
10643 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
10644 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
10645 return Med3;
10646 }
10647
10648 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
10649 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
10650 return Med3;
10651 }
10652
10653 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
10654 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
10655 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
10656 (Opc == AMDGPUISD::FMIN_LEGACY &&
10657 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
10658 (VT == MVT::f32 || VT == MVT::f64 ||
10659 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
10660 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
10661 Op0.hasOneUse()) {
10662 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
10663 return Res;
10664 }
10665
10666 return SDValue();
10667 }
10668
10669 static bool isClampZeroToOne(SDValue A, SDValue B) {
10670 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
10671 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
10672 // FIXME: Should this be allowing -0.0?
10673 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
10674 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
10675 }
10676 }
10677
10678 return false;
10679 }
10680
10681 // FIXME: Should only worry about snans for version with chain.
10682 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
10683 DAGCombinerInfo &DCI) const {
10684 EVT VT = N->getValueType(0);
10685 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
10686 // NaNs. With a NaN input, the order of the operands may change the result.
10687
10688 SelectionDAG &DAG = DCI.DAG;
10689 SDLoc SL(N);
10690
10691 SDValue Src0 = N->getOperand(0);
10692 SDValue Src1 = N->getOperand(1);
10693 SDValue Src2 = N->getOperand(2);
10694
10695 if (isClampZeroToOne(Src0, Src1)) {
10696 // const_a, const_b, x -> clamp is safe in all cases including signaling
10697 // nans.
10698 // FIXME: Should this be allowing -0.0?
10699 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
10700 }
10701
10702 const MachineFunction &MF = DAG.getMachineFunction();
10703 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10704
10705 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
10706 // handling no dx10-clamp?
10707 if (Info->getMode().DX10Clamp) {
10708 // If NaN is clamped to 0, we are free to reorder the inputs.
10709
10710 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
10711 std::swap(Src0, Src1);
10712
10713 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
10714 std::swap(Src1, Src2);
10715
10716 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
10717 std::swap(Src0, Src1);
10718
10719 if (isClampZeroToOne(Src1, Src2))
10720 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
10721 }
10722
10723 return SDValue();
10724 }
10725
10726 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
10727 DAGCombinerInfo &DCI) const {
10728 SDValue Src0 = N->getOperand(0);
10729 SDValue Src1 = N->getOperand(1);
10730 if (Src0.isUndef() && Src1.isUndef())
10731 return DCI.DAG.getUNDEF(N->getValueType(0));
10732 return SDValue();
10733 }
10734
10735 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
10736 // expanded into a set of cmp/select instructions.
10737 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
10738 unsigned NumElem,
10739 bool IsDivergentIdx,
10740 const GCNSubtarget *Subtarget) {
10741 if (UseDivergentRegisterIndexing)
10742 return false;
10743
10744 unsigned VecSize = EltSize * NumElem;
10745
10746 // Sub-dword vectors of total size 2 dwords or less have a better implementation.
10747 if (VecSize <= 64 && EltSize < 32)
10748 return false;
10749
10750 // Always expand the remaining sub-dword accesses, otherwise they will be
10751 // lowered via memory.
10752 if (EltSize < 32)
10753 return true;
10754
10755 // Always do this if var-idx is divergent, otherwise it will become a loop.
10756 if (IsDivergentIdx)
10757 return true;
10758
10759 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
10760 unsigned NumInsts = NumElem /* Number of compares */ +
10761 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
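// e.g. a v8i32 extract costs 8 compares + 8 cndmasks = 16 instructions,
// while a v8f64 extract costs 8 compares + 16 cndmasks = 24.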
10762
10763 // On some architectures (GFX9) movrel is not available and it's better
10764 // to expand.
10765 if (!Subtarget->hasMovrel())
10766 return NumInsts <= 16;
10767
10768 // If movrel is available, use it instead of expanding for vectors of 8
10769 // elements.
10770 return NumInsts <= 15;
10771 }
10772
10773 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
10774 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
10775 if (isa<ConstantSDNode>(Idx))
10776 return false;
10777
10778 SDValue Vec = N->getOperand(0);
10779 EVT VecVT = Vec.getValueType();
10780 EVT EltVT = VecVT.getVectorElementType();
10781 unsigned EltSize = EltVT.getSizeInBits();
10782 unsigned NumElem = VecVT.getVectorNumElements();
10783
10784 return SITargetLowering::shouldExpandVectorDynExt(
10785 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
10786 }
10787
10788 SDValue SITargetLowering::performExtractVectorEltCombine(
10789 SDNode *N, DAGCombinerInfo &DCI) const {
10790 SDValue Vec = N->getOperand(0);
10791 SelectionDAG &DAG = DCI.DAG;
10792
10793 EVT VecVT = Vec.getValueType();
10794 EVT VecEltVT = VecVT.getVectorElementType();
10795 EVT ResVT = N->getValueType(0);
10796
10797 unsigned VecSize = VecVT.getSizeInBits();
10798 unsigned VecEltSize = VecEltVT.getSizeInBits();
10799
10800 if ((Vec.getOpcode() == ISD::FNEG ||
10801 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
10802 SDLoc SL(N);
10803 SDValue Idx = N->getOperand(1);
10804 SDValue Elt =
10805 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
10806 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
10807 }
10808
10809 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
10810 // =>
10811 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
10812 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
10813 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
10814 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
10815 SDLoc SL(N);
10816 SDValue Idx = N->getOperand(1);
10817 unsigned Opc = Vec.getOpcode();
10818
10819 switch(Opc) {
10820 default:
10821 break;
10822 // TODO: Support other binary operations.
10823 case ISD::FADD:
10824 case ISD::FSUB:
10825 case ISD::FMUL:
10826 case ISD::ADD:
10827 case ISD::UMIN:
10828 case ISD::UMAX:
10829 case ISD::SMIN:
10830 case ISD::SMAX:
10831 case ISD::FMAXNUM:
10832 case ISD::FMINNUM:
10833 case ISD::FMAXNUM_IEEE:
10834 case ISD::FMINNUM_IEEE: {
10835 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
10836 Vec.getOperand(0), Idx);
10837 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
10838 Vec.getOperand(1), Idx);
10839
10840 DCI.AddToWorklist(Elt0.getNode());
10841 DCI.AddToWorklist(Elt1.getNode());
10842 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
10843 }
10844 }
10845 }
10846
10847 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
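// e.g. for a v4f32 the variable-index extract becomes three select_cc nodes
// comparing the index against 1, 2 and 3, with element 0 as the fallback.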
10848 if (shouldExpandVectorDynExt(N)) {
10849 SDLoc SL(N);
10850 SDValue Idx = N->getOperand(1);
10851 SDValue V;
10852 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
10853 SDValue IC = DAG.getVectorIdxConstant(I, SL);
10854 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
10855 if (I == 0)
10856 V = Elt;
10857 else
10858 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
10859 }
10860 return V;
10861 }
10862
10863 if (!DCI.isBeforeLegalize())
10864 return SDValue();
10865
10866 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
10867 // elements. This exposes more load reduction opportunities by replacing
10868 // multiple small extract_vector_elements with a single 32-bit extract.
10869 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
10870 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
10871 VecSize > 32 && VecSize % 32 == 0 && Idx) {
10872 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
10873
10874 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
10875 unsigned EltIdx = BitIndex / 32;
10876 unsigned LeftoverBitIdx = BitIndex % 32;
10877 SDLoc SL(N);
10878
10879 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
10880 DCI.AddToWorklist(Cast.getNode());
10881
10882 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
10883 DAG.getConstant(EltIdx, SL, MVT::i32));
10884 DCI.AddToWorklist(Elt.getNode());
10885 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
10886 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
10887 DCI.AddToWorklist(Srl.getNode());
10888
10889 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
10890 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
10891 DCI.AddToWorklist(Trunc.getNode());
10892
10893 if (VecEltVT == ResVT) {
10894 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
10895 }
10896
10897 assert(ResVT.isScalarInteger());
10898 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
10899 }
10900
10901 return SDValue();
10902 }
10903
10904 SDValue
10905 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
10906 DAGCombinerInfo &DCI) const {
10907 SDValue Vec = N->getOperand(0);
10908 SDValue Idx = N->getOperand(2);
10909 EVT VecVT = Vec.getValueType();
10910 EVT EltVT = VecVT.getVectorElementType();
10911
10912 // INSERT_VECTOR_ELT (<n x e>, var-idx)
10913 // => BUILD_VECTOR n x select (e, const-idx)
10914 if (!shouldExpandVectorDynExt(N))
10915 return SDValue();
10916
10917 SelectionDAG &DAG = DCI.DAG;
10918 SDLoc SL(N);
10919 SDValue Ins = N->getOperand(1);
10920 EVT IdxVT = Idx.getValueType();
10921
10922 SmallVector<SDValue, 16> Ops;
10923 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
10924 SDValue IC = DAG.getConstant(I, SL, IdxVT);
10925 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
10926 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
10927 Ops.push_back(V);
10928 }
10929
10930 return DAG.getBuildVector(VecVT, SL, Ops);
10931 }
10932
10933 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
10934 const SDNode *N0,
10935 const SDNode *N1) const {
10936 EVT VT = N0->getValueType(0);
10937
10938 // Only do this if we are not trying to support denormals. v_mad_f32 does not
10939 // support denormals ever.
10940 if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) ||
10941 (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) &&
10942 getSubtarget()->hasMadF16())) &&
10943 isOperationLegal(ISD::FMAD, VT))
10944 return ISD::FMAD;
10945
10946 const TargetOptions &Options = DAG.getTarget().Options;
10947 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
10948 (N0->getFlags().hasAllowContract() &&
10949 N1->getFlags().hasAllowContract())) &&
10950 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
10951 return ISD::FMA;
10952 }
10953
10954 return 0;
10955 }
10956
10957 // For a reassociatable opcode perform:
10958 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
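// e.g. add s0, (add v0, s1) -> add (add s0, s1), v0, so the uniform half can
// be done as a scalar add.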
10959 SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
10960 SelectionDAG &DAG) const {
10961 EVT VT = N->getValueType(0);
10962 if (VT != MVT::i32 && VT != MVT::i64)
10963 return SDValue();
10964
10965 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
10966 return SDValue();
10967
10968 unsigned Opc = N->getOpcode();
10969 SDValue Op0 = N->getOperand(0);
10970 SDValue Op1 = N->getOperand(1);
10971
10972 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
10973 return SDValue();
10974
10975 if (Op0->isDivergent())
10976 std::swap(Op0, Op1);
10977
10978 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
10979 return SDValue();
10980
10981 SDValue Op2 = Op1.getOperand(1);
10982 Op1 = Op1.getOperand(0);
10983 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
10984 return SDValue();
10985
10986 if (Op1->isDivergent())
10987 std::swap(Op1, Op2);
10988
10989 SDLoc SL(N);
10990 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
10991 return DAG.getNode(Opc, SL, VT, Add1, Op2);
10992 }
10993
10994 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
10995 EVT VT,
10996 SDValue N0, SDValue N1, SDValue N2,
10997 bool Signed) {
10998 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
10999 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
11000 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
11001 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
11002 }
11003
11004 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
11005 // multiplies, if any.
11006 //
11007 // Full 64-bit multiplies that feed into an addition are lowered here instead
11008 // of using the generic expansion. The generic expansion ends up with
11009 // a tree of ADD nodes that prevents us from using the "add" part of the
11010 // MAD instruction. The expansion produced here results in a chain of ADDs
11011 // instead of a tree.
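// For example, an i64 (add (mul x, y), z) whose factors fit in 32 signed bits
// becomes a single mad_i64_i32; otherwise up to two extra 32-bit multiplies
// and adds (see the pseudo code below) fix up the high half.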
11012 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
11013 DAGCombinerInfo &DCI) const {
11014 assert(N->getOpcode() == ISD::ADD);
11015
11016 SelectionDAG &DAG = DCI.DAG;
11017 EVT VT = N->getValueType(0);
11018 SDLoc SL(N);
11019 SDValue LHS = N->getOperand(0);
11020 SDValue RHS = N->getOperand(1);
11021
11022 if (VT.isVector())
11023 return SDValue();
11024
11025 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
11026 // result in scalar registers for uniform values.
11027 if (!N->isDivergent() && Subtarget->hasSMulHi())
11028 return SDValue();
11029
11030 unsigned NumBits = VT.getScalarSizeInBits();
11031 if (NumBits <= 32 || NumBits > 64)
11032 return SDValue();
11033
11034 if (LHS.getOpcode() != ISD::MUL) {
11035 assert(RHS.getOpcode() == ISD::MUL);
11036 std::swap(LHS, RHS);
11037 }
11038
11039 // Avoid the fold if it would unduly increase the number of multiplies due to
11040 // multiple uses, except on hardware with full-rate multiply-add (which is
11041 // part of full-rate 64-bit ops).
11042 if (!Subtarget->hasFullRate64Ops()) {
11043 unsigned NumUsers = 0;
11044 for (SDNode *Use : LHS->uses()) {
11045 // There is a use that does not feed into addition, so the multiply can't
11046 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
11047 if (Use->getOpcode() != ISD::ADD)
11048 return SDValue();
11049
11050 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
11051 // MUL + 3xADD + 3xADDC over 3xMAD.
11052 ++NumUsers;
11053 if (NumUsers >= 3)
11054 return SDValue();
11055 }
11056 }
11057
11058 SDValue MulLHS = LHS.getOperand(0);
11059 SDValue MulRHS = LHS.getOperand(1);
11060 SDValue AddRHS = RHS;
11061
11062 // Always check whether operands are small unsigned values, since that
11063 // knowledge is useful in more cases. Check for small signed values only if
11064 // doing so can unlock a shorter code sequence.
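// e.g. when both factors are known to fit in 32 unsigned bits, the high-half
// fixup below is skipped entirely and a single mad_u64_u32 suffices.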
11065 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
11066 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
11067
11068 bool MulSignedLo = false;
11069 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
11070 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
11071 numBitsSigned(MulRHS, DAG) <= 32;
11072 }
11073
11074 // The operands and final result all have the same number of bits. If
11075 // operands need to be extended, they can be extended with garbage. The
11076 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
11077 // truncated away in the end.
11078 if (VT != MVT::i64) {
11079 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
11080 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
11081 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
11082 }
11083
11084 // The basic code generated is conceptually straightforward. Pseudo code:
11085 //
11086 // accum = mad_64_32 lhs.lo, rhs.lo, accum
11087 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
11088 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
11089 //
11090 // The second and third lines are optional, depending on whether the factors
11091 // are {sign,zero}-extended or not.
11092 //
11093 // The actual DAG is noisier than the pseudo code, but only due to
11094 // instructions that disassemble values into low and high parts, and
11095 // assemble the final result.
11096 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
11097 SDValue One = DAG.getConstant(1, SL, MVT::i32);
11098
11099 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
11100 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
11101 SDValue Accum =
11102 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
11103
11104 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
11105 auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero);
11106 auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One);
11107
11108 if (!MulLHSUnsigned32) {
11109 auto MulLHSHi =
11110 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
11111 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
11112 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
11113 }
11114
11115 if (!MulRHSUnsigned32) {
11116 auto MulRHSHi =
11117 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
11118 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
11119 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
11120 }
11121
11122 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
11123 Accum = DAG.getBitcast(MVT::i64, Accum);
11124 }
11125
11126 if (VT != MVT::i64)
11127 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
11128 return Accum;
11129 }
11130
11131 SDValue SITargetLowering::performAddCombine(SDNode *N,
11132 DAGCombinerInfo &DCI) const {
11133 SelectionDAG &DAG = DCI.DAG;
11134 EVT VT = N->getValueType(0);
11135 SDLoc SL(N);
11136 SDValue LHS = N->getOperand(0);
11137 SDValue RHS = N->getOperand(1);
11138
11139 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
11140 if (Subtarget->hasMad64_32()) {
11141 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
11142 return Folded;
11143 }
11144
11145 return SDValue();
11146 }
11147
11148 if (SDValue V = reassociateScalarOps(N, DAG)) {
11149 return V;
11150 }
11151
11152 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
11153 return SDValue();
11154
11155 // add x, zext (setcc) => addcarry x, 0, setcc
11156 // add x, sext (setcc) => subcarry x, 0, setcc
11157 unsigned Opc = LHS.getOpcode();
11158 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
11159 Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
11160 std::swap(RHS, LHS);
11161
11162 Opc = RHS.getOpcode();
11163 switch (Opc) {
11164 default: break;
11165 case ISD::ZERO_EXTEND:
11166 case ISD::SIGN_EXTEND:
11167 case ISD::ANY_EXTEND: {
11168 auto Cond = RHS.getOperand(0);
11169 // If this won't be a real VOPC output, we would still need to insert an
11170 // extra instruction anyway.
11171 if (!isBoolSGPR(Cond))
11172 break;
11173 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
11174 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
11175 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
11176 return DAG.getNode(Opc, SL, VTList, Args);
11177 }
11178 case ISD::ADDCARRY: {
11179 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
11180 auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11181 if (!C || C->getZExtValue() != 0) break;
11182 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
11183 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
11184 }
11185 }
11186 return SDValue();
11187 }
11188
11189 SDValue SITargetLowering::performSubCombine(SDNode *N,
11190 DAGCombinerInfo &DCI) const {
11191 SelectionDAG &DAG = DCI.DAG;
11192 EVT VT = N->getValueType(0);
11193
11194 if (VT != MVT::i32)
11195 return SDValue();
11196
11197 SDLoc SL(N);
11198 SDValue LHS = N->getOperand(0);
11199 SDValue RHS = N->getOperand(1);
11200
11201 // sub x, zext (setcc) => subcarry x, 0, setcc
11202 // sub x, sext (setcc) => addcarry x, 0, setcc
11203 unsigned Opc = RHS.getOpcode();
11204 switch (Opc) {
11205 default: break;
11206 case ISD::ZERO_EXTEND:
11207 case ISD::SIGN_EXTEND:
11208 case ISD::ANY_EXTEND: {
11209 auto Cond = RHS.getOperand(0);
11210 // If this won't be a real VOPC output, we would still need to insert an
11211 // extra instruction anyway.
11212 if (!isBoolSGPR(Cond))
11213 break;
11214 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
11215 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
11216 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
11217 return DAG.getNode(Opc, SL, VTList, Args);
11218 }
11219 }
11220
11221 if (LHS.getOpcode() == ISD::SUBCARRY) {
11222 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
11223 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
11224 if (!C || !C->isZero())
11225 return SDValue();
11226 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
11227 return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
11228 }
11229 return SDValue();
11230 }
11231
11232 SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
11233 DAGCombinerInfo &DCI) const {
11234
11235 if (N->getValueType(0) != MVT::i32)
11236 return SDValue();
11237
11238 auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
11239 if (!C || C->getZExtValue() != 0)
11240 return SDValue();
11241
11242 SelectionDAG &DAG = DCI.DAG;
11243 SDValue LHS = N->getOperand(0);
11244
11245 // addcarry (add x, y), 0, cc => addcarry x, y, cc
11246 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
11247 unsigned LHSOpc = LHS.getOpcode();
11248 unsigned Opc = N->getOpcode();
11249 if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
11250 (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
11251 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
11252 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
11253 }
11254 return SDValue();
11255 }
11256
11257 SDValue SITargetLowering::performFAddCombine(SDNode *N,
11258 DAGCombinerInfo &DCI) const {
11259 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
11260 return SDValue();
11261
11262 SelectionDAG &DAG = DCI.DAG;
11263 EVT VT = N->getValueType(0);
11264
11265 SDLoc SL(N);
11266 SDValue LHS = N->getOperand(0);
11267 SDValue RHS = N->getOperand(1);
11268
11269 // These should really be instruction patterns, but writing patterns with
11270 // source modifiers is a pain.
11271
11272 // fadd (fadd (a, a), b) -> mad 2.0, a, b
11273 if (LHS.getOpcode() == ISD::FADD) {
11274 SDValue A = LHS.getOperand(0);
11275 if (A == LHS.getOperand(1)) {
11276 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
11277 if (FusedOp != 0) {
11278 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
11279 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
11280 }
11281 }
11282 }
11283
11284 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
11285 if (RHS.getOpcode() == ISD::FADD) {
11286 SDValue A = RHS.getOperand(0);
11287 if (A == RHS.getOperand(1)) {
11288 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
11289 if (FusedOp != 0) {
11290 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
11291 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
11292 }
11293 }
11294 }
11295
11296 return SDValue();
11297 }
11298
11299 SDValue SITargetLowering::performFSubCombine(SDNode *N,
11300 DAGCombinerInfo &DCI) const {
11301 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
11302 return SDValue();
11303
11304 SelectionDAG &DAG = DCI.DAG;
11305 SDLoc SL(N);
11306 EVT VT = N->getValueType(0);
11307 assert(!VT.isVector());
11308
11309 // Try to get the fneg to fold into the source modifier. This undoes generic
11310 // DAG combines and folds them into the mad.
11311 //
11312 // Only do this if we are not trying to support denormals. v_mad_f32 does
11313 // not support denormals ever.
11314 SDValue LHS = N->getOperand(0);
11315 SDValue RHS = N->getOperand(1);
11316 if (LHS.getOpcode() == ISD::FADD) {
11317 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
11318 SDValue A = LHS.getOperand(0);
11319 if (A == LHS.getOperand(1)) {
11320 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
11321 if (FusedOp != 0){
11322 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
11323 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11324
11325 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
11326 }
11327 }
11328 }
11329
11330 if (RHS.getOpcode() == ISD::FADD) {
11331 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
11332
11333 SDValue A = RHS.getOperand(0);
11334 if (A == RHS.getOperand(1)) {
11335 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
11336 if (FusedOp != 0){
11337 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
11338 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
11339 }
11340 }
11341 }
11342
11343 return SDValue();
11344 }
11345
11346 SDValue SITargetLowering::performFMACombine(SDNode *N,
11347 DAGCombinerInfo &DCI) const {
11348 SelectionDAG &DAG = DCI.DAG;
11349 EVT VT = N->getValueType(0);
11350 SDLoc SL(N);
11351
11352 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
11353 return SDValue();
11354
11355 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
11356 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
11357 SDValue Op1 = N->getOperand(0);
11358 SDValue Op2 = N->getOperand(1);
11359 SDValue FMA = N->getOperand(2);
11360
11361 if (FMA.getOpcode() != ISD::FMA ||
11362 Op1.getOpcode() != ISD::FP_EXTEND ||
11363 Op2.getOpcode() != ISD::FP_EXTEND)
11364 return SDValue();
11365
11366 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
11367 // regardless of the denorm mode setting. Therefore,
11368 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
11369 const TargetOptions &Options = DAG.getTarget().Options;
11370 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
11371 (N->getFlags().hasAllowContract() &&
11372 FMA->getFlags().hasAllowContract())) {
11373 Op1 = Op1.getOperand(0);
11374 Op2 = Op2.getOperand(0);
11375 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11376 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11377 return SDValue();
11378
11379 SDValue Vec1 = Op1.getOperand(0);
11380 SDValue Idx1 = Op1.getOperand(1);
11381 SDValue Vec2 = Op2.getOperand(0);
11382
11383 SDValue FMAOp1 = FMA.getOperand(0);
11384 SDValue FMAOp2 = FMA.getOperand(1);
11385 SDValue FMAAcc = FMA.getOperand(2);
11386
11387 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
11388 FMAOp2.getOpcode() != ISD::FP_EXTEND)
11389 return SDValue();
11390
11391 FMAOp1 = FMAOp1.getOperand(0);
11392 FMAOp2 = FMAOp2.getOperand(0);
11393 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11394 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11395 return SDValue();
11396
11397 SDValue Vec3 = FMAOp1.getOperand(0);
11398 SDValue Vec4 = FMAOp2.getOperand(0);
11399 SDValue Idx2 = FMAOp1.getOperand(1);
11400
11401 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
11402 // Idx1 and Idx2 cannot be the same.
11403 Idx1 == Idx2)
11404 return SDValue();
11405
11406 if (Vec1 == Vec2 || Vec3 == Vec4)
11407 return SDValue();
11408
11409 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
11410 return SDValue();
11411
11412 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
11413 (Vec1 == Vec4 && Vec2 == Vec3)) {
11414 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
11415 DAG.getTargetConstant(0, SL, MVT::i1));
11416 }
11417 }
11418 return SDValue();
11419 }
11420
11421 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
11422 DAGCombinerInfo &DCI) const {
11423 SelectionDAG &DAG = DCI.DAG;
11424 SDLoc SL(N);
11425
11426 SDValue LHS = N->getOperand(0);
11427 SDValue RHS = N->getOperand(1);
11428 EVT VT = LHS.getValueType();
11429 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
11430
11431 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
11432 if (!CRHS) {
11433 CRHS = dyn_cast<ConstantSDNode>(LHS);
11434 if (CRHS) {
11435 std::swap(LHS, RHS);
11436 CC = getSetCCSwappedOperands(CC);
11437 }
11438 }
11439
11440 if (CRHS) {
11441 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
11442 isBoolSGPR(LHS.getOperand(0))) {
11443 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
11444 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
11445 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
11446 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
11447 if ((CRHS->isAllOnes() &&
11448 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
11449 (CRHS->isZero() &&
11450 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
11451 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
11452 DAG.getConstant(-1, SL, MVT::i1));
11453 if ((CRHS->isAllOnes() &&
11454 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
11455 (CRHS->isZero() &&
11456 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
11457 return LHS.getOperand(0);
11458 }
11459
11460 const APInt &CRHSVal = CRHS->getAPIntValue();
11461 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
11462 LHS.getOpcode() == ISD::SELECT &&
11463 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11464 isa<ConstantSDNode>(LHS.getOperand(2)) &&
11465 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
11466 isBoolSGPR(LHS.getOperand(0))) {
11467 // Given CT != FT:
11468 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
11469 // setcc (select cc, CT, CF), CF, ne => cc
11470 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
11471 // setcc (select cc, CT, CF), CT, eq => cc
11472 const APInt &CT = LHS.getConstantOperandAPInt(1);
11473 const APInt &CF = LHS.getConstantOperandAPInt(2);
11474
11475 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
11476 (CT == CRHSVal && CC == ISD::SETNE))
11477 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
11478 DAG.getConstant(-1, SL, MVT::i1));
11479 if ((CF == CRHSVal && CC == ISD::SETNE) ||
11480 (CT == CRHSVal && CC == ISD::SETEQ))
11481 return LHS.getOperand(0);
11482 }
11483 }
11484
11485 if (VT != MVT::f32 && VT != MVT::f64 &&
11486 (!Subtarget->has16BitInsts() || VT != MVT::f16))
11487 return SDValue();
11488
11489 // Match isinf/isfinite pattern
11490 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
11491 // (fcmp one (fabs x), inf) -> (fp_class x,
11492 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
11493 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
11494 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
11495 if (!CRHS)
11496 return SDValue();
11497
11498 const APFloat &APF = CRHS->getValueAPF();
11499 if (APF.isInfinity() && !APF.isNegative()) {
11500 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
11501 SIInstrFlags::N_INFINITY;
11502 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
11503 SIInstrFlags::P_ZERO |
11504 SIInstrFlags::N_NORMAL |
11505 SIInstrFlags::P_NORMAL |
11506 SIInstrFlags::N_SUBNORMAL |
11507 SIInstrFlags::P_SUBNORMAL;
11508 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
11509 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
11510 DAG.getConstant(Mask, SL, MVT::i32));
11511 }
11512 }
11513
11514 return SDValue();
11515 }
11516
11517 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
11518 DAGCombinerInfo &DCI) const {
11519 SelectionDAG &DAG = DCI.DAG;
11520 SDLoc SL(N);
11521 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
11522
11523 SDValue Src = N->getOperand(0);
11524 SDValue Shift = N->getOperand(0);
11525
11526 // TODO: Extend type shouldn't matter (assuming legal types).
11527 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
11528 Shift = Shift.getOperand(0);
11529
11530 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
11531 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
11532 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
11533 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
11534 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
11535 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
11536 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
11537 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
11538 SDLoc(Shift.getOperand(0)), MVT::i32);
11539
11540 unsigned ShiftOffset = 8 * Offset;
11541 if (Shift.getOpcode() == ISD::SHL)
11542 ShiftOffset -= C->getZExtValue();
11543 else
11544 ShiftOffset += C->getZExtValue();
11545
11546 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
11547 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
11548 MVT::f32, Shifted);
11549 }
11550 }
11551 }
11552
11553 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11554 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
11555 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
11556 // We simplified Src. If this node is not dead, visit it again so it is
11557 // folded properly.
11558 if (N->getOpcode() != ISD::DELETED_NODE)
11559 DCI.AddToWorklist(N);
11560 return SDValue(N, 0);
11561 }
11562
11563 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
11564 if (SDValue DemandedSrc =
11565 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
11566 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
11567
11568 return SDValue();
11569 }
11570
11571 SDValue SITargetLowering::performClampCombine(SDNode *N,
11572 DAGCombinerInfo &DCI) const {
11573 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
11574 if (!CSrc)
11575 return SDValue();
11576
11577 const MachineFunction &MF = DCI.DAG.getMachineFunction();
11578 const APFloat &F = CSrc->getValueAPF();
11579 APFloat Zero = APFloat::getZero(F.getSemantics());
11580 if (F < Zero ||
11581 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
11582 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
11583 }
11584
11585 APFloat One(F.getSemantics(), "1.0");
11586 if (F > One)
11587 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
11588
11589 return SDValue(CSrc, 0);
11590 }
11591
11592
11593 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
11594 DAGCombinerInfo &DCI) const {
11595 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
11596 return SDValue();
11597 switch (N->getOpcode()) {
11598 case ISD::ADD:
11599 return performAddCombine(N, DCI);
11600 case ISD::SUB:
11601 return performSubCombine(N, DCI);
11602 case ISD::ADDCARRY:
11603 case ISD::SUBCARRY:
11604 return performAddCarrySubCarryCombine(N, DCI);
11605 case ISD::FADD:
11606 return performFAddCombine(N, DCI);
11607 case ISD::FSUB:
11608 return performFSubCombine(N, DCI);
11609 case ISD::SETCC:
11610 return performSetCCCombine(N, DCI);
11611 case ISD::FMAXNUM:
11612 case ISD::FMINNUM:
11613 case ISD::FMAXNUM_IEEE:
11614 case ISD::FMINNUM_IEEE:
11615 case ISD::SMAX:
11616 case ISD::SMIN:
11617 case ISD::UMAX:
11618 case ISD::UMIN:
11619 case AMDGPUISD::FMIN_LEGACY:
11620 case AMDGPUISD::FMAX_LEGACY:
11621 return performMinMaxCombine(N, DCI);
11622 case ISD::FMA:
11623 return performFMACombine(N, DCI);
11624 case ISD::AND:
11625 return performAndCombine(N, DCI);
11626 case ISD::OR:
11627 return performOrCombine(N, DCI);
11628 case ISD::XOR:
11629 return performXorCombine(N, DCI);
11630 case ISD::ZERO_EXTEND:
11631 return performZeroExtendCombine(N, DCI);
11632 case ISD::SIGN_EXTEND_INREG:
11633 return performSignExtendInRegCombine(N , DCI);
11634 case AMDGPUISD::FP_CLASS:
11635 return performClassCombine(N, DCI);
11636 case ISD::FCANONICALIZE:
11637 return performFCanonicalizeCombine(N, DCI);
11638 case AMDGPUISD::RCP:
11639 return performRcpCombine(N, DCI);
11640 case AMDGPUISD::FRACT:
11641 case AMDGPUISD::RSQ:
11642 case AMDGPUISD::RCP_LEGACY:
11643 case AMDGPUISD::RCP_IFLAG:
11644 case AMDGPUISD::RSQ_CLAMP:
11645 case AMDGPUISD::LDEXP: {
11646 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
11647 SDValue Src = N->getOperand(0);
11648 if (Src.isUndef())
11649 return Src;
11650 break;
11651 }
11652 case ISD::SINT_TO_FP:
11653 case ISD::UINT_TO_FP:
11654 return performUCharToFloatCombine(N, DCI);
11655 case AMDGPUISD::CVT_F32_UBYTE0:
11656 case AMDGPUISD::CVT_F32_UBYTE1:
11657 case AMDGPUISD::CVT_F32_UBYTE2:
11658 case AMDGPUISD::CVT_F32_UBYTE3:
11659 return performCvtF32UByteNCombine(N, DCI);
11660 case AMDGPUISD::FMED3:
11661 return performFMed3Combine(N, DCI);
11662 case AMDGPUISD::CVT_PKRTZ_F16_F32:
11663 return performCvtPkRTZCombine(N, DCI);
11664 case AMDGPUISD::CLAMP:
11665 return performClampCombine(N, DCI);
11666 case ISD::SCALAR_TO_VECTOR: {
11667 SelectionDAG &DAG = DCI.DAG;
11668 EVT VT = N->getValueType(0);
11669
11670 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
11671 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
11672 SDLoc SL(N);
11673 SDValue Src = N->getOperand(0);
11674 EVT EltVT = Src.getValueType();
11675 if (EltVT == MVT::f16)
11676 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
11677
11678 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
11679 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
11680 }
11681
11682 break;
11683 }
11684 case ISD::EXTRACT_VECTOR_ELT:
11685 return performExtractVectorEltCombine(N, DCI);
11686 case ISD::INSERT_VECTOR_ELT:
11687 return performInsertVectorEltCombine(N, DCI);
11688 case ISD::LOAD: {
11689 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
11690 return Widened;
11691 [[fallthrough]];
11692 }
11693 default: {
11694 if (!DCI.isBeforeLegalize()) {
11695 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
11696 return performMemSDNodeCombine(MemNode, DCI);
11697 }
11698
11699 break;
11700 }
11701 }
11702
11703 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
11704 }
11705
11706 /// Helper function for adjustWritemask
11707 static unsigned SubIdx2Lane(unsigned Idx) {
11708 switch (Idx) {
11709 default: return ~0u;
11710 case AMDGPU::sub0: return 0;
11711 case AMDGPU::sub1: return 1;
11712 case AMDGPU::sub2: return 2;
11713 case AMDGPU::sub3: return 3;
11714 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
11715 }
11716 }
11717
11718 /// Adjust the writemask of MIMG instructions
11719 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
11720 SelectionDAG &DAG) const {
11721 unsigned Opcode = Node->getMachineOpcode();
11722
11723 // Subtract 1 because the vdata output is not a MachineSDNode operand.
11724 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
11725 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
11726 return Node; // not implemented for D16
11727
11728 SDNode *Users[5] = { nullptr };
11729 unsigned Lane = 0;
11730 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
11731 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
11732 unsigned NewDmask = 0;
11733 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
11734 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
11735 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
11736 Node->getConstantOperandVal(LWEIdx))
11737 ? true
11738 : false;
11739 unsigned TFCLane = 0;
11740 bool HasChain = Node->getNumValues() > 1;
11741
11742 if (OldDmask == 0) {
11743 // These are folded out, but on the chance it happens don't assert.
11744 return Node;
11745 }
11746
11747 unsigned OldBitsSet = llvm::popcount(OldDmask);
11748 // Work out which is the TFE/LWE lane if that is enabled.
11749 if (UsesTFC) {
11750 TFCLane = OldBitsSet;
11751 }
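  // E.g. with dmask = 0b0111 and TFE enabled, lanes 0..2 carry the texture
  // data and lane 3 (== OldBitsSet) carries the TFE/LWE status dword.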
11752
11753 // Try to figure out the used register components
11754 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
11755 I != E; ++I) {
11756
11757 // Don't look at users of the chain.
11758 if (I.getUse().getResNo() != 0)
11759 continue;
11760
11761 // Abort if we can't understand the usage
11762 if (!I->isMachineOpcode() ||
11763 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
11764 return Node;
11765
11766 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
11767 // Note that subregs are packed, i.e. Lane==0 is the first bit set
11768 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
11769 // set, etc.
11770 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
11771 if (Lane == ~0u)
11772 return Node;
11773
11774 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
11775 if (UsesTFC && Lane == TFCLane) {
11776 Users[Lane] = *I;
11777 } else {
11778 // Set which texture component corresponds to the lane.
11779 unsigned Comp;
11780 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
11781 Comp = countTrailingZeros(Dmask);
11782 Dmask &= ~(1 << Comp);
11783 }
11784
11785 // Abort if we have more than one user per component.
11786 if (Users[Lane])
11787 return Node;
11788
11789 Users[Lane] = *I;
11790 NewDmask |= 1 << Comp;
11791 }
11792 }
11793
11794 // Don't allow 0 dmask, as hardware assumes one channel enabled.
11795 bool NoChannels = !NewDmask;
11796 if (NoChannels) {
11797 if (!UsesTFC) {
11798 // No uses of the result and not using TFC. Then do nothing.
11799 return Node;
11800 }
11801 // If the original dmask has one channel - then nothing to do
11802 if (OldBitsSet == 1)
11803 return Node;
11804 // Use an arbitrary dmask - required for the instruction to work
11805 NewDmask = 1;
11806 }
11807 // Abort if there's no change
11808 if (NewDmask == OldDmask)
11809 return Node;
11810
11811 unsigned BitsSet = llvm::popcount(NewDmask);
11812
11813 // Check for TFE or LWE - increase the number of channels by one to account
11814 // for the extra return value
11815 // This will need adjustment for D16 if that is also handled in
11816 // adjustWritemask (this function), but at present D16 is excluded.
11817 unsigned NewChannels = BitsSet + UsesTFC;
11818
11819 int NewOpcode =
11820 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
11821 assert(NewOpcode != -1 &&
11822 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
11823 "failed to find equivalent MIMG op");
11824
11825 // Adjust the writemask in the node
11826 SmallVector<SDValue, 12> Ops;
11827 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
11828 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
11829 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
11830
11831 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
11832
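  // Added commentary: 3- and 5-channel results are widened to 4- and
  // 8-element vector types so the value type maps onto a supported return
  // width; the extra elements are simply left unused.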
11833 MVT ResultVT = NewChannels == 1 ?
11834 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
11835 NewChannels == 5 ? 8 : NewChannels);
11836 SDVTList NewVTList = HasChain ?
11837 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
11838
11839
11840 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
11841 NewVTList, Ops);
11842
11843 if (HasChain) {
11844 // Update chain.
11845 DAG.setNodeMemRefs(NewNode, Node->memoperands());
11846 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
11847 }
11848
11849 if (NewChannels == 1) {
11850 assert(Node->hasNUsesOfValue(1, 0));
11851 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
11852 SDLoc(Node), Users[Lane]->getValueType(0),
11853 SDValue(NewNode, 0));
11854 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
11855 return nullptr;
11856 }
11857
11858 // Update the users of the node with the new indices
11859 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
11860 SDNode *User = Users[i];
11861 if (!User) {
11862 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
11863 // Users[0] is still nullptr because channel 0 doesn't really have a use.
11864 if (i || !NoChannels)
11865 continue;
11866 } else {
11867 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
11868 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
11869 }
11870
11871 switch (Idx) {
11872 default: break;
11873 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
11874 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
11875 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
11876 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
11877 }
11878 }
11879
11880 DAG.RemoveDeadNode(Node);
11881 return nullptr;
11882 }
11883
11884 static bool isFrameIndexOp(SDValue Op) {
11885 if (Op.getOpcode() == ISD::AssertZext)
11886 Op = Op.getOperand(0);
11887
11888 return isa<FrameIndexSDNode>(Op);
11889 }
11890
11891 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
11892 /// with frame index operands.
11893 /// LLVM assumes that inputs to these instructions are registers.
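/// Sketch of the effect (illustrative only): a frame index operand of a
/// REG_SEQUENCE is rewritten as
///   %fi.reg = S_MOV_B32 %stack.0
/// and the REG_SEQUENCE then uses %fi.reg instead of the raw frame index.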
11894 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
11895 SelectionDAG &DAG) const {
11896 if (Node->getOpcode() == ISD::CopyToReg) {
11897 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
11898 SDValue SrcVal = Node->getOperand(2);
11899
11900 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
11901 // to try understanding copies to physical registers.
11902 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
11903 SDLoc SL(Node);
11904 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
11905 SDValue VReg = DAG.getRegister(
11906 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
11907
11908 SDNode *Glued = Node->getGluedNode();
11909 SDValue ToVReg
11910 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
11911 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
11912 SDValue ToResultReg
11913 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
11914 VReg, ToVReg.getValue(1));
11915 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
11916 DAG.RemoveDeadNode(Node);
11917 return ToResultReg.getNode();
11918 }
11919 }
11920
11921 SmallVector<SDValue, 8> Ops;
11922 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
11923 if (!isFrameIndexOp(Node->getOperand(i))) {
11924 Ops.push_back(Node->getOperand(i));
11925 continue;
11926 }
11927
11928 SDLoc DL(Node);
11929 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
11930 Node->getOperand(i).getValueType(),
11931 Node->getOperand(i)), 0));
11932 }
11933
11934 return DAG.UpdateNodeOperands(Node, Ops);
11935 }
11936
11937 /// Fold the instructions after selecting them.
11938 /// Returns null if users were already updated.
11939 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
11940 SelectionDAG &DAG) const {
11941 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11942 unsigned Opcode = Node->getMachineOpcode();
11943
11944 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
11945 !TII->isGather4(Opcode) &&
11946 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
11947 return adjustWritemask(Node, DAG);
11948 }
11949
11950 if (Opcode == AMDGPU::INSERT_SUBREG ||
11951 Opcode == AMDGPU::REG_SEQUENCE) {
11952 legalizeTargetIndependentNode(Node, DAG);
11953 return Node;
11954 }
11955
11956 switch (Opcode) {
11957 case AMDGPU::V_DIV_SCALE_F32_e64:
11958 case AMDGPU::V_DIV_SCALE_F64_e64: {
11959 // Satisfy the operand register constraint when one of the inputs is
11960 // undefined. Ordinarily each undef value will have its own implicit_def of
11961 // a vreg, so force these to use a single register.
11962 SDValue Src0 = Node->getOperand(1);
11963 SDValue Src1 = Node->getOperand(3);
11964 SDValue Src2 = Node->getOperand(5);
11965
11966 if ((Src0.isMachineOpcode() &&
11967 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
11968 (Src0 == Src1 || Src0 == Src2))
11969 break;
11970
11971 MVT VT = Src0.getValueType().getSimpleVT();
11972 const TargetRegisterClass *RC =
11973 getRegClassFor(VT, Src0.getNode()->isDivergent());
11974
11975 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
11976 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
11977
11978 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
11979 UndefReg, Src0, SDValue());
11980
11981 // src0 must be the same register as src1 or src2, even if the value is
11982 // undefined, so make sure we don't violate this constraint.
11983 if (Src0.isMachineOpcode() &&
11984 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
11985 if (Src1.isMachineOpcode() &&
11986 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
11987 Src0 = Src1;
11988 else if (Src2.isMachineOpcode() &&
11989 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
11990 Src0 = Src2;
11991 else {
11992 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
11993 Src0 = UndefReg;
11994 Src1 = UndefReg;
11995 }
11996 } else
11997 break;
11998
11999 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
12000 Ops[1] = Src0;
12001 Ops[3] = Src1;
12002 Ops[5] = Src2;
12003 Ops.push_back(ImpDef.getValue(1));
12004 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
12005 }
12006 default:
12007 break;
12008 }
12009
12010 return Node;
12011 }
12012
12013 // Any MIMG instructions that use tfe or lwe require an initialization of the
12014 // result register that will be written in the case of a memory access failure.
12015 // The required code is also added to tie this init code to the result of the
12016 // img instruction.
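// Illustrative example (added, not from a specific test): for an image load
// with dmask = 0x7 and tfe = 1, ActiveLanes = 3 and InitIdx = 4, so with
// PRTStrictNull the first four dwords of the result register are zeroed and
// tied to the instruction's vdata output.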
12017 void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
12018 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12019 const SIRegisterInfo &TRI = TII->getRegisterInfo();
12020 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
12021 MachineBasicBlock &MBB = *MI.getParent();
12022
12023 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
12024 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
12025 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
12026
12027 if (!TFE && !LWE) // intersect_ray
12028 return;
12029
12030 unsigned TFEVal = TFE ? TFE->getImm() : 0;
12031 unsigned LWEVal = LWE ? LWE->getImm() : 0;
12032 unsigned D16Val = D16 ? D16->getImm() : 0;
12033
12034 if (!TFEVal && !LWEVal)
12035 return;
12036
12037 // At least one of TFE or LWE is non-zero
12038 // We have to insert a suitable initialization of the result value and
12039 // tie this to the dest of the image instruction.
12040
12041 const DebugLoc &DL = MI.getDebugLoc();
12042
12043 int DstIdx =
12044 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
12045
12046 // Calculate which dword we have to initialize to 0.
12047 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
12048
12049 // check that dmask operand is found.
12050 assert(MO_Dmask && "Expected dmask operand in instruction");
12051
12052 unsigned dmask = MO_Dmask->getImm();
12053 // Determine the number of active lanes taking into account the
12054 // Gather4 special case
12055 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
12056
12057 bool Packed = !Subtarget->hasUnpackedD16VMem();
12058
12059 unsigned InitIdx =
12060 D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
12061
12062 // Abandon attempt if the dst size isn't large enough
12063 // - this is in fact an error but this is picked up elsewhere and
12064 // reported correctly.
12065 uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
12066 if (DstSize < InitIdx)
12067 return;
12068
12069 // Create a register for the initialization value.
12070 Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
12071 unsigned NewDst = 0; // Final initialized value will be in here
12072
12073 // If PRTStrictNull feature is enabled (the default) then initialize
12074 // all the result registers to 0, otherwise just the error indication
12075 // register (VGPRn+1)
12076 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
12077 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
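  // E.g. with InitIdx = 4: the strict-null case zeroes dwords 0..3, while the
  // non-strict case only zeroes dword 3 (the TFE/LWE status dword).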
12078
12079 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
12080 for (; SizeLeft; SizeLeft--, CurrIdx++) {
12081 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
12082 // Initialize dword
12083 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
12084 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
12085 .addImm(0);
12086 // Insert into the super-reg
12087 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
12088 .addReg(PrevDst)
12089 .addReg(SubReg)
12090 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
12091
12092 PrevDst = NewDst;
12093 }
12094
12095 // Add as an implicit operand
12096 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
12097
12098 // Tie the just added implicit operand to the dst
12099 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
12100 }
12101
12102 /// Assign the register class depending on the number of
12103 /// bits set in the writemask
12104 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
12105 SDNode *Node) const {
12106 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12107
12108 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
12109
12110 if (TII->isVOP3(MI.getOpcode())) {
12111 // Make sure constant bus requirements are respected.
12112 TII->legalizeOperandsVOP3(MRI, MI);
12113
12114 // Prefer VGPRs over AGPRs in mAI instructions where possible.
12115 // This saves a chain-copy of registers and better balance register
12116 // use between vgpr and agpr as agpr tuples tend to be big.
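    // E.g. an av_64 operand whose only definition is a COPY from an SGPR pair
    // is constrained to VReg_64 here, avoiding an unnecessarily large AGPR
    // tuple and the extra copies it would require.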
12117 if (!MI.getDesc().operands().empty()) {
12118 unsigned Opc = MI.getOpcode();
12119 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
12120 for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
12121 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
12122 if (I == -1)
12123 break;
12124 MachineOperand &Op = MI.getOperand(I);
12125 if (!Op.isReg() || !Op.getReg().isVirtual())
12126 continue;
12127 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
12128 if (!TRI->hasAGPRs(RC))
12129 continue;
12130 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
12131 if (!Src || !Src->isCopy() ||
12132 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
12133 continue;
12134 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
12135 // All uses of agpr64 and agpr32 can also accept vgpr except for
12136 // v_accvgpr_read, but we do not produce agpr reads during selection,
12137 // so no use checks are needed.
12138 MRI.setRegClass(Op.getReg(), NewRC);
12139 }
12140
12141 // Resolve the rest of AV operands to AGPRs.
12142 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
12143 if (Src2->isReg() && Src2->getReg().isVirtual()) {
12144 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
12145 if (TRI->isVectorSuperClass(RC)) {
12146 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
12147 MRI.setRegClass(Src2->getReg(), NewRC);
12148 if (Src2->isTied())
12149 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
12150 }
12151 }
12152 }
12153 }
12154
12155 return;
12156 }
12157
12158 if (TII->isMIMG(MI)) {
12159 if (!MI.mayStore())
12160 AddIMGInit(MI);
12161 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
12162 }
12163 }
12164
12165 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
12166 uint64_t Val) {
12167 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
12168 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
12169 }
12170
12171 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
12172 const SDLoc &DL,
12173 SDValue Ptr) const {
12174 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12175
12176 // Build the half of the subregister with the constants before building the
12177 // full 128-bit register. If we are building multiple resource descriptors,
12178 // this will allow CSEing of the 2-component register.
12179 const SDValue Ops0[] = {
12180 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
12181 buildSMovImm32(DAG, DL, 0),
12182 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
12183 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
12184 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
12185 };
12186
12187 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
12188 MVT::v2i32, Ops0), 0);
12189
12190 // Combine the constants and the pointer.
12191 const SDValue Ops1[] = {
12192 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
12193 Ptr,
12194 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
12195 SubRegHi,
12196 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
12197 };
12198
12199 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
12200 }
12201
12202 /// Return a resource descriptor with the 'Add TID' bit enabled
12203 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
12204 /// of the resource descriptor) to create an offset, which is added to
12205 /// the resource pointer.
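/// Layout sketch (added commentary): the resulting REG_SEQUENCE holds
///   sub0 = pointer bits [31:0], sub1 = pointer bits [63:32] | RsrcDword1,
///   sub2 = RsrcDword2And3 & 0xffffffff, sub3 = RsrcDword2And3 >> 32.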
12206 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
12207 SDValue Ptr, uint32_t RsrcDword1,
12208 uint64_t RsrcDword2And3) const {
12209 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
12210 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
12211 if (RsrcDword1) {
12212 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
12213 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
12214 0);
12215 }
12216
12217 SDValue DataLo = buildSMovImm32(DAG, DL,
12218 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
12219 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
12220
12221 const SDValue Ops[] = {
12222 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
12223 PtrLo,
12224 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
12225 PtrHi,
12226 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
12227 DataLo,
12228 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
12229 DataHi,
12230 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
12231 };
12232
12233 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
12234 }
12235
12236 //===----------------------------------------------------------------------===//
12237 // SI Inline Assembly Support
12238 //===----------------------------------------------------------------------===//
12239
12240 std::pair<unsigned, const TargetRegisterClass *>
12241 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
12242 StringRef Constraint,
12243 MVT VT) const {
12244 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
12245
12246 const TargetRegisterClass *RC = nullptr;
12247 if (Constraint.size() == 1) {
12248 const unsigned BitWidth = VT.getSizeInBits();
12249 switch (Constraint[0]) {
12250 default:
12251 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12252 case 's':
12253 case 'r':
12254 switch (BitWidth) {
12255 case 16:
12256 RC = &AMDGPU::SReg_32RegClass;
12257 break;
12258 case 64:
12259 RC = &AMDGPU::SGPR_64RegClass;
12260 break;
12261 default:
12262 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
12263 if (!RC)
12264 return std::pair(0U, nullptr);
12265 break;
12266 }
12267 break;
12268 case 'v':
12269 switch (BitWidth) {
12270 case 16:
12271 RC = &AMDGPU::VGPR_32RegClass;
12272 break;
12273 default:
12274 RC = TRI->getVGPRClassForBitWidth(BitWidth);
12275 if (!RC)
12276 return std::pair(0U, nullptr);
12277 break;
12278 }
12279 break;
12280 case 'a':
12281 if (!Subtarget->hasMAIInsts())
12282 break;
12283 switch (BitWidth) {
12284 case 16:
12285 RC = &AMDGPU::AGPR_32RegClass;
12286 break;
12287 default:
12288 RC = TRI->getAGPRClassForBitWidth(BitWidth);
12289 if (!RC)
12290 return std::pair(0U, nullptr);
12291 break;
12292 }
12293 break;
12294 }
12295 // We actually support i128, i16 and f16 as inline parameters
12296 // even if they are not reported as legal
12297 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
12298 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
12299 return std::pair(0U, RC);
12300 }
12301
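  // Example (hypothetical constraint string): "{v[8:11]}" parses to Idx = 8,
  // End = 11, Width = 128 and resolves to the VReg_128 tuple whose sub0 is
  // VGPR8.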
12302 if (Constraint.startswith("{") && Constraint.endswith("}")) {
12303 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
12304 if (RegName.consume_front("v")) {
12305 RC = &AMDGPU::VGPR_32RegClass;
12306 } else if (RegName.consume_front("s")) {
12307 RC = &AMDGPU::SGPR_32RegClass;
12308 } else if (RegName.consume_front("a")) {
12309 RC = &AMDGPU::AGPR_32RegClass;
12310 }
12311
12312 if (RC) {
12313 uint32_t Idx;
12314 if (RegName.consume_front("[")) {
12315 uint32_t End;
12316 bool Failed = RegName.consumeInteger(10, Idx);
12317 Failed |= !RegName.consume_front(":");
12318 Failed |= RegName.consumeInteger(10, End);
12319 Failed |= !RegName.consume_back("]");
12320 if (!Failed) {
12321 uint32_t Width = (End - Idx + 1) * 32;
12322 MCRegister Reg = RC->getRegister(Idx);
12323 if (SIRegisterInfo::isVGPRClass(RC))
12324 RC = TRI->getVGPRClassForBitWidth(Width);
12325 else if (SIRegisterInfo::isSGPRClass(RC))
12326 RC = TRI->getSGPRClassForBitWidth(Width);
12327 else if (SIRegisterInfo::isAGPRClass(RC))
12328 RC = TRI->getAGPRClassForBitWidth(Width);
12329 if (RC) {
12330 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
12331 return std::pair(Reg, RC);
12332 }
12333 }
12334 } else {
12335 bool Failed = RegName.getAsInteger(10, Idx);
12336 if (!Failed && Idx < RC->getNumRegs())
12337 return std::pair(RC->getRegister(Idx), RC);
12338 }
12339 }
12340 }
12341
12342 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12343 if (Ret.first)
12344 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
12345
12346 return Ret;
12347 }
12348
12349 static bool isImmConstraint(StringRef Constraint) {
12350 if (Constraint.size() == 1) {
12351 switch (Constraint[0]) {
12352 default: break;
12353 case 'I':
12354 case 'J':
12355 case 'A':
12356 case 'B':
12357 case 'C':
12358 return true;
12359 }
12360 } else if (Constraint == "DA" ||
12361 Constraint == "DB") {
12362 return true;
12363 }
12364 return false;
12365 }
12366
12367 SITargetLowering::ConstraintType
12368 SITargetLowering::getConstraintType(StringRef Constraint) const {
12369 if (Constraint.size() == 1) {
12370 switch (Constraint[0]) {
12371 default: break;
12372 case 's':
12373 case 'v':
12374 case 'a':
12375 return C_RegisterClass;
12376 }
12377 }
12378 if (isImmConstraint(Constraint)) {
12379 return C_Other;
12380 }
12381 return TargetLowering::getConstraintType(Constraint);
12382 }
12383
12384 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
12385 if (!AMDGPU::isInlinableIntLiteral(Val)) {
12386 Val = Val & maskTrailingOnes<uint64_t>(Size);
12387 }
12388 return Val;
12389 }
12390
12391 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
12392 std::string &Constraint,
12393 std::vector<SDValue> &Ops,
12394 SelectionDAG &DAG) const {
12395 if (isImmConstraint(Constraint)) {
12396 uint64_t Val;
12397 if (getAsmOperandConstVal(Op, Val) &&
12398 checkAsmConstraintVal(Op, Constraint, Val)) {
12399 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
12400 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
12401 }
12402 } else {
12403 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12404 }
12405 }
12406
12407 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
12408 unsigned Size = Op.getScalarValueSizeInBits();
12409 if (Size > 64)
12410 return false;
12411
12412 if (Size == 16 && !Subtarget->has16BitInsts())
12413 return false;
12414
12415 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
12416 Val = C->getSExtValue();
12417 return true;
12418 }
12419 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
12420 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
12421 return true;
12422 }
12423 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
12424 if (Size != 16 || Op.getNumOperands() != 2)
12425 return false;
12426 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
12427 return false;
12428 if (ConstantSDNode *C = V->getConstantSplatNode()) {
12429 Val = C->getSExtValue();
12430 return true;
12431 }
12432 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
12433 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
12434 return true;
12435 }
12436 }
12437
12438 return false;
12439 }
12440
12441 bool SITargetLowering::checkAsmConstraintVal(SDValue Op,
12442 const std::string &Constraint,
12443 uint64_t Val) const {
12444 if (Constraint.size() == 1) {
12445 switch (Constraint[0]) {
12446 case 'I':
12447 return AMDGPU::isInlinableIntLiteral(Val);
12448 case 'J':
12449 return isInt<16>(Val);
12450 case 'A':
12451 return checkAsmConstraintValA(Op, Val);
12452 case 'B':
12453 return isInt<32>(Val);
12454 case 'C':
12455 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
12456 AMDGPU::isInlinableIntLiteral(Val);
12457 default:
12458 break;
12459 }
12460 } else if (Constraint.size() == 2) {
12461 if (Constraint == "DA") {
12462 int64_t HiBits = static_cast<int32_t>(Val >> 32);
12463 int64_t LoBits = static_cast<int32_t>(Val);
12464 return checkAsmConstraintValA(Op, HiBits, 32) &&
12465 checkAsmConstraintValA(Op, LoBits, 32);
12466 }
12467 if (Constraint == "DB") {
12468 return true;
12469 }
12470 }
12471 llvm_unreachable("Invalid asm constraint");
12472 }
12473
12474 bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
12475 uint64_t Val,
12476 unsigned MaxSize) const {
12477 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
12478 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
12479 if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
12480 (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
12481 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
12482 return true;
12483 }
12484 return false;
12485 }
12486
12487 static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
12488 switch (UnalignedClassID) {
12489 case AMDGPU::VReg_64RegClassID:
12490 return AMDGPU::VReg_64_Align2RegClassID;
12491 case AMDGPU::VReg_96RegClassID:
12492 return AMDGPU::VReg_96_Align2RegClassID;
12493 case AMDGPU::VReg_128RegClassID:
12494 return AMDGPU::VReg_128_Align2RegClassID;
12495 case AMDGPU::VReg_160RegClassID:
12496 return AMDGPU::VReg_160_Align2RegClassID;
12497 case AMDGPU::VReg_192RegClassID:
12498 return AMDGPU::VReg_192_Align2RegClassID;
12499 case AMDGPU::VReg_224RegClassID:
12500 return AMDGPU::VReg_224_Align2RegClassID;
12501 case AMDGPU::VReg_256RegClassID:
12502 return AMDGPU::VReg_256_Align2RegClassID;
12503 case AMDGPU::VReg_288RegClassID:
12504 return AMDGPU::VReg_288_Align2RegClassID;
12505 case AMDGPU::VReg_320RegClassID:
12506 return AMDGPU::VReg_320_Align2RegClassID;
12507 case AMDGPU::VReg_352RegClassID:
12508 return AMDGPU::VReg_352_Align2RegClassID;
12509 case AMDGPU::VReg_384RegClassID:
12510 return AMDGPU::VReg_384_Align2RegClassID;
12511 case AMDGPU::VReg_512RegClassID:
12512 return AMDGPU::VReg_512_Align2RegClassID;
12513 case AMDGPU::VReg_1024RegClassID:
12514 return AMDGPU::VReg_1024_Align2RegClassID;
12515 case AMDGPU::AReg_64RegClassID:
12516 return AMDGPU::AReg_64_Align2RegClassID;
12517 case AMDGPU::AReg_96RegClassID:
12518 return AMDGPU::AReg_96_Align2RegClassID;
12519 case AMDGPU::AReg_128RegClassID:
12520 return AMDGPU::AReg_128_Align2RegClassID;
12521 case AMDGPU::AReg_160RegClassID:
12522 return AMDGPU::AReg_160_Align2RegClassID;
12523 case AMDGPU::AReg_192RegClassID:
12524 return AMDGPU::AReg_192_Align2RegClassID;
12525 case AMDGPU::AReg_256RegClassID:
12526 return AMDGPU::AReg_256_Align2RegClassID;
12527 case AMDGPU::AReg_512RegClassID:
12528 return AMDGPU::AReg_512_Align2RegClassID;
12529 case AMDGPU::AReg_1024RegClassID:
12530 return AMDGPU::AReg_1024_Align2RegClassID;
12531 default:
12532 return -1;
12533 }
12534 }
12535
12536 // Figure out which registers should be reserved for stack access. Only after
12537 // the function is legalized do we know all of the non-spill stack objects or if
12538 // calls are present.
12539 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
12540 MachineRegisterInfo &MRI = MF.getRegInfo();
12541 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12542 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
12543 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
12544 const SIInstrInfo *TII = ST.getInstrInfo();
12545
12546 if (Info->isEntryFunction()) {
12547 // Callable functions have fixed registers used for stack access.
12548 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
12549 }
12550
12551 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
12552 Info->getStackPtrOffsetReg()));
12553 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
12554 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
12555
12556 // We need to worry about replacing the default register with itself in case
12557 // of MIR testcases missing the MFI.
12558 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
12559 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
12560
12561 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
12562 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
12563
12564 Info->limitOccupancy(MF);
12565
12566 if (ST.isWave32() && !MF.empty()) {
12567 for (auto &MBB : MF) {
12568 for (auto &MI : MBB) {
12569 TII->fixImplicitOperands(MI);
12570 }
12571 }
12572 }
12573
12574 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
12575 // classes if required. Ideally the register class constraints would differ
12576 // per-subtarget, but there's no easy way to achieve that right now. This is
12577 // not a problem for VGPRs because the correctly aligned VGPR class is implied
12578 // from using them as the register class for legal types.
12579 if (ST.needsAlignedVGPRs()) {
12580 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
12581 const Register Reg = Register::index2VirtReg(I);
12582 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
12583 if (!RC)
12584 continue;
12585 int NewClassID = getAlignedAGPRClassID(RC->getID());
12586 if (NewClassID != -1)
12587 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
12588 }
12589 }
12590
12591 TargetLoweringBase::finalizeLowering(MF);
12592 }
12593
12594 void SITargetLowering::computeKnownBitsForFrameIndex(
12595 const int FI, KnownBits &Known, const MachineFunction &MF) const {
12596 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
12597
12598 // Set the high bits to zero based on the maximum allowed scratch size per
12599 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
12600 // calculation won't overflow, so assume the sign bit is never set.
12601 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
12602 }
12603
12604 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
12605 KnownBits &Known, unsigned Dim) {
12606 unsigned MaxValue =
12607 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
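  // E.g. a maximum workitem ID of 63 fits in 6 bits, so the upper 26 bits of
  // the i32 result are known to be zero.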
12608 Known.Zero.setHighBits(countLeadingZeros(MaxValue));
12609 }
12610
12611 void SITargetLowering::computeKnownBitsForTargetInstr(
12612 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
12613 const MachineRegisterInfo &MRI, unsigned Depth) const {
12614 const MachineInstr *MI = MRI.getVRegDef(R);
12615 switch (MI->getOpcode()) {
12616 case AMDGPU::G_INTRINSIC: {
12617 switch (MI->getIntrinsicID()) {
12618 case Intrinsic::amdgcn_workitem_id_x:
12619 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
12620 break;
12621 case Intrinsic::amdgcn_workitem_id_y:
12622 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
12623 break;
12624 case Intrinsic::amdgcn_workitem_id_z:
12625 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
12626 break;
12627 case Intrinsic::amdgcn_mbcnt_lo:
12628 case Intrinsic::amdgcn_mbcnt_hi: {
12629 // These return at most the wavefront size - 1.
12630 unsigned Size = MRI.getType(R).getSizeInBits();
12631 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
12632 break;
12633 }
12634 case Intrinsic::amdgcn_groupstaticsize: {
12635 // We can report everything over the maximum size as 0. We can't report
12636 // based on the actual size because we don't know if it's accurate or not
12637 // at any given point.
12638 Known.Zero.setHighBits(
12639 countLeadingZeros(getSubtarget()->getAddressableLocalMemorySize()));
12640 break;
12641 }
12642 }
12643 break;
12644 }
12645 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
12646 Known.Zero.setHighBits(24);
12647 break;
12648 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
12649 Known.Zero.setHighBits(16);
12650 break;
12651 }
12652 }
12653
12654 Align SITargetLowering::computeKnownAlignForTargetInstr(
12655 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
12656 unsigned Depth) const {
12657 const MachineInstr *MI = MRI.getVRegDef(R);
12658 switch (MI->getOpcode()) {
12659 case AMDGPU::G_INTRINSIC:
12660 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
12661 // FIXME: Can this move to generic code? What about the case where the call
12662 // site specifies a lower alignment?
12663 Intrinsic::ID IID = MI->getIntrinsicID();
12664 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
12665 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
12666 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
12667 return *RetAlign;
12668 return Align(1);
12669 }
12670 default:
12671 return Align(1);
12672 }
12673 }
12674
12675 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
12676 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
12677 const Align CacheLineAlign = Align(64);
12678
12679 // Pre-GFX10 targets did not benefit from loop alignment
12680 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
12681 getSubtarget()->hasInstFwdPrefetchBug())
12682 return PrefAlign;
12683
12684 // On GFX10 the instruction cache (I$) has 4 x 64 byte cache lines.
12685 // By default the prefetcher keeps one cache line behind and reads two ahead.
12686 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
12687 // behind and one ahead.
12688 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
12689 // If the loop fits in 64 bytes it always spans no more than two cache lines and
12690 // does not need an alignment.
12691 // If the loop is at most 128 bytes we do not need to modify the prefetch settings.
12692 // If the loop is at most 192 bytes we need to keep two lines behind the PC.
12693
12694 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12695 const MachineBasicBlock *Header = ML->getHeader();
12696 if (Header->getAlignment() != PrefAlign)
12697 return Header->getAlignment(); // Already processed.
12698
12699 unsigned LoopSize = 0;
12700 for (const MachineBasicBlock *MBB : ML->blocks()) {
12701 // If inner loop block is aligned assume in average half of the alignment
12702 // size to be added as nops.
12703 if (MBB != Header)
12704 LoopSize += MBB->getAlignment().value() / 2;
12705
12706 for (const MachineInstr &MI : *MBB) {
12707 LoopSize += TII->getInstSizeInBytes(MI);
12708 if (LoopSize > 192)
12709 return PrefAlign;
12710 }
12711 }
12712
12713 if (LoopSize <= 64)
12714 return PrefAlign;
12715
12716 if (LoopSize <= 128)
12717 return CacheLineAlign;
12718
12719 // If any of parent loops is surrounded by prefetch instructions do not
12720 // insert new for inner loop, which would reset parent's settings.
12721 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
12722 if (MachineBasicBlock *Exit = P->getExitBlock()) {
12723 auto I = Exit->getFirstNonDebugInstr();
12724 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
12725 return CacheLineAlign;
12726 }
12727 }
12728
12729 MachineBasicBlock *Pre = ML->getLoopPreheader();
12730 MachineBasicBlock *Exit = ML->getExitBlock();
12731
12732 if (Pre && Exit) {
12733 auto PreTerm = Pre->getFirstTerminator();
12734 if (PreTerm == Pre->begin() ||
12735 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
12736 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
12737 .addImm(1); // prefetch 2 lines behind PC
12738
12739 auto ExitHead = Exit->getFirstNonDebugInstr();
12740 if (ExitHead == Exit->end() ||
12741 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
12742 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
12743 .addImm(2); // prefetch 1 line behind PC
12744 }
12745
12746 return CacheLineAlign;
12747 }
12748
12749 LLVM_ATTRIBUTE_UNUSED
12750 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
12751 assert(N->getOpcode() == ISD::CopyFromReg);
12752 do {
12753 // Follow the chain until we find an INLINEASM node.
12754 N = N->getOperand(0).getNode();
12755 if (N->getOpcode() == ISD::INLINEASM ||
12756 N->getOpcode() == ISD::INLINEASM_BR)
12757 return true;
12758 } while (N->getOpcode() == ISD::CopyFromReg);
12759 return false;
12760 }
12761
12762 bool SITargetLowering::isSDNodeSourceOfDivergence(
12763 const SDNode *N, FunctionLoweringInfo *FLI,
12764 LegacyDivergenceAnalysis *KDA) const {
12765 switch (N->getOpcode()) {
12766 case ISD::CopyFromReg: {
12767 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
12768 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
12769 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
12770 Register Reg = R->getReg();
12771
12772 // FIXME: Why does this need to consider isLiveIn?
12773 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
12774 return !TRI->isSGPRReg(MRI, Reg);
12775
12776 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
12777 return KDA->isDivergent(V);
12778
12779 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
12780 return !TRI->isSGPRReg(MRI, Reg);
12781 }
12782 case ISD::LOAD: {
12783 const LoadSDNode *L = cast<LoadSDNode>(N);
12784 unsigned AS = L->getAddressSpace();
12785 // A flat load may access private memory.
12786 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
12787 }
12788 case ISD::CALLSEQ_END:
12789 return true;
12790 case ISD::INTRINSIC_WO_CHAIN:
12791 return AMDGPU::isIntrinsicSourceOfDivergence(
12792 cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
12793 case ISD::INTRINSIC_W_CHAIN:
12794 return AMDGPU::isIntrinsicSourceOfDivergence(
12795 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
12796 case AMDGPUISD::ATOMIC_CMP_SWAP:
12797 case AMDGPUISD::ATOMIC_INC:
12798 case AMDGPUISD::ATOMIC_DEC:
12799 case AMDGPUISD::ATOMIC_LOAD_FMIN:
12800 case AMDGPUISD::ATOMIC_LOAD_FMAX:
12801 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
12802 case AMDGPUISD::BUFFER_ATOMIC_ADD:
12803 case AMDGPUISD::BUFFER_ATOMIC_SUB:
12804 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
12805 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
12806 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
12807 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
12808 case AMDGPUISD::BUFFER_ATOMIC_AND:
12809 case AMDGPUISD::BUFFER_ATOMIC_OR:
12810 case AMDGPUISD::BUFFER_ATOMIC_XOR:
12811 case AMDGPUISD::BUFFER_ATOMIC_INC:
12812 case AMDGPUISD::BUFFER_ATOMIC_DEC:
12813 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
12814 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
12815 case AMDGPUISD::BUFFER_ATOMIC_FADD:
12816 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
12817 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
12818 // Target-specific read-modify-write atomics are sources of divergence.
12819 return true;
12820 default:
12821 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
12822 // Generic read-modify-write atomics are sources of divergence.
12823 return A->readMem() && A->writeMem();
12824 }
12825 return false;
12826 }
12827 }
12828
12829 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
12830 EVT VT) const {
12831 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
12832 case MVT::f32:
12833 return hasFP32Denormals(DAG.getMachineFunction());
12834 case MVT::f64:
12835 case MVT::f16:
12836 return hasFP64FP16Denormals(DAG.getMachineFunction());
12837 default:
12838 return false;
12839 }
12840 }
12841
12842 bool SITargetLowering::denormalsEnabledForType(LLT Ty,
12843 MachineFunction &MF) const {
12844 switch (Ty.getScalarSizeInBits()) {
12845 case 32:
12846 return hasFP32Denormals(MF);
12847 case 64:
12848 case 16:
12849 return hasFP64FP16Denormals(MF);
12850 default:
12851 return false;
12852 }
12853 }
12854
12855 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
12856 const SelectionDAG &DAG,
12857 bool SNaN,
12858 unsigned Depth) const {
12859 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
12860 const MachineFunction &MF = DAG.getMachineFunction();
12861 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12862
12863 if (Info->getMode().DX10Clamp)
12864 return true; // Clamped to 0.
12865 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
12866 }
12867
12868 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
12869 SNaN, Depth);
12870 }
12871
12872 // Global FP atomic instructions have a hardcoded FP mode and do not support
12873 // FP32 denormals, and only support v2f16 denormals.
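// For instance (added commentary, assuming the standard denormal attributes):
// a function compiled with "denormal-fp-math-f32"="preserve-sign" and
// "denormal-fp-math"="ieee" matches this hardware mode for both f32 and
// f64/v2f16 atomics.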
12874 static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
12875 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
12876 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
12877 if (&Flt == &APFloat::IEEEsingle())
12878 return DenormMode == DenormalMode::getPreserveSign();
12879 return DenormMode == DenormalMode::getIEEE();
12880 }
12881
12882 // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
12883 // floating point atomic instructions. May generate more efficient code,
12884 // but may not respect rounding and denormal modes, and may give incorrect
12885 // results for certain memory destinations.
12886 bool unsafeFPAtomicsDisabled(Function *F) {
12887 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
12888 "true";
12889 }
12890
12891 TargetLowering::AtomicExpansionKind
12892 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
12893 unsigned AS = RMW->getPointerAddressSpace();
12894 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
12895 return AtomicExpansionKind::NotAtomic;
12896
12897 auto SSID = RMW->getSyncScopeID();
12898
12899 auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
12900 OptimizationRemarkEmitter ORE(RMW->getFunction());
12901 LLVMContext &Ctx = RMW->getFunction()->getContext();
12902 SmallVector<StringRef> SSNs;
12903 Ctx.getSyncScopeNames(SSNs);
12904 auto MemScope = SSNs[RMW->getSyncScopeID()].empty()
12905 ? "system"
12906 : SSNs[RMW->getSyncScopeID()];
12907 ORE.emit([&]() {
12908 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
12909 << "Hardware instruction generated for atomic "
12910 << RMW->getOperationName(RMW->getOperation())
12911 << " operation at memory scope " << MemScope
12912 << " due to an unsafe request.";
12913 });
12914 return Kind;
12915 };
12916
12917 bool HasSystemScope =
12918 SSID == SyncScope::System ||
12919 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
12920
12921 switch (RMW->getOperation()) {
12922 case AtomicRMWInst::FAdd: {
12923 Type *Ty = RMW->getType();
12924
12925 if (Ty->isHalfTy())
12926 return AtomicExpansionKind::CmpXChg;
12927
12928 if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
12929 return AtomicExpansionKind::CmpXChg;
12930
12931 if (AMDGPU::isFlatGlobalAddrSpace(AS) &&
12932 Subtarget->hasAtomicFaddNoRtnInsts()) {
12933 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
12934 return AtomicExpansionKind::CmpXChg;
12935
12936 // Always expand system scope fp atomics.
12937 if (HasSystemScope)
12938 return AtomicExpansionKind::CmpXChg;
12939
12940 if (AS == AMDGPUAS::GLOBAL_ADDRESS && Ty->isFloatTy()) {
12941 // global atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
12942 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
12943 return ReportUnsafeHWInst(AtomicExpansionKind::None);
12944 // global atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
12945 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
12946 return ReportUnsafeHWInst(AtomicExpansionKind::None);
12947 }
12948
12949 // flat atomic fadd f32: gfx940, gfx11+.
12950 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
12951 Subtarget->hasFlatAtomicFaddF32Inst())
12952 return ReportUnsafeHWInst(AtomicExpansionKind::None);
12953
12954 // global and flat atomic fadd f64: gfx90a, gfx940.
12955 if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
12956 return ReportUnsafeHWInst(AtomicExpansionKind::None);
12957
12958 // If it is in flat address space, and the type is float, we will try to
12959 // expand it, if the target supports global and lds atomic fadd. The
12960 // reason we need that is, in the expansion, we emit the check of address
12961 // space. If it is in global address space, we emit the global atomic
12962 // fadd; if it is in shared address space, we emit the LDS atomic fadd.
12963 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
12964 Subtarget->hasLDSFPAtomicAdd()) {
12965 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
12966 return AtomicExpansionKind::Expand;
12967 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
12968 return AtomicExpansionKind::Expand;
12969 }
12970
12971 return AtomicExpansionKind::CmpXChg;
12972 }
12973
12974 // DS FP atomics do respect the denormal mode, but the rounding mode is
12975 // fixed to round-to-nearest-even.
12976 // The only exception is DS_ADD_F64 which never flushes regardless of mode.
12977 if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
12978 if (!Ty->isDoubleTy())
12979 return AtomicExpansionKind::None;
12980
12981 if (fpModeMatchesGlobalFPAtomicMode(RMW))
12982 return AtomicExpansionKind::None;
12983
12984 return RMW->getFunction()
12985 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
12986 .getValueAsString() == "true"
12987 ? ReportUnsafeHWInst(AtomicExpansionKind::None)
12988 : AtomicExpansionKind::CmpXChg;
12989 }
12990
12991 return AtomicExpansionKind::CmpXChg;
12992 }
12993 case AtomicRMWInst::FMin:
12994 case AtomicRMWInst::FMax:
12995 case AtomicRMWInst::Min:
12996 case AtomicRMWInst::Max:
12997 case AtomicRMWInst::UMin:
12998 case AtomicRMWInst::UMax: {
12999 if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
13000 if (RMW->getType()->isFloatTy() &&
13001 unsafeFPAtomicsDisabled(RMW->getFunction()))
13002 return AtomicExpansionKind::CmpXChg;
13003
13004 // Always expand system scope min/max atomics.
13005 if (HasSystemScope)
13006 return AtomicExpansionKind::CmpXChg;
13007 }
13008 break;
13009 }
13010 default:
13011 break;
13012 }
13013
13014 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
13015 }
13016
13017 TargetLowering::AtomicExpansionKind
13018 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
13019 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
13020 ? AtomicExpansionKind::NotAtomic
13021 : AtomicExpansionKind::None;
13022 }
13023
13024 TargetLowering::AtomicExpansionKind
13025 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
13026 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
13027 ? AtomicExpansionKind::NotAtomic
13028 : AtomicExpansionKind::None;
13029 }
13030
13031 TargetLowering::AtomicExpansionKind
13032 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
13033 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
13034 ? AtomicExpansionKind::NotAtomic
13035 : AtomicExpansionKind::None;
13036 }
13037
13038 const TargetRegisterClass *
13039 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
13040 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
13041 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
13042 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
13043 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
13044 : &AMDGPU::SReg_32RegClass;
13045 if (!TRI->isSGPRClass(RC) && !isDivergent)
13046 return TRI->getEquivalentSGPRClass(RC);
13047 else if (TRI->isSGPRClass(RC) && isDivergent)
13048 return TRI->getEquivalentVGPRClass(RC);
13049
13050 return RC;
13051 }
13052
13053 // FIXME: This is a workaround for DivergenceAnalysis not understanding always
13054 // uniform values (as produced by the mask results of control flow intrinsics)
13055 // used outside of divergent blocks. The phi users need to also be treated as
13056 // always uniform.
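// Illustrative case (added): on wave64 the i64 mask produced by
// llvm.amdgcn.if and later consumed by llvm.amdgcn.end_cf must stay in an
// SGPR pair, even when it flows through a phi outside the divergent region.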
13057 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
13058 unsigned WaveSize) {
13059 // FIXME: We assume we never cast the mask results of a control flow
13060 // intrinsic.
13061 // Early exit if the type won't be consistent as a compile time hack.
13062 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
13063 if (!IT || IT->getBitWidth() != WaveSize)
13064 return false;
13065
13066 if (!isa<Instruction>(V))
13067 return false;
13068 if (!Visited.insert(V).second)
13069 return false;
13070 bool Result = false;
13071 for (const auto *U : V->users()) {
13072 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
13073 if (V == U->getOperand(1)) {
13074 switch (Intrinsic->getIntrinsicID()) {
13075 default:
13076 Result = false;
13077 break;
13078 case Intrinsic::amdgcn_if_break:
13079 case Intrinsic::amdgcn_if:
13080 case Intrinsic::amdgcn_else:
13081 Result = true;
13082 break;
13083 }
13084 }
13085 if (V == U->getOperand(0)) {
13086 switch (Intrinsic->getIntrinsicID()) {
13087 default:
13088 Result = false;
13089 break;
13090 case Intrinsic::amdgcn_end_cf:
13091 case Intrinsic::amdgcn_loop:
13092 Result = true;
13093 break;
13094 }
13095 }
13096 } else {
13097 Result = hasCFUser(U, Visited, WaveSize);
13098 }
13099 if (Result)
13100 break;
13101 }
13102 return Result;
13103 }
13104
13105 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
13106 const Value *V) const {
13107 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
13108 if (CI->isInlineAsm()) {
13109 // FIXME: This cannot give a correct answer. This should only trigger in
13110 // the case where inline asm returns mixed SGPR and VGPR results, used
13111 // outside the defining block. We don't have a specific result to
13112 // consider, so this assumes if any value is SGPR, the overall register
13113 // also needs to be SGPR.
13114 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
13115 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
13116 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
13117 for (auto &TC : TargetConstraints) {
13118 if (TC.Type == InlineAsm::isOutput) {
13119 ComputeConstraintToUse(TC, SDValue());
13120 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
13121 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
13122 if (RC && SIRI->isSGPRClass(RC))
13123 return true;
13124 }
13125 }
13126 }
13127 }
13128 SmallPtrSet<const Value *, 16> Visited;
13129 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
13130 }
13131
13132 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
13133 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
13134 for (; I != E; ++I) {
13135 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
13136 if (getBasePtrIndex(M) == I.getOperandNo())
13137 return true;
13138 }
13139 }
13140 return false;
13141 }
13142
13143 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
13144 SDValue N1) const {
13145 if (!N0.hasOneUse())
13146 return false;
13147 // Take care of the opportunity to keep N0 uniform
13148 if (N0->isDivergent() || !N1->isDivergent())
13149 return true;
13150 // Check if we have a good chance to form the memory access pattern with the
13151 // base and offset
13152 return (DAG.isBaseWithConstantOffset(N0) &&
13153 hasMemSDNodeUser(*N0->use_begin()));
13154 }
13155
13156 MachineMemOperand::Flags
13157 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
13158 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
13159 if (I.getMetadata("amdgpu.noclobber"))
13160 return MONoClobber;
13161 return MachineMemOperand::MONone;
13162 }
13163
13164 bool SITargetLowering::checkForPhysRegDependency(
13165 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
13166 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
13167 if (User->getOpcode() != ISD::CopyToReg)
13168 return false;
13169 if (!Def->isMachineOpcode())
13170 return false;
13171 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
13172 if (!MDef)
13173 return false;
13174
13175 unsigned ResNo = User->getOperand(Op).getResNo();
13176 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
13177 return false;
13178 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
13179 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
13180 PhysReg = AMDGPU::SCC;
13181 const TargetRegisterClass *RC =
13182 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
13183 Cost = RC->getCopyCost();
13184 return true;
13185 }
13186 return false;
13187 }
13188
13189 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  assert(Subtarget->hasAtomicFaddInsts() &&
         "target should have atomic fadd instructions");
  assert(AI->getType()->isFloatTy() &&
         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
         "generic atomicrmw expansion only supports FP32 operand in flat "
         "address space");
  assert(AI->getOperation() == AtomicRMWInst::FAdd &&
         "only fadd is supported for now");

  // Given: atomicrmw fadd float* %addr, float %val ordering
  //
  // With this expansion we produce the following code:
  //   [...]
  //   %int8ptr = bitcast float* %addr to i8*
  //   br label %atomicrmw.check.shared
  //
  // atomicrmw.check.shared:
  //   %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %int8ptr)
  //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
  //
  // atomicrmw.shared:
  //   %cast.shared = addrspacecast float* %addr to float addrspace(3)*
  //   %loaded.shared = atomicrmw fadd float addrspace(3)* %cast.shared,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.check.private:
  //   %is.private = call i1 @llvm.amdgcn.is.private(i8* %int8ptr)
  //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
  //
  // atomicrmw.private:
  //   %cast.private = addrspacecast float* %addr to float addrspace(5)*
  //   %loaded.private = load float, float addrspace(5)* %cast.private
  //   %val.new = fadd float %loaded.private, %val
  //   store float %val.new, float addrspace(5)* %cast.private
  //   br label %atomicrmw.phi
  //
  // atomicrmw.global:
  //   %cast.global = addrspacecast float* %addr to float addrspace(1)*
  //   %loaded.global = atomicrmw fadd float addrspace(1)* %cast.global,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.phi:
  //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
  //                           [ %loaded.private, %atomicrmw.private ],
  //                           [ %loaded.global, %atomicrmw.global ]
  //   br label %atomicrmw.end
  //
  // atomicrmw.end:
  //   [...]

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();

  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *CheckSharedBB =
      BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
  BasicBlock *CheckPrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
  BasicBlock *PrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
  BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);

  Value *Val = AI->getValOperand();
  Type *ValTy = Val->getType();
  Value *Addr = AI->getPointerOperand();
  PointerType *PtrTy = cast<PointerType>(Addr->getType());

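  // Helper to re-emit the original atomicrmw on an address-space-cast
  // pointer, preserving the operation, alignment, ordering, syncscope, and
  // all metadata.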
  auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
                                 Value *Val) -> Value * {
    AtomicRMWInst *OldVal =
        Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
                                AI->getOrdering(), AI->getSyncScopeID());
    SmallVector<std::pair<unsigned, MDNode *>> MDs;
    AI->getAllMetadata(MDs);
    for (auto &P : MDs)
      OldVal->setMetadata(P.first, P.second);
    return OldVal;
  };

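  // Erase the unconditional branch to ExitBB that splitBasicBlock created so
  // the original block can branch to the address-space checks instead.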
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Value *Int8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
  Builder.CreateBr(CheckSharedBB);

  Builder.SetInsertPoint(CheckSharedBB);
  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
                                               {Int8Ptr}, nullptr, "is.shared");
  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);

  Builder.SetInsertPoint(SharedBB);
  Value *CastToLocal = Builder.CreateAddrSpaceCast(
      Addr,
      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::LOCAL_ADDRESS));
  Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(CheckPrivateBB);
  CallInst *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Int8Ptr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);

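  // Private (scratch) memory is only visible to the current lane, so a plain
  // load/fadd/store sequence is sufficient; no atomic instruction is needed.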
  Builder.SetInsertPoint(PrivateBB);
  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
      Addr,
      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::PRIVATE_ADDRESS));
  Value *LoadedPrivate =
      Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
  Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
  Builder.CreateStore(NewVal, CastToPrivate);
  Builder.CreateBr(PhiBB);

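  // Anything that is neither LDS nor private is treated as a global access
  // and stays an atomicrmw, now on an addrspace(1) pointer.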
  Builder.SetInsertPoint(GlobalBB);
  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
      Addr,
      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::GLOBAL_ADDRESS));
  Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(PhiBB);
  PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
  Loaded->addIncoming(LoadedShared, SharedBB);
  Loaded->addIncoming(LoadedPrivate, PrivateBB);
  Loaded->addIncoming(LoadedGlobal, GlobalBB);
  Builder.CreateBr(ExitBB);

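  // The PHI of the three per-address-space results replaces the original
  // flat atomicrmw.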
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
}
