//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
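// For example, a v4i16 store (64 bits) maps to v2i32, and a v2i8 store
// (16 bits) maps to i16.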
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
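  // E.g. with the Promote action plus AddPromotedToType below, an f32 load is
  // performed as an i32 load with the result bitcast back to f32 (a sketch of
  // the legalizer's behavior, not the exact DAG it emits).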
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
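  // Roughly, (i64 zextload [p], i32) becomes (zext (i32 load [p])); the
  // sextload and anyextload cases expand the same way.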
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
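  // E.g. FCEIL on f32 selects to a single native instruction (v_ceil_f32 on
  // GCN) instead of a libcall.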
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  } else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // FIXME: Why is v8f16/v8bf16 missing?
  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
       MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
       MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
       MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
       MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = {MVT::i32, MVT::i64};
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand.
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE.
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
                        ISD::MULHS, ISD::OR, ISD::SHL,
                        ISD::SRA, ISD::SRL, ISD::ROTL,
                        ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
                        ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR, ISD::BSWAP, ISD::CTPOP,
                        ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
         ISD::FADD, ISD::FCEIL, ISD::FCOS,
         ISD::FDIV, ISD::FEXP2, ISD::FEXP,
         ISD::FEXP10, ISD::FLOG2, ISD::FREM,
         ISD::FLOG, ISD::FLOG10, ISD::FPOW,
         ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
         ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
         ISD::FSQRT, ISD::FSIN, ISD::FSUB,
         ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
         ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes us to use an unrolled select operation rather than expansion
  // with bit operations. This is in general better, but the alternative using
  // BFI instructions may be better if the select sources are SGPRs.
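  // E.g. a select of v2f32 values is performed as a select of v2i32 with
  // bitcasts on the inputs and result (illustrative of the Promote actions
  // below).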
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  // Disable most libcalls.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
    if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
      setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
  }

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For
  // now, we don't have a way of knowing during instruction selection if a
  // condition will be uniform and we always use vector compares. Assume we are
  // using vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
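  // BypassSlowDivision inserts a runtime check along the lines of: if both
  // operands fit in 32 bits, do a 32-bit udiv and extend the result;
  // otherwise take the full 64-bit expansion. (A sketch of the transform,
  // not its literal output.)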
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
                       ISD::SRA, ISD::SRL,
                       ISD::TRUNCATE, ISD::MUL,
                       ISD::SMUL_LOHI, ISD::UMUL_LOHI,
                       ISD::MULHU, ISD::MULHS,
                       ISD::SELECT, ISD::SELECT_CC,
                       ISD::STORE, ISD::FADD,
                       ISD::FSUB, ISD::FNEG,
                       ISD::FABS, ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the type used by an ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users and using a source modifier would force each of them into
  // the VOP3 encoding, there will be a code size increase. Try to avoid
  // increasing code size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
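  // E.g. i8 and i24 return as i32, while i48 rounds up to i64.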
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32- and 64-bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in
  // continuing to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(
    EVT LoadTy, EVT CastTy, const SelectionDAG &DAG,
    const MachineMemOperand &MMO) const {
  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably
// also profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    switch (IntrID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
    return false;
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {
  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(
    EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any
  // vector operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into
  // a super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra "mov 0" to form a 64-bit value is free.
  // As used, this will enable reducing 64-bit operations to 32-bit, which is
  // always good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits
  // is not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().
///
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting
    // over to get accurate in-memory offsets. The "PartOffset" is completely
    // useless to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one-element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 arguments.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5 ||
               (MemVT.getVectorNumElements() >= 9 &&
                MemVT.getVectorNumElements() <= 12));
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
    SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  // assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  //        "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack-argument load that overlaps the
  // clobbered frame object.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(Fn, Reason + FuncName,
                                    CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}
1414
1415 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1416 SDValue Op,
1417 SelectionDAG &DAG) const {
1418
1419 const DataLayout &DL = DAG.getDataLayout();
1420 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1421 const GlobalValue *GV = G->getGlobal();
1422
1423 if (!MFI->isModuleEntryFunction()) {
1424 if (std::optional<uint32_t> Address =
1425 AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
1426 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1427 }
1428 }
1429
1430 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1431 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1432 if (!MFI->isModuleEntryFunction() &&
1433 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1434 SDLoc DL(Op);
1435 const Function &Fn = DAG.getMachineFunction().getFunction();
1436 DiagnosticInfoUnsupported BadLDSDecl(
1437 Fn, "local memory global used by non-kernel function",
1438 DL.getDebugLoc(), DS_Warning);
1439 DAG.getContext()->diagnose(BadLDSDecl);
1440
1441 // We currently don't have a way to correctly allocate LDS objects that
1442 // aren't directly associated with a kernel. We do force inlining of
1443 // functions that use local objects. However, if these dead functions are
1444 // not eliminated, we don't want a compile time error. Just emit a warning
1445 // and a trap, since there should be no callable path here.
1446 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1447 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1448 Trap, DAG.getRoot());
1449 DAG.setRoot(OutputChain);
1450 return DAG.getUNDEF(Op.getValueType());
1451 }
1452
1453 // XXX: What does the value of G->getOffset() mean?
1454 assert(G->getOffset() == 0 &&
1455 "Do not know what to do with an non-zero offset");
1456
1457 // TODO: We could emit code to handle the initialization somewhere.
1458 // We ignore the initializer for now and legalize it to allow selection.
1459 // The initializer will be diagnosed during assembly emission anyway.
1460 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1461 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1462 }
1463 return SDValue();
1464 }
1465
1466 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1467 SelectionDAG &DAG) const {
1468 SmallVector<SDValue, 8> Args;
1469 SDLoc SL(Op);
1470
1471 EVT VT = Op.getValueType();
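// Illustrative note (assumption, not from the original source): when the
// element type is narrower than 32 bits, the block below packs whole source
// operands into 32-bit lanes instead of scalarizing, e.g. for a hypothetical
// concat_vectors of two v2f16 values a and b:
//   a' = bitcast a to i32; b' = bitcast b to i32
//   bv = build_vector v2i32 (a', b'); result = bitcast bv to v4f16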
1472 if (VT.getVectorElementType().getSizeInBits() < 32) {
1473 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1474 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1475 unsigned NewNumElt = OpBitSize / 32;
1476 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1477 : EVT::getVectorVT(*DAG.getContext(),
1478 MVT::i32, NewNumElt);
1479 for (const SDUse &U : Op->ops()) {
1480 SDValue In = U.get();
1481 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1482 if (NewNumElt > 1)
1483 DAG.ExtractVectorElements(NewIn, Args);
1484 else
1485 Args.push_back(NewIn);
1486 }
1487
1488 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1489 NewNumElt * Op.getNumOperands());
1490 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1491 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1492 }
1493 }
1494
1495 for (const SDUse &U : Op->ops())
1496 DAG.ExtractVectorElements(U.get(), Args);
1497
1498 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1499 }
1500
1501 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1502 SelectionDAG &DAG) const {
1503 SDLoc SL(Op);
1504 SmallVector<SDValue, 8> Args;
1505 unsigned Start = Op.getConstantOperandVal(1);
1506 EVT VT = Op.getValueType();
1507 EVT SrcVT = Op.getOperand(0).getValueType();
1508
1509 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1510 unsigned NumElt = VT.getVectorNumElements();
1511 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1512 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1513
1514 // Extract 32-bit registers at a time.
1515 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1516 EVT NewVT = NumElt == 2
1517 ? MVT::i32
1518 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1519 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1520
1521 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1522 if (NumElt == 2)
1523 Tmp = Args[0];
1524 else
1525 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1526
1527 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1528 }
1529
1530 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1531 VT.getVectorNumElements());
1532
1533 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1534 }
1535
1536 // TODO: Handle fabs too
1537 static SDValue peekFNeg(SDValue Val) {
1538 if (Val.getOpcode() == ISD::FNEG)
1539 return Val.getOperand(0);
1540
1541 return Val;
1542 }
1543
1544 static SDValue peekFPSignOps(SDValue Val) {
1545 if (Val.getOpcode() == ISD::FNEG)
1546 Val = Val.getOperand(0);
1547 if (Val.getOpcode() == ISD::FABS)
1548 Val = Val.getOperand(0);
1549 if (Val.getOpcode() == ISD::FCOPYSIGN)
1550 Val = Val.getOperand(0);
1551 return Val;
1552 }
1553
1554 SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1555 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1556 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1557 SelectionDAG &DAG = DCI.DAG;
1558 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1559 switch (CCOpcode) {
1560 case ISD::SETOEQ:
1561 case ISD::SETONE:
1562 case ISD::SETUNE:
1563 case ISD::SETNE:
1564 case ISD::SETUEQ:
1565 case ISD::SETEQ:
1566 case ISD::SETFALSE:
1567 case ISD::SETFALSE2:
1568 case ISD::SETTRUE:
1569 case ISD::SETTRUE2:
1570 case ISD::SETUO:
1571 case ISD::SETO:
1572 break;
1573 case ISD::SETULE:
1574 case ISD::SETULT: {
1575 if (LHS == True)
1576 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1577 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1578 }
1579 case ISD::SETOLE:
1580 case ISD::SETOLT:
1581 case ISD::SETLE:
1582 case ISD::SETLT: {
1583 // Ordered. Assume ordered for undefined.
1584
1585 // Only do this after legalization to avoid interfering with other combines
1586 // which might occur.
1587 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1588 !DCI.isCalledByLegalizer())
1589 return SDValue();
1590
1591 // We need to permute the operands to get the correct NaN behavior. The
1592 // selected operand is the second one based on the failing compare with NaN,
1593 // so permute it based on the compare type the hardware uses.
1594 if (LHS == True)
1595 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1596 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1597 }
1598 case ISD::SETUGE:
1599 case ISD::SETUGT: {
1600 if (LHS == True)
1601 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1602 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1603 }
1604 case ISD::SETGT:
1605 case ISD::SETGE:
1606 case ISD::SETOGE:
1607 case ISD::SETOGT: {
1608 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1609 !DCI.isCalledByLegalizer())
1610 return SDValue();
1611
1612 if (LHS == True)
1613 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1614 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1615 }
1616 case ISD::SETCC_INVALID:
1617 llvm_unreachable("Invalid setcc condcode!");
1618 }
1619 return SDValue();
1620 }
1621
1622 /// Generate Min/Max node
1623 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1624 SDValue LHS, SDValue RHS,
1625 SDValue True, SDValue False,
1626 SDValue CC,
1627 DAGCombinerInfo &DCI) const {
1628 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1629 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1630
1631 SelectionDAG &DAG = DCI.DAG;
1632
1633 // If we can't directly match this, try to see if we can fold an fneg to
1634 // match.
1635
1636 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1637 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1638 SDValue NegTrue = peekFNeg(True);
1639
1640 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1641 // fmin/fmax.
1642 //
1643 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1644 // -> fneg (fmin_legacy lhs, K)
1645 //
1646 // TODO: Use getNegatedExpression
1647 if (LHS == NegTrue && CFalse && CRHS) {
1648 APFloat NegRHS = neg(CRHS->getValueAPF());
1649 if (NegRHS == CFalse->getValueAPF()) {
1650 SDValue Combined =
1651 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1652 if (Combined)
1653 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1654 return SDValue();
1655 }
1656 }
1657
1658 return SDValue();
1659 }
1660
1661 std::pair<SDValue, SDValue>
1662 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1663 SDLoc SL(Op);
1664
1665 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1666
1667 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1668 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1669
1670 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1671 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1672
1673 return std::pair(Lo, Hi);
1674 }
1675
1676 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1677 SDLoc SL(Op);
1678
1679 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1680 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1681 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1682 }
1683
1684 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1685 SDLoc SL(Op);
1686
1687 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1688 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1689 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1690 }
1691
1692 // Split a vector type into two parts. The first part is a power of two vector.
1693 // The second part is whatever is left over, and is a scalar if it would
1694 // otherwise be a 1-vector.
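// For example (derived from the formula below): v3i32 -> (v2i32, i32),
// v6i32 -> (v4i32, v2i32), v7i32 -> (v4i32, v3i32).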
1695 std::pair<EVT, EVT>
1696 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1697 EVT LoVT, HiVT;
1698 EVT EltVT = VT.getVectorElementType();
1699 unsigned NumElts = VT.getVectorNumElements();
1700 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1701 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1702 HiVT = NumElts - LoNumElts == 1
1703 ? EltVT
1704 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1705 return std::pair(LoVT, HiVT);
1706 }
1707
1708 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1709 // scalar.
1710 std::pair<SDValue, SDValue>
1711 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1712 const EVT &LoVT, const EVT &HiVT,
1713 SelectionDAG &DAG) const {
1714 assert(LoVT.getVectorNumElements() +
1715 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1716 N.getValueType().getVectorNumElements() &&
1717 "More vector elements requested than available!");
1718 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1719 DAG.getVectorIdxConstant(0, DL));
1720 SDValue Hi = DAG.getNode(
1721 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1722 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1723 return std::pair(Lo, Hi);
1724 }
1725
1726 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1727 SelectionDAG &DAG) const {
1728 LoadSDNode *Load = cast<LoadSDNode>(Op);
1729 EVT VT = Op.getValueType();
1730 SDLoc SL(Op);
1731
1732
1733 // If this is a 2 element vector, we really want to scalarize and not create
1734 // weird 1 element vectors.
1735 if (VT.getVectorNumElements() == 2) {
1736 SDValue Ops[2];
1737 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1738 return DAG.getMergeValues(Ops, SL);
1739 }
1740
1741 SDValue BasePtr = Load->getBasePtr();
1742 EVT MemVT = Load->getMemoryVT();
1743
1744 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1745
1746 EVT LoVT, HiVT;
1747 EVT LoMemVT, HiMemVT;
1748 SDValue Lo, Hi;
1749
1750 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1751 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1752 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1753
1754 unsigned Size = LoMemVT.getStoreSize();
1755 Align BaseAlign = Load->getAlign();
1756 Align HiAlign = commonAlignment(BaseAlign, Size);
1757
1758 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1759 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1760 BaseAlign, Load->getMemOperand()->getFlags());
1761 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1762 SDValue HiLoad =
1763 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1764 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1765 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1766
1767 SDValue Join;
1768 if (LoVT == HiVT) {
1769 // This is the case that the vector is power of two so was evenly split.
1770 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1771 } else {
1772 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1773 DAG.getVectorIdxConstant(0, SL));
1774 Join = DAG.getNode(
1775 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1776 VT, Join, HiLoad,
1777 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1778 }
1779
1780 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1781 LoLoad.getValue(1), HiLoad.getValue(1))};
1782
1783 return DAG.getMergeValues(Ops, SL);
1784 }
1785
1786 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1787 SelectionDAG &DAG) const {
1788 LoadSDNode *Load = cast<LoadSDNode>(Op);
1789 EVT VT = Op.getValueType();
1790 SDValue BasePtr = Load->getBasePtr();
1791 EVT MemVT = Load->getMemoryVT();
1792 SDLoc SL(Op);
1793 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1794 Align BaseAlign = Load->getAlign();
1795 unsigned NumElements = MemVT.getVectorNumElements();
1796
1797 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1798 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
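// Illustrative example: a v3i32 load that is 16-byte dereferenceable may be
// widened to a v4i32 load with the first three lanes extracted; the extra
// lane cannot fault because the access is known to stay in bounds.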
1799 if (NumElements != 3 ||
1800 (BaseAlign < Align(8) &&
1801 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1802 return SplitVectorLoad(Op, DAG);
1803
1804 assert(NumElements == 3);
1805
1806 EVT WideVT =
1807 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1808 EVT WideMemVT =
1809 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1810 SDValue WideLoad = DAG.getExtLoad(
1811 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1812 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1813 return DAG.getMergeValues(
1814 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1815 DAG.getVectorIdxConstant(0, SL)),
1816 WideLoad.getValue(1)},
1817 SL);
1818 }
1819
1820 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1821 SelectionDAG &DAG) const {
1822 StoreSDNode *Store = cast<StoreSDNode>(Op);
1823 SDValue Val = Store->getValue();
1824 EVT VT = Val.getValueType();
1825
1826 // If this is a 2 element vector, we really want to scalarize and not create
1827 // weird 1 element vectors.
1828 if (VT.getVectorNumElements() == 2)
1829 return scalarizeVectorStore(Store, DAG);
1830
1831 EVT MemVT = Store->getMemoryVT();
1832 SDValue Chain = Store->getChain();
1833 SDValue BasePtr = Store->getBasePtr();
1834 SDLoc SL(Op);
1835
1836 EVT LoVT, HiVT;
1837 EVT LoMemVT, HiMemVT;
1838 SDValue Lo, Hi;
1839
1840 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1841 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1842 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1843
1844 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1845
1846 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1847 Align BaseAlign = Store->getAlign();
1848 unsigned Size = LoMemVT.getStoreSize();
1849 Align HiAlign = commonAlignment(BaseAlign, Size);
1850
1851 SDValue LoStore =
1852 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1853 Store->getMemOperand()->getFlags());
1854 SDValue HiStore =
1855 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1856 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1857
1858 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1859 }
1860
1861 // This is a shortcut for integer division because we have fast i32<->f32
1862 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1863 // float is enough to accurately represent up to a 24-bit signed integer.
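// Illustrative rationale: an f32 significand holds 24 bits, so integers with
// at most 24 significant bits convert to f32 exactly. The rcp-based quotient
// estimate below can be off by one, which the "jq" correction accounts for.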
1864 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1865 bool Sign) const {
1866 SDLoc DL(Op);
1867 EVT VT = Op.getValueType();
1868 SDValue LHS = Op.getOperand(0);
1869 SDValue RHS = Op.getOperand(1);
1870 MVT IntVT = MVT::i32;
1871 MVT FltVT = MVT::f32;
1872
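// A 32-bit value with at least 9 sign bits has at most 32 - 9 + 1 = 24
// significant bits (including the sign), i.e. it fits in a signed 24-bit
// integer, which the f32 path can represent exactly.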
1873 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1874 if (LHSSignBits < 9)
1875 return SDValue();
1876
1877 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1878 if (RHSSignBits < 9)
1879 return SDValue();
1880
1881 unsigned BitSize = VT.getSizeInBits();
1882 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1883 unsigned DivBits = BitSize - SignBits;
1884 if (Sign)
1885 ++DivBits;
1886
1887 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1888 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1889
1890 SDValue jq = DAG.getConstant(1, DL, IntVT);
1891
1892 if (Sign) {
1893 // char|short jq = ia ^ ib;
1894 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1895
1896 // jq = jq >> (bitsize - 2)
1897 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1898 DAG.getConstant(BitSize - 2, DL, VT));
1899
1900 // jq = jq | 0x1
1901 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1902 }
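// At this point jq is +1 when ia and ib have the same sign and -1 otherwise,
// i.e. it carries the sign of the quotient; it is conditionally added below
// to correct the truncated estimate (illustrative summary).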
1903
1904 // int ia = (int)LHS;
1905 SDValue ia = LHS;
1906
1907 // int ib = (int)RHS;
1908 SDValue ib = RHS;
1909
1910 // float fa = (float)ia;
1911 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1912
1913 // float fb = (float)ib;
1914 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1915
1916 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1917 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1918
1919 // fq = trunc(fq);
1920 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1921
1922 // float fqneg = -fq;
1923 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1924
1925 MachineFunction &MF = DAG.getMachineFunction();
1926
1927 bool UseFmadFtz = false;
1928 if (Subtarget->isGCN()) {
1929 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1930 UseFmadFtz =
1931 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1932 }
1933
1934 // float fr = mad(fqneg, fb, fa);
1935 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1936 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1937 : (unsigned)ISD::FMAD;
1938 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1939
1940 // int iq = (int)fq;
1941 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1942
1943 // fr = fabs(fr);
1944 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1945
1946 // fb = fabs(fb);
1947 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1948
1949 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1950
1951 // int cv = fr >= fb;
1952 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1953
1954 // jq = (cv ? jq : 0);
1955 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1956
1957 // dst = iq + jq;
1958 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1959
1960 // Rem needs compensation, it's easier to recompute it
1961 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1962 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1963
1964 // Truncate to number of bits this divide really is.
1965 if (Sign) {
1966 SDValue InRegSize
1967 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1968 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1969 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1970 } else {
1971 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1972 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1973 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1974 }
1975
1976 return DAG.getMergeValues({ Div, Rem }, DL);
1977 }
1978
1979 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1980 SelectionDAG &DAG,
1981 SmallVectorImpl<SDValue> &Results) const {
1982 SDLoc DL(Op);
1983 EVT VT = Op.getValueType();
1984
1985 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1986
1987 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1988
1989 SDValue One = DAG.getConstant(1, DL, HalfVT);
1990 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1991
1992 // HiLo split
1993 SDValue LHS_Lo, LHS_Hi;
1994 SDValue LHS = Op.getOperand(0);
1995 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
1996
1997 SDValue RHS_Lo, RHS_Hi;
1998 SDValue RHS = Op.getOperand(1);
1999 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2000
2001 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2002 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2003
2004 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2005 LHS_Lo, RHS_Lo);
2006
2007 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2008 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2009
2010 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2011 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2012 return;
2013 }
2014
2015 if (isTypeLegal(MVT::i64)) {
2016 // The algorithm here is based on ideas from "Software Integer Division",
2017 // Tom Rodeheffer, August 2008.
2018
2019 MachineFunction &MF = DAG.getMachineFunction();
2020 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2021
2022 // Compute denominator reciprocal.
2023 unsigned FMAD =
2024 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2025 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2026 ? (unsigned)ISD::FMAD
2027 : (unsigned)AMDGPUISD::FMAD_FTZ;
2028
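// Decoding the f32 bit patterns below (illustrative): 0x4f800000 = 0x1.0p+32,
// 0x5f7ffffc ~= 0x1.fffffp+63, 0x2f800000 = 0x1.0p-32, 0xcf800000 = -0x1.0p+32.
// Together they build an initial estimate of 2^64 / RHS, split back into two
// 32-bit halves.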
2029 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2030 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2031 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2032 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2033 Cvt_Lo);
2034 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2035 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2036 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2037 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2038 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2039 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2040 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2041 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2042 Mul1);
2043 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2044 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2045 SDValue Rcp64 = DAG.getBitcast(VT,
2046 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2047
2048 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2049 SDValue One64 = DAG.getConstant(1, DL, VT);
2050 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2051 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2052
2053 // First round of UNR (Unsigned integer Newton-Raphson).
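// Newton-Raphson refinement for a fixed-point reciprocal: z' = z + z*(1 - y*z),
// computed below as z + mulhu(z, -y * z); each round roughly doubles the
// number of correct bits (illustrative formulation).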
2054 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2055 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2056 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2057 SDValue Mulhi1_Lo, Mulhi1_Hi;
2058 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2059 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2060 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2061 Mulhi1_Lo, Zero1);
2062 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2063 Mulhi1_Hi, Add1_Lo.getValue(1));
2064 SDValue Add1 = DAG.getBitcast(VT,
2065 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2066
2067 // Second round of UNR.
2068 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2069 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2070 SDValue Mulhi2_Lo, Mulhi2_Hi;
2071 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2072 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2073 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2074 Mulhi2_Lo, Zero1);
2075 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2076 Mulhi2_Hi, Add2_Lo.getValue(1));
2077 SDValue Add2 = DAG.getBitcast(VT,
2078 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2079
2080 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2081
2082 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2083
2084 SDValue Mul3_Lo, Mul3_Hi;
2085 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2086 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2087 Mul3_Lo, Zero1);
2088 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2089 Mul3_Hi, Sub1_Lo.getValue(1));
2090 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2091 SDValue Sub1 = DAG.getBitcast(VT,
2092 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2093
2094 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2095 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2096 ISD::SETUGE);
2097 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2098 ISD::SETUGE);
2099 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2100
2101 // TODO: Here and below, portions of the code could be enclosed in if/endif.
2102 // Currently the control flow is unconditional and we have 4 selects after the
2103 // potential endif to substitute PHIs.
2104
2105 // if C3 != 0 ...
2106 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2107 RHS_Lo, Zero1);
2108 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2109 RHS_Hi, Sub1_Lo.getValue(1));
2110 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2111 Zero, Sub2_Lo.getValue(1));
2112 SDValue Sub2 = DAG.getBitcast(VT,
2113 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2114
2115 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2116
2117 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2118 ISD::SETUGE);
2119 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2120 ISD::SETUGE);
2121 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2122
2123 // if (C6 != 0)
2124 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2125
2126 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2127 RHS_Lo, Zero1);
2128 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2129 RHS_Hi, Sub2_Lo.getValue(1));
2130 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2131 Zero, Sub3_Lo.getValue(1));
2132 SDValue Sub3 = DAG.getBitcast(VT,
2133 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2134
2135 // endif C6
2136 // endif C3
2137
2138 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2139 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2140
2141 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2142 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2143
2144 Results.push_back(Div);
2145 Results.push_back(Rem);
2146
2147 return;
2148 }
2149
2150 // r600 expansion.
2151 // Get speculative values.
2152 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2153 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2154
2155 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2156 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2157 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2158
2159 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2160 SDValue DIV_Lo = Zero;
2161
2162 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2163
2164 for (unsigned i = 0; i < halfBitWidth; ++i) {
2165 const unsigned bitPos = halfBitWidth - i - 1;
2166 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2167 // Get value of high bit
2168 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2169 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2170 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2171
2172 // Shift
2173 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2174 // Add LHS high bit
2175 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2176
2177 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2178 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2179
2180 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2181
2182 // Update REM
2183 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2184 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2185 }
2186
2187 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2188 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2189 Results.push_back(DIV);
2190 Results.push_back(REM);
2191 }
2192
2193 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2194 SelectionDAG &DAG) const {
2195 SDLoc DL(Op);
2196 EVT VT = Op.getValueType();
2197
2198 if (VT == MVT::i64) {
2199 SmallVector<SDValue, 2> Results;
2200 LowerUDIVREM64(Op, DAG, Results);
2201 return DAG.getMergeValues(Results, DL);
2202 }
2203
2204 if (VT == MVT::i32) {
2205 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2206 return Res;
2207 }
2208
2209 SDValue X = Op.getOperand(0);
2210 SDValue Y = Op.getOperand(1);
2211
2212 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2213 // algorithm used here.
2214
2215 // Initial estimate of inv(y).
2216 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2217
2218 // One round of UNR.
2219 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2220 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2221 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2222 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2223
2224 // Quotient/remainder estimate.
2225 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2226 SDValue R =
2227 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
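// With this reciprocal-based scheme the estimate Q never overshoots and can
// undershoot the exact quotient by a small amount, so the two rounds of
// conditional correction below suffice (see the reference above).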
2228
2229 // First quotient/remainder refinement.
2230 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2231 SDValue One = DAG.getConstant(1, DL, VT);
2232 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2233 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2234 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2235 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2236 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2237
2238 // Second quotient/remainder refinement.
2239 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2240 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2241 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2242 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2243 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2244
2245 return DAG.getMergeValues({Q, R}, DL);
2246 }
2247
2248 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2249 SelectionDAG &DAG) const {
2250 SDLoc DL(Op);
2251 EVT VT = Op.getValueType();
2252
2253 SDValue LHS = Op.getOperand(0);
2254 SDValue RHS = Op.getOperand(1);
2255
2256 SDValue Zero = DAG.getConstant(0, DL, VT);
2257 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2258
2259 if (VT == MVT::i32) {
2260 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2261 return Res;
2262 }
2263
2264 if (VT == MVT::i64 &&
2265 DAG.ComputeNumSignBits(LHS) > 32 &&
2266 DAG.ComputeNumSignBits(RHS) > 32) {
2267 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2268
2269 // HiLo split
2270 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2271 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2272 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2273 LHS_Lo, RHS_Lo);
2274 SDValue Res[2] = {
2275 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2276 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2277 };
2278 return DAG.getMergeValues(Res, DL);
2279 }
2280
2281 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2282 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2283 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2284 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2285
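// Two's-complement absolute value trick: with s = (x < 0 ? -1 : 0),
// (x + s) ^ s == |x|; e.g. x = -5, s = -1: (-5 + -1) ^ -1 == ~(-6) == 5.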
2286 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2287 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2288
2289 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2290 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2291
2292 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2293 SDValue Rem = Div.getValue(1);
2294
2295 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2296 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2297
2298 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2299 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2300
2301 SDValue Res[2] = {
2302 Div,
2303 Rem
2304 };
2305 return DAG.getMergeValues(Res, DL);
2306 }
2307
2308 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
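// For example, with exact arithmetic: frem(5.5, 2.0) gives
// trunc(5.5 / 2.0) = 2.0 and fma(-2.0, 2.0, 5.5) = 1.5.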
2309 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2310 SDLoc SL(Op);
2311 EVT VT = Op.getValueType();
2312 auto Flags = Op->getFlags();
2313 SDValue X = Op.getOperand(0);
2314 SDValue Y = Op.getOperand(1);
2315
2316 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2317 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2318 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2319 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2320 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2321 }
2322
2323 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2324 SDLoc SL(Op);
2325 SDValue Src = Op.getOperand(0);
2326
2327 // result = trunc(src)
2328 // if (src > 0.0 && src != result)
2329 // result += 1.0
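// e.g. src = 2.3: trunc = 2.0, and 2.3 > 0.0 && 2.3 != 2.0, so result = 3.0;
// src = -2.3: trunc = -2.0 and the condition fails, so result = -2.0.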
2330
2331 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2332
2333 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2334 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2335
2336 EVT SetCCVT =
2337 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2338
2339 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2340 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2341 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2342
2343 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2344 // TODO: Should this propagate fast-math-flags?
2345 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2346 }
2347
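// IEEE f64 layout reminder: sign in bit 63, 11 exponent bits in bits 62..52,
// 52 fraction bits below. The helper below extracts the biased exponent from
// the high 32-bit word and subtracts the bias (1023).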
2348 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2349 SelectionDAG &DAG) {
2350 const unsigned FractBits = 52;
2351 const unsigned ExpBits = 11;
2352
2353 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2354 Hi,
2355 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2356 DAG.getConstant(ExpBits, SL, MVT::i32));
2357 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2358 DAG.getConstant(1023, SL, MVT::i32));
2359
2360 return Exp;
2361 }
2362
2363 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2364 SDLoc SL(Op);
2365 SDValue Src = Op.getOperand(0);
2366
2367 assert(Op.getValueType() == MVT::f64);
2368
2369 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2370
2371 // Extract the upper half, since this is where we will find the sign and
2372 // exponent.
2373 SDValue Hi = getHiHalf64(Src, DAG);
2374
2375 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2376
2377 const unsigned FractBits = 52;
2378
2379 // Extract the sign bit.
2380 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2381 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2382
2383 // Extend back to 64-bits.
2384 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2385 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2386
2387 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2388 const SDValue FractMask
2389 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2390
2391 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2392 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2393 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2394
2395 EVT SetCCVT =
2396 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2397
2398 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2399
2400 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2401 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2402
2403 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2404 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2405
2406 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2407 }
2408
2409 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2410 SelectionDAG &DAG) const {
2411 SDLoc SL(Op);
2412 SDValue Src = Op.getOperand(0);
2413
2414 assert(Op.getValueType() == MVT::f64);
2415
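// Classic 2^52 rounding trick: for |x| < 2^52, adding and then subtracting
// copysign(0x1.0p+52, x) in round-to-nearest-even leaves exactly rint(x),
// because the add pushes the fraction bits out of the significand. Inputs
// with |x| > 0x1.fffffffffffffp+51 are already integers and pass through.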
2416 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2417 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2418 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2419
2420 // TODO: Should this propagate fast-math-flags?
2421
2422 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2423 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2424
2425 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2426
2427 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2428 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2429
2430 EVT SetCCVT =
2431 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2432 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2433
2434 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2435 }
2436
2437 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2438 SelectionDAG &DAG) const {
2439 // FNEARBYINT and FRINT are the same, except in their handling of FP
2440 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2441 // rint, so just treat them as equivalent.
2442 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2443 Op.getOperand(0));
2444 }
2445
2446 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2447 auto VT = Op.getValueType();
2448 auto Arg = Op.getOperand(0u);
2449 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2450 }
2451
2452 // XXX - May require not supporting f32 denormals?
2453
2454 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2455 // compare and vselect end up producing worse code than scalarizing the whole
2456 // operation.
2457 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2458 SDLoc SL(Op);
2459 SDValue X = Op.getOperand(0);
2460 EVT VT = Op.getValueType();
2461
2462 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2463
2464 // TODO: Should this propagate fast-math-flags?
2465
2466 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2467
2468 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2469
2470 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2471 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2472
2473 EVT SetCCVT =
2474 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2475
2476 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2477 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2478 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2479
2480 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2481 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2482 }
2483
2484 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2485 SDLoc SL(Op);
2486 SDValue Src = Op.getOperand(0);
2487
2488 // result = trunc(src);
2489 // if (src < 0.0 && src != result)
2490 // result += -1.0.
2491
2492 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2493
2494 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2495 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2496
2497 EVT SetCCVT =
2498 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2499
2500 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2501 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2502 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2503
2504 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2505 // TODO: Should this propagate fast-math-flags?
2506 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2507 }
2508
2509 /// Return true if it's known that \p Src can never be an f32 denormal value.
2510 static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2511 switch (Src.getOpcode()) {
2512 case ISD::FP_EXTEND:
2513 return Src.getOperand(0).getValueType() == MVT::f16;
2514 case ISD::FP16_TO_FP:
2515 case ISD::FFREXP:
2516 return true;
2517 case ISD::INTRINSIC_WO_CHAIN: {
2518 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2519 switch (IntrinsicID) {
2520 case Intrinsic::amdgcn_frexp_mant:
2521 return true;
2522 default:
2523 return false;
2524 }
2525 }
2526 default:
2527 return false;
2528 }
2529
2530 llvm_unreachable("covered opcode switch");
2531 }
2532
2533 bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
2534 SDNodeFlags Flags) {
2535 if (Flags.hasApproximateFuncs())
2536 return true;
2537 auto &Options = DAG.getTarget().Options;
2538 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2539 }
2540
2541 bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2542 SDValue Src,
2543 SDNodeFlags Flags) {
2544 return !valueIsKnownNeverF32Denorm(Src) &&
2545 DAG.getMachineFunction()
2546 .getDenormalMode(APFloat::IEEEsingle())
2547 .Input != DenormalMode::PreserveSign;
2548 }
2549
2550 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2551 SDValue Src,
2552 SDNodeFlags Flags) const {
2553 SDLoc SL(Src);
2554 EVT VT = Src.getValueType();
2555 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2556 SDValue SmallestNormal =
2557 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2558
2559 // Want to scale denormals up, but negatives and 0 work just as well on the
2560 // scaled path.
2561 SDValue IsLtSmallestNormal = DAG.getSetCC(
2562 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2563 SmallestNormal, ISD::SETOLT);
2564
2565 return IsLtSmallestNormal;
2566 }
2567
2568 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2569 SDNodeFlags Flags) const {
2570 SDLoc SL(Src);
2571 EVT VT = Src.getValueType();
2572 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2573 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2574
2575 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2576 SDValue IsFinite = DAG.getSetCC(
2577 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2578 Inf, ISD::SETOLT);
2579 return IsFinite;
2580 }
2581
2582 /// If denormal handling is required return the scaled input to FLOG2, and the
2583 /// check for denormal range. Otherwise, return null values.
2584 std::pair<SDValue, SDValue>
2585 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2586 SDValue Src, SDNodeFlags Flags) const {
2587 if (!needsDenormHandlingF32(DAG, Src, Flags))
2588 return {};
2589
2590 MVT VT = MVT::f32;
2591 const fltSemantics &Semantics = APFloat::IEEEsingle();
2592 SDValue SmallestNormal =
2593 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2594
2595 SDValue IsLtSmallestNormal = DAG.getSetCC(
2596 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2597 SmallestNormal, ISD::SETOLT);
2598
2599 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2600 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2601 SDValue ScaleFactor =
2602 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2603
2604 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2605 return {ScaledInput, IsLtSmallestNormal};
2606 }
2607
2608 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2609 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2610 // If we have to handle denormals, scale up the input and adjust the result.
2611
2612 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2613 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
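// This works because log2(x * 2^32) == log2(x) + 32: scaling a denormal input
// into the normal range only offsets the result by a known constant.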
2614
2615 SDLoc SL(Op);
2616 EVT VT = Op.getValueType();
2617 SDValue Src = Op.getOperand(0);
2618 SDNodeFlags Flags = Op->getFlags();
2619
2620 if (VT == MVT::f16) {
2621 // Nothing in half is a denormal when promoted to f32.
2622 assert(!Subtarget->has16BitInsts());
2623 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2624 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2625 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2626 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2627 }
2628
2629 auto [ScaledInput, IsLtSmallestNormal] =
2630 getScaledLogInput(DAG, SL, Src, Flags);
2631 if (!ScaledInput)
2632 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2633
2634 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2635
2636 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2637 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2638 SDValue ResultOffset =
2639 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2640 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2641 }
2642
2643 static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2644 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2645 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2646 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2647 }
2648
2649 SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2650 SelectionDAG &DAG) const {
2651 SDValue X = Op.getOperand(0);
2652 EVT VT = Op.getValueType();
2653 SDNodeFlags Flags = Op->getFlags();
2654 SDLoc DL(Op);
2655
2656 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2657 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2658
2659 const auto &Options = getTargetMachine().Options;
2660 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2661 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2662
2663 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2664 // Log and multiply in f32 is good enough for f16.
2665 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2666 }
2667
2668 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2669 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2670 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2671 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2672 }
2673
2674 return Lowered;
2675 }
2676
2677 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2678 if (ScaledInput)
2679 X = ScaledInput;
2680
2681 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2682
2683 SDValue R;
2684 if (Subtarget->hasFastFMAF32()) {
2685 // c+cc are ln(2)/ln(10) to more than 49 bits
2686 const float c_log10 = 0x1.344134p-2f;
2687 const float cc_log10 = 0x1.09f79ep-26f;
2688
2689 // c + cc is ln(2) to more than 49 bits
2690 const float c_log = 0x1.62e42ep-1f;
2691 const float cc_log = 0x1.efa39ep-25f;
2692
2693 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2694 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2695
2696 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2697 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2698 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2699 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2700 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2701 } else {
2702 // ch+ct is ln(2)/ln(10) to more than 36 bits
2703 const float ch_log10 = 0x1.344000p-2f;
2704 const float ct_log10 = 0x1.3509f6p-18f;
2705
2706 // ch + ct is ln(2) to more than 36 bits
2707 const float ch_log = 0x1.62e000p-1f;
2708 const float ct_log = 0x1.0bfbe8p-15f;
2709
2710 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2711 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2712
2713 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2714 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2715 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2716 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2717 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2718
2719 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2720 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2721 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2722 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2723 }
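// The else branch above splits y into YH (low 12 mantissa bits cleared, so
// products like YH*CH stay exact in f32) and a tail YT = y - YH, then
// reassociates y*(CH + CT) as YH*CH + (YT*CH + (YH*CT + YT*CT)) to retain
// extra precision without FMA (illustrative description).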
2724
2725 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2726 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2727
2728 // TODO: Check if known finite from source value.
2729 if (!IsFiniteOnly) {
2730 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2731 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2732 }
2733
2734 if (IsScaled) {
2735 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2736 SDValue ShiftK =
2737 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2738 SDValue Shift =
2739 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2740 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2741 }
2742
2743 return R;
2744 }
2745
2746 SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2747 return LowerFLOGCommon(Op, DAG);
2748 }
2749
2750 // Do f32 fast-math expansion for flog2 or flog10. This is accurate enough for
2751 // a promoted f16 operation.
2752 SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2753 SelectionDAG &DAG, bool IsLog10,
2754 SDNodeFlags Flags) const {
2755 EVT VT = Src.getValueType();
2756 unsigned LogOp =
2757 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2758
2759 double Log2BaseInverted =
2760 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
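// Change of base: log(x) = log2(x) * ln(2), and
// log10(x) = log2(x) * (ln(2)/ln(10)) ~= log2(x) * 0.30103.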
2761
2762 if (VT == MVT::f32) {
2763 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2764 if (ScaledInput) {
2765 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2766 SDValue ScaledResultOffset =
2767 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2768
2769 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2770
2771 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2772 ScaledResultOffset, Zero, Flags);
2773
2774 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2775
2776 if (Subtarget->hasFastFMAF32())
2777 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2778 Flags);
2779 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2780 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2781 }
2782 }
2783
2784 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2785 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2786
2787 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2788 Flags);
2789 }
2790
2791 SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2792 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2793 // If we have to handle denormals, scale up the input and adjust the result.
2794
2795 SDLoc SL(Op);
2796 EVT VT = Op.getValueType();
2797 SDValue Src = Op.getOperand(0);
2798 SDNodeFlags Flags = Op->getFlags();
2799
2800 if (VT == MVT::f16) {
2801 // Nothing in half is a denormal when promoted to f32.
2802 assert(!Subtarget->has16BitInsts());
2803 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2804 SDValue Exp = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2805 return DAG.getNode(ISD::FP_ROUND, SL, VT, Exp,
2806 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2807 }
2808
2809 assert(VT == MVT::f32);
2810
2811 if (!needsDenormHandlingF32(DAG, Src, Flags))
2812 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2813
2814 // bool needs_scaling = x < -0x1.f80000p+6f;
2815 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
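// Editorial note: the scaling works because exp2(x + 64) = exp2(x) * 2^64;
// multiplying the scaled result by 0x1.0p-64f recovers exp2(x), while
// v_exp_f32 itself only ever sees an argument whose result is normal.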
2816
2817 // -126.0, the smallest normal f32 exponent; exp2 of anything below is denormal.
2818 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2819
2820 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2821
2822 SDValue NeedsScaling =
2823 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2824
2825 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2826 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2827
2828 SDValue AddOffset =
2829 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2830
2831 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2832 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2833
2834 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2835 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2836 SDValue ResultScale =
2837 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2838
2839 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2840 }
2841
2842 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2843 SelectionDAG &DAG,
2844 SDNodeFlags Flags) const {
2845 EVT VT = X.getValueType();
2846 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2847
2848 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2849 // exp2(M_LOG2E_F * f);
2850 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2851 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2852 : (unsigned)ISD::FEXP2,
2853 SL, VT, Mul, Flags);
2854 }
2855
2856 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2857
2858 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2859 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2860
2861 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2862
2863 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2864
2865 SDValue AdjustedX =
2866 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2867
2868 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2869
2870 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2871
2872 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2873 SDValue AdjustedResult =
2874 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
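// Editorial check on the constants above: -0x1.5d58a0p+6f = -126*ln(2) =
// -87.336..., the point where exp(x) enters the f32 denormal range, and
// 0x1.969d48p-93f = exp(-64) = 1.6038e-28, which undoes the +64 pre-scale
// since exp(x + 64) = exp(x) * exp(64).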
2875
2876 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2877 Flags);
2878 }
2879
2880 /// Emit an approx-funcs-appropriate lowering for exp10. Inf/NaN inputs should
2881 /// still be handled correctly.
2882 SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2883 SelectionDAG &DAG,
2884 SDNodeFlags Flags) const {
2885 const EVT VT = X.getValueType();
2886 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2887
2888 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2889 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2890 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2891 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2892
2893 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2894 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2895 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2896 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2897 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2898 }
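// Editorial check: the split constants satisfy K0 + K1 = 0x1.a92000p+1 +
// 0x1.4f0978p-11 = 3.3219280... = log2(10), with K1 carrying the low mantissa
// bits of log2(10), so exp2(x*K0) * exp2(x*K1) = 2^(x*log2(10)) = 10^x with
// extra effective precision in the exponent product.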
2899
2900 // bool s = x < -0x1.2f7030p+5f;
2901 // x += s ? 0x1.0p+5f : 0.0f;
2902 // exp10 = exp2(x * 0x1.a92000p+1f) *
2903 // exp2(x * 0x1.4f0978p-11f) *
2904 // (s ? 0x1.9f623ep-107f : 1.0f);
2905
2906 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2907
2908 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2909 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2910
2911 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2912 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2913 SDValue AdjustedX =
2914 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2915
2916 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2917 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2918
2919 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2920 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2921 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2922 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2923
2924 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2925
2926 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2927 SDValue AdjustedResult =
2928 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
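// Editorial check: -0x1.2f7030p+5f = -126*log10(2) = -37.929..., where 10^x
// enters the f32 denormal range, and 0x1.9f623ep-107f = 10^-32, undoing the
// +32 (0x1.0p+5f) pre-scale since 10^(x + 32) = 10^x * 10^32.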
2929
2930 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2931 Flags);
2932 }
2933
2934 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2935 EVT VT = Op.getValueType();
2936 SDLoc SL(Op);
2937 SDValue X = Op.getOperand(0);
2938 SDNodeFlags Flags = Op->getFlags();
2939 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2940
2941 if (VT.getScalarType() == MVT::f16) {
2942 // v_exp_f16 (fmul x, log2e)
2943 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2944 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2945
2946 if (VT.isVector())
2947 return SDValue();
2948
2949 // exp(f16 x) ->
2950 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2951
2952 // Nothing in half is a denormal when promoted to f32.
2953 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2954 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2955 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2956 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2957 }
2958
2959 assert(VT == MVT::f32);
2960
2961 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2962 // library behavior. Also, is known-not-daz source sufficient?
2963 if (allowApproxFunc(DAG, Flags)) {
2964 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
2965 : lowerFEXPUnsafe(X, SL, DAG, Flags);
2966 }
2967
2968 // Algorithm:
2969 //
2970 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
2971 //
2972 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
2973 // n = 64*m + j, 0 <= j < 64
2974 //
2975 // e^x = 2^((64*m + j + f)/64)
2976 // = (2^m) * (2^(j/64)) * 2^(f/64)
2977 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
2978 //
2979 // f = x*(64/ln(2)) - n
2980 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
2981 //
2982 // e^x = (2^m) * (2^(j/64)) * e^r
2983 //
2984 // (2^(j/64)) is precomputed
2985 //
2986 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2987 // e^r = 1 + q
2988 //
2989 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2990 //
2991 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
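// Editorial sketch of how the code below maps onto this: with C ~= log2(e)
// (or log2(10) for exp10) carried as a two-part constant (c + cc), PH + PL
// approximates x*C in extended precision; E = roundeven(PH) supplies the
// integer exponent part, v_exp_f32 evaluates 2^((PH - E) + PL), and the
// final ldexp by int(E) reconstitutes 2^(x*C).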
2992 SDNodeFlags FlagsNoContract = Flags;
2993 FlagsNoContract.setAllowContract(false);
2994
2995 SDValue PH, PL;
2996 if (Subtarget->hasFastFMAF32()) {
2997 const float c_exp = numbers::log2ef;
2998 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
2999 const float c_exp10 = 0x1.a934f0p+1f;
3000 const float cc_exp10 = 0x1.2f346ep-24f;
3001
3002 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3003 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3004
3005 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3006 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3007 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3008 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3009 } else {
3010 const float ch_exp = 0x1.714000p+0f;
3011 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3012
3013 const float ch_exp10 = 0x1.a92000p+1f;
3014 const float cl_exp10 = 0x1.4f0978p-11f;
3015
3016 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3017 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3018
3019 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3020 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3021 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3022 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3023 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3024
3025 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3026
3027 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3028 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3029 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3030 }
3031
3032 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3033
3034 // It is unsafe to contract this fsub into the PH multiply.
3035 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3036
3037 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3038 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3039 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3040
3041 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3042
3043 SDValue UnderflowCheckConst =
3044 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
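// Editorial check: these are log(smallest denormal f32) = log(2^-149):
// -149*log10(2) = -44.853... = -0x1.66d3e8p+5 and -149*ln(2) = -103.278...
// = -0x1.9d1da0p+6; anything below flushes to zero.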
3045
3046 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3047 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3048 SDValue Underflow =
3049 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3050
3051 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3052 const auto &Options = getTargetMachine().Options;
3053
3054 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3055 SDValue OverflowCheckConst =
3056 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
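// Editorial check: these are approximately log(2^128), just past the largest
// finite f32: 128*log10(2) = 38.531... = 0x1.344136p+5 and 128*ln(2) =
// 88.722... = 0x1.62e430p+6; anything above overflows to infinity.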
3057 SDValue Overflow =
3058 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3059 SDValue Inf =
3060 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3061 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3062 }
3063
3064 return R;
3065 }
3066
3067 static bool isCtlzOpc(unsigned Opc) {
3068 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3069 }
3070
3071 static bool isCttzOpc(unsigned Opc) {
3072 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3073 }
3074
3075 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3076 SelectionDAG &DAG) const {
3077 auto SL = SDLoc(Op);
3078 auto Arg = Op.getOperand(0u);
3079 auto ResultVT = Op.getValueType();
3080
3081 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3082 return {};
3083
3084 assert(isCtlzOpc(Op.getOpcode()));
3085 assert(ResultVT == Arg.getValueType());
3086
3087 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3088 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3089 auto ShiftVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3090 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, ShiftVal);
3091 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3092 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3093 }
3094
3095 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3096 SDLoc SL(Op);
3097 SDValue Src = Op.getOperand(0);
3098
3099 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3100 bool Ctlz = isCtlzOpc(Op.getOpcode());
3101 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3102
3103 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3104 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3105 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3106
3107 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3108 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3109 // (cttz hi:lo) -> (umin (ffbl src), 32)
3110 // (ctlz_zero_undef src) -> (ffbh src)
3111 // (cttz_zero_undef src) -> (ffbl src)
3112
3113 // The 64-bit scalar version produces a 32-bit result:
3114 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3115 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3116 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3117 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3118 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3119 if (!ZeroUndef) {
3120 const SDValue ConstVal = DAG.getConstant(
3121 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3122 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3123 }
3124 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3125 }
3126
3127 SDValue Lo, Hi;
3128 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3129
3130 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3131 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3132
3133 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3134 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3135 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3136 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
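// Editorial example: for src = 0x0000000000000001, ctlz = 63. ffbh(hi = 0)
// yields the "no bit found" sentinel -1 (0xffffffff), uaddsat(ffbh(lo), 32)
// = 31 + 32 = 63, and umin3(0xffffffff, 63, 64) = 63; the unsigned min
// absorbs the sentinel.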
3137
3138 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3139 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3140 if (Ctlz)
3141 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3142 else
3143 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3144
3145 SDValue NewOpr;
3146 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3147 if (!ZeroUndef) {
3148 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3149 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3150 }
3151
3152 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3153 }
3154
3155 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3156 bool Signed) const {
3157 // The regular method of converting a 64-bit integer to a float roughly consists of
3158 // 2 steps: normalization and rounding. In fact, after normalization, the
3159 // conversion from a 64-bit integer to a float is essentially the same as the
3160 // one from a 32-bit integer. The only difference is that it has more
3161 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3162 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3163 // converted into the correct float number. The basic steps for the unsigned
3164 // conversion are illustrated in the following pseudo code:
3165 //
3166 // f32 uitofp(i64 u) {
3167 // i32 hi, lo = split(u);
3168 // // Only count the leading zeros in hi as we have native support of the
3169 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3170 // // reduced to a 32-bit one automatically.
3171 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3172 // u <<= shamt;
3173 // hi, lo = split(u);
3174 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3175 // // convert it as a 32-bit integer and scale the result back.
3176 // return uitofp(hi) * 2^(32 - shamt);
3177 // }
3178 //
3179 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3180 // sign bits instead. If 'ffbh_i32' is not available, the absolute value is
3181 // converted instead, followed by negation based on the original sign bit.
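// Editorial note: OR-ing the "lo != 0" bit into the shifted value acts as a
// sticky bit, so the native 32-bit round-to-nearest-even produces the same
// result full 64-bit rounding would, e.g. correctly separating an exact
// halfway case from one that is just above halfway.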
3182
3183 SDLoc SL(Op);
3184 SDValue Src = Op.getOperand(0);
3185
3186 SDValue Lo, Hi;
3187 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3188 SDValue Sign;
3189 SDValue ShAmt;
3190 if (Signed && Subtarget->isGCN()) {
3191 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3192 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3193 // account. That is, the maximal shift is
3194 // - 32 if Lo and Hi have opposite signs;
3195 // - 33 if Lo and Hi have the same sign.
3196 //
3197 // Or, MaxShAmt = 33 + OppositeSign, where
3198 //
3199 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3200 // - -1 if Lo and Hi have opposite signs; and
3201 // - 0 otherwise.
3202 //
3203 // All in all, ShAmt is calculated as
3204 //
3205 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3206 //
3207 // or
3208 //
3209 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3210 //
3211 // to reduce the critical path.
3212 SDValue OppositeSign = DAG.getNode(
3213 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3214 DAG.getConstant(31, SL, MVT::i32));
3215 SDValue MaxShAmt =
3216 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3217 OppositeSign);
3218 // Count the leading sign bits.
3219 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3220 // Different from unsigned conversion, the shift should be one bit less to
3221 // preserve the sign bit.
3222 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3223 DAG.getConstant(1, SL, MVT::i32));
3224 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3225 } else {
3226 if (Signed) {
3227 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3228 // absolute value first.
3229 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3230 DAG.getConstant(63, SL, MVT::i64));
3231 SDValue Abs =
3232 DAG.getNode(ISD::XOR, SL, MVT::i64,
3233 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3234 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3235 }
3236 // Count the leading zeros.
3237 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3238 // The shift amount for signed integers is [0, 32].
3239 }
3240 // Normalize the given 64-bit integer.
3241 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3242 // Split it again.
3243 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3244 // Calculate the adjust bit for rounding.
3245 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3246 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3247 DAG.getConstant(1, SL, MVT::i32), Lo);
3248 // Get the 32-bit normalized integer.
3249 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3250 // Convert the normalized 32-bit integer into f32.
3251 unsigned Opc =
3252 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3253 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3254
3255 // Finally, we need to scale the converted float back, as the original 64-bit
3256 // integer was converted as a 32-bit one.
3257 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3258 ShAmt);
3259 // On GCN, use LDEXP directly.
3260 if (Subtarget->isGCN())
3261 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3262
3263 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3264 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3265 // exponent is enough to avoid overflowing into the sign bit.
3266 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3267 DAG.getConstant(23, SL, MVT::i32));
3268 SDValue IVal =
3269 DAG.getNode(ISD::ADD, SL, MVT::i32,
3270 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3271 if (Signed) {
3272 // Set the sign bit.
3273 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3274 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3275 DAG.getConstant(31, SL, MVT::i32));
3276 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3277 }
3278 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3279 }
3280
3281 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3282 bool Signed) const {
3283 SDLoc SL(Op);
3284 SDValue Src = Op.getOperand(0);
3285
3286 SDValue Lo, Hi;
3287 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
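// Editorial note: with src = hi*2^32 + lo, the code below computes
// fp(hi)*2^32 + fp(lo). Each 32-bit half converts to f64 exactly (53-bit
// significand), and ldexp by 32 is exact, so the final FADD is the only
// rounding step.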
3288
3289 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3290 SL, MVT::f64, Hi);
3291
3292 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3293
3294 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3295 DAG.getConstant(32, SL, MVT::i32));
3296 // TODO: Should this propagate fast-math-flags?
3297 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3298 }
3299
3300 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3301 SelectionDAG &DAG) const {
3302 // TODO: Factor out code common with LowerSINT_TO_FP.
3303 EVT DestVT = Op.getValueType();
3304 SDValue Src = Op.getOperand(0);
3305 EVT SrcVT = Src.getValueType();
3306
3307 if (SrcVT == MVT::i16) {
3308 if (DestVT == MVT::f16)
3309 return Op;
3310 SDLoc DL(Op);
3311
3312 // Promote src to i32
3313 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3314 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3315 }
3316
3317 if (DestVT == MVT::bf16) {
3318 SDLoc SL(Op);
3319 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3320 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3321 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3322 }
3323
3324 if (SrcVT != MVT::i64)
3325 return Op;
3326
3327 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3328 SDLoc DL(Op);
3329
3330 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3331 SDValue FPRoundFlag =
3332 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3333 SDValue FPRound =
3334 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3335
3336 return FPRound;
3337 }
3338
3339 if (DestVT == MVT::f32)
3340 return LowerINT_TO_FP32(Op, DAG, false);
3341
3342 assert(DestVT == MVT::f64);
3343 return LowerINT_TO_FP64(Op, DAG, false);
3344 }
3345
3346 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3347 SelectionDAG &DAG) const {
3348 EVT DestVT = Op.getValueType();
3349
3350 SDValue Src = Op.getOperand(0);
3351 EVT SrcVT = Src.getValueType();
3352
3353 if (SrcVT == MVT::i16) {
3354 if (DestVT == MVT::f16)
3355 return Op;
3356
3357 SDLoc DL(Op);
3358 // Promote src to i32
3359 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3360 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3361 }
3362
3363 if (DestVT == MVT::bf16) {
3364 SDLoc SL(Op);
3365 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3366 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3367 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3368 }
3369
3370 if (SrcVT != MVT::i64)
3371 return Op;
3372
3373 // TODO: Factor out code common with LowerUINT_TO_FP.
3374
3375 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3376 SDLoc DL(Op);
3377 SDValue Src = Op.getOperand(0);
3378
3379 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3380 SDValue FPRoundFlag =
3381 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3382 SDValue FPRound =
3383 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3384
3385 return FPRound;
3386 }
3387
3388 if (DestVT == MVT::f32)
3389 return LowerINT_TO_FP32(Op, DAG, true);
3390
3391 assert(DestVT == MVT::f64);
3392 return LowerINT_TO_FP64(Op, DAG, true);
3393 }
3394
3395 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3396 bool Signed) const {
3397 SDLoc SL(Op);
3398
3399 SDValue Src = Op.getOperand(0);
3400 EVT SrcVT = Src.getValueType();
3401
3402 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3403
3404 // The basic idea of converting a floating point number into a pair of 32-bit
3405 // integers is illustrated as follows:
3406 //
3407 // tf := trunc(val);
3408 // hif := floor(tf * 2^-32);
3409 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3410 // hi := fptoi(hif);
3411 // lo := fptoi(lof);
3412 //
3413 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3414 SDValue Sign;
3415 if (Signed && SrcVT == MVT::f32) {
3416 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3417 // is not enough to hold all the significant bits of `lof` if val is
3418 // negative. To avoid the loss of precision, we need to take the absolute
3419 // value after truncating and flip the result back based on the original
3420 // signedness.
3421 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3422 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3423 DAG.getConstant(31, SL, MVT::i32));
3424 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3425 }
3426
3427 SDValue K0, K1;
3428 if (SrcVT == MVT::f64) {
3429 K0 = DAG.getConstantFP(
3430 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3431 SrcVT);
3432 K1 = DAG.getConstantFP(
3433 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3434 SrcVT);
3435 } else {
3436 K0 = DAG.getConstantFP(
3437 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3438 K1 = DAG.getConstantFP(
3439 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3440 }
3441 // TODO: Should this propagate fast-math-flags?
3442 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3443
3444 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3445
3446 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3447
3448 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3449 : ISD::FP_TO_UINT,
3450 SL, MVT::i32, FloorMul);
3451 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3452
3453 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3454 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3455
3456 if (Signed && SrcVT == MVT::f32) {
3457 assert(Sign);
3458 // Flip the result based on the signedness, which is either all 0s or 1s.
3459 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3460 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3461 // r := xor(r, sign) - sign;
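// Editorial note: Sign is either 0 or -1 (all ones), so xor(r, sign) - sign
// is r when sign == 0 and (~r) + 1 = -r when sign == -1: a branchless
// two's-complement negation.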
3462 Result =
3463 DAG.getNode(ISD::SUB, SL, MVT::i64,
3464 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3465 }
3466
3467 return Result;
3468 }
3469
3470 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3471 SDLoc DL(Op);
3472 SDValue N0 = Op.getOperand(0);
3473
3474 // Convert to target node to get known bits
3475 if (N0.getValueType() == MVT::f32)
3476 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3477
3478 if (getTargetMachine().Options.UnsafeFPMath) {
3479 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3480 return SDValue();
3481 }
3482
3483 assert(N0.getSimpleValueType() == MVT::f64);
3484
3485 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
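// Editorial overview of the expansion below: extract the f64 exponent and top
// mantissa bits, re-bias the exponent for f16, fold the discarded low
// mantissa bits into a sticky bit, switch to a denormal encoding when the
// re-biased exponent drops below 1, round to nearest-even on the bottom two
// bits, and finally handle Inf/NaN and merge the sign.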
3486 const unsigned ExpMask = 0x7ff;
3487 const unsigned ExpBiasf64 = 1023;
3488 const unsigned ExpBiasf16 = 15;
3489 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3490 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3491 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3492 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3493 DAG.getConstant(32, DL, MVT::i64));
3494 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3495 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3496 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3497 DAG.getConstant(20, DL, MVT::i64));
3498 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3499 DAG.getConstant(ExpMask, DL, MVT::i32));
3500 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3501 // add the f16 bias (15) to get the biased exponent for the f16 format.
3502 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3503 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3504
3505 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3506 DAG.getConstant(8, DL, MVT::i32));
3507 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3508 DAG.getConstant(0xffe, DL, MVT::i32));
3509
3510 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3511 DAG.getConstant(0x1ff, DL, MVT::i32));
3512 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3513
3514 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3515 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3516
3517 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3518 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3519 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3520 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3521
3522 // N = M | (E << 12);
3523 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3524 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3525 DAG.getConstant(12, DL, MVT::i32)));
3526
3527 // B = clamp(1-E, 0, 13);
3528 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3529 One, E);
3530 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3531 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3532 DAG.getConstant(13, DL, MVT::i32));
3533
3534 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3535 DAG.getConstant(0x1000, DL, MVT::i32));
3536
3537 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3538 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3539 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3540 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3541
3542 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3543 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3544 DAG.getConstant(0x7, DL, MVT::i32));
3545 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3546 DAG.getConstant(2, DL, MVT::i32));
3547 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3548 One, Zero, ISD::SETEQ);
3549 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3550 One, Zero, ISD::SETGT);
3551 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3552 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3553
3554 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3555 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3556 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3557 I, V, ISD::SETEQ);
3558
3559 // Extract the sign bit.
3560 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3561 DAG.getConstant(16, DL, MVT::i32));
3562 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3563 DAG.getConstant(0x8000, DL, MVT::i32));
3564
3565 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3566 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3567 }
3568
3569 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3570 SelectionDAG &DAG) const {
3571 SDValue Src = Op.getOperand(0);
3572 unsigned OpOpcode = Op.getOpcode();
3573 EVT SrcVT = Src.getValueType();
3574 EVT DestVT = Op.getValueType();
3575
3576 // Will be selected natively
3577 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3578 return Op;
3579
3580 if (SrcVT == MVT::bf16) {
3581 SDLoc DL(Op);
3582 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3583 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3584 }
3585
3586 // Promote i16 to i32
3587 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3588 SDLoc DL(Op);
3589
3590 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3591 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3592 }
3593
3594 if (DestVT != MVT::i64)
3595 return Op;
3596
3597 if (SrcVT == MVT::f16 ||
3598 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3599 SDLoc DL(Op);
3600
3601 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3602 unsigned Ext =
3603 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3604 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3605 }
3606
3607 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3608 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3609
3610 return SDValue();
3611 }
3612
3613 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3614 SelectionDAG &DAG) const {
3615 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3616 MVT VT = Op.getSimpleValueType();
3617 MVT ScalarVT = VT.getScalarType();
3618
3619 assert(VT.isVector());
3620
3621 SDValue Src = Op.getOperand(0);
3622 SDLoc DL(Op);
3623
3624 // TODO: Don't scalarize on Evergreen?
3625 unsigned NElts = VT.getVectorNumElements();
3626 SmallVector<SDValue, 8> Args;
3627 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3628
3629 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3630 for (unsigned I = 0; I < NElts; ++I)
3631 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3632
3633 return DAG.getBuildVector(VT, DL, Args);
3634 }
3635
3636 //===----------------------------------------------------------------------===//
3637 // Custom DAG optimizations
3638 //===----------------------------------------------------------------------===//
3639
3640 static bool isU24(SDValue Op, SelectionDAG &DAG) {
3641 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3642 }
3643
3644 static bool isI24(SDValue Op, SelectionDAG &DAG) {
3645 EVT VT = Op.getValueType();
3646 // Types smaller than 24 bits should be treated as unsigned 24-bit values.
3647 return VT.getSizeInBits() >= 24 &&
3648 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3649 }
3650
3651 static SDValue simplifyMul24(SDNode *Node24,
3652 TargetLowering::DAGCombinerInfo &DCI) {
3653 SelectionDAG &DAG = DCI.DAG;
3654 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3655 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3656
3657 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3658 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3659 unsigned NewOpcode = Node24->getOpcode();
3660 if (IsIntrin) {
3661 unsigned IID = Node24->getConstantOperandVal(0);
3662 switch (IID) {
3663 case Intrinsic::amdgcn_mul_i24:
3664 NewOpcode = AMDGPUISD::MUL_I24;
3665 break;
3666 case Intrinsic::amdgcn_mul_u24:
3667 NewOpcode = AMDGPUISD::MUL_U24;
3668 break;
3669 case Intrinsic::amdgcn_mulhi_i24:
3670 NewOpcode = AMDGPUISD::MULHI_I24;
3671 break;
3672 case Intrinsic::amdgcn_mulhi_u24:
3673 NewOpcode = AMDGPUISD::MULHI_U24;
3674 break;
3675 default:
3676 llvm_unreachable("Expected 24-bit mul intrinsic");
3677 }
3678 }
3679
3680 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3681
3682 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3683 // the operands to have other uses, but will only perform simplifications that
3684 // involve bypassing some nodes for this user.
3685 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3686 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3687 if (DemandedLHS || DemandedRHS)
3688 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3689 DemandedLHS ? DemandedLHS : LHS,
3690 DemandedRHS ? DemandedRHS : RHS);
3691
3692 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3693 // operands if this node is the only user.
3694 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3695 return SDValue(Node24, 0);
3696 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3697 return SDValue(Node24, 0);
3698
3699 return SDValue();
3700 }
3701
3702 template <typename IntTy>
3703 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3704 uint32_t Width, const SDLoc &DL) {
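// Editorial example: for Src0 = 0xABCD1234, Offset = 8, Width = 8, the shift
// left by 32 - 8 - 8 = 16 gives 0x12340000, and the (sign- or zero-) shift
// right by 32 - 8 = 24 extracts bits [15:8] = 0x12 with the requested
// signedness.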
3705 if (Width + Offset < 32) {
3706 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3707 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3708 return DAG.getConstant(Result, DL, MVT::i32);
3709 }
3710
3711 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3712 }
3713
3714 static bool hasVolatileUser(SDNode *Val) {
3715 for (SDNode *U : Val->uses()) {
3716 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3717 if (M->isVolatile())
3718 return true;
3719 }
3720 }
3721
3722 return false;
3723 }
3724
3725 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3726 // i32 vectors are the canonical memory type.
3727 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3728 return false;
3729
3730 if (!VT.isByteSized())
3731 return false;
3732
3733 unsigned Size = VT.getStoreSize();
3734
3735 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3736 return false;
3737
3738 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3739 return false;
3740
3741 return true;
3742 }
3743
3744 // Replace a load of an illegal type with a load of an equivalent friendlier
3745 // type, bitcast back to the original type.
3746 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3747 DAGCombinerInfo &DCI) const {
3748 if (!DCI.isBeforeLegalize())
3749 return SDValue();
3750
3751 LoadSDNode *LN = cast<LoadSDNode>(N);
3752 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3753 return SDValue();
3754
3755 SDLoc SL(N);
3756 SelectionDAG &DAG = DCI.DAG;
3757 EVT VT = LN->getMemoryVT();
3758
3759 unsigned Size = VT.getStoreSize();
3760 Align Alignment = LN->getAlign();
3761 if (Alignment < Size && isTypeLegal(VT)) {
3762 unsigned IsFast;
3763 unsigned AS = LN->getAddressSpace();
3764
3765 // Expand unaligned loads earlier than legalization. Due to visitation order
3766 // problems during legalization, the emitted instructions to pack and unpack
3767 // the bytes again are not eliminated in the case of an unaligned copy.
3768 if (!allowsMisalignedMemoryAccesses(
3769 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3770 if (VT.isVector())
3771 return SplitVectorLoad(SDValue(LN, 0), DAG);
3772
3773 SDValue Ops[2];
3774 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3775
3776 return DAG.getMergeValues(Ops, SDLoc(N));
3777 }
3778
3779 if (!IsFast)
3780 return SDValue();
3781 }
3782
3783 if (!shouldCombineMemoryType(VT))
3784 return SDValue();
3785
3786 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3787
3788 SDValue NewLoad
3789 = DAG.getLoad(NewVT, SL, LN->getChain(),
3790 LN->getBasePtr(), LN->getMemOperand());
3791
3792 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3793 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3794 return SDValue(N, 0);
3795 }
3796
3797 // Replace store of an illegal type with a store of a bitcast to a friendlier
3798 // type.
3799 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3800 DAGCombinerInfo &DCI) const {
3801 if (!DCI.isBeforeLegalize())
3802 return SDValue();
3803
3804 StoreSDNode *SN = cast<StoreSDNode>(N);
3805 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3806 return SDValue();
3807
3808 EVT VT = SN->getMemoryVT();
3809 unsigned Size = VT.getStoreSize();
3810
3811 SDLoc SL(N);
3812 SelectionDAG &DAG = DCI.DAG;
3813 Align Alignment = SN->getAlign();
3814 if (Alignment < Size && isTypeLegal(VT)) {
3815 unsigned IsFast;
3816 unsigned AS = SN->getAddressSpace();
3817
3818 // Expand unaligned stores earlier than legalization. Due to visitation
3819 // order problems during legalization, the emitted instructions to pack and
3820 // unpack the bytes again are not eliminated in the case of an unaligned
3821 // copy.
3822 if (!allowsMisalignedMemoryAccesses(
3823 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3824 if (VT.isVector())
3825 return SplitVectorStore(SDValue(SN, 0), DAG);
3826
3827 return expandUnalignedStore(SN, DAG);
3828 }
3829
3830 if (!IsFast)
3831 return SDValue();
3832 }
3833
3834 if (!shouldCombineMemoryType(VT))
3835 return SDValue();
3836
3837 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3838 SDValue Val = SN->getValue();
3839
3841
3842 bool OtherUses = !Val.hasOneUse();
3843 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3844 if (OtherUses) {
3845 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3846 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3847 }
3848
3849 return DAG.getStore(SN->getChain(), SL, CastVal,
3850 SN->getBasePtr(), SN->getMemOperand());
3851 }
3852
3853 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3854 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3855 // issues.
3856 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3857 DAGCombinerInfo &DCI) const {
3858 SelectionDAG &DAG = DCI.DAG;
3859 SDValue N0 = N->getOperand(0);
3860
3861 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3862 // (vt2 (truncate (assertzext vt0:x, vt1)))
3863 if (N0.getOpcode() == ISD::TRUNCATE) {
3864 SDValue N1 = N->getOperand(1);
3865 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3866 SDLoc SL(N);
3867
3868 SDValue Src = N0.getOperand(0);
3869 EVT SrcVT = Src.getValueType();
3870 if (SrcVT.bitsGE(ExtVT)) {
3871 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3872 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3873 }
3874 }
3875
3876 return SDValue();
3877 }
3878
3879 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3880 SDNode *N, DAGCombinerInfo &DCI) const {
3881 unsigned IID = N->getConstantOperandVal(0);
3882 switch (IID) {
3883 case Intrinsic::amdgcn_mul_i24:
3884 case Intrinsic::amdgcn_mul_u24:
3885 case Intrinsic::amdgcn_mulhi_i24:
3886 case Intrinsic::amdgcn_mulhi_u24:
3887 return simplifyMul24(N, DCI);
3888 case Intrinsic::amdgcn_fract:
3889 case Intrinsic::amdgcn_rsq:
3890 case Intrinsic::amdgcn_rcp_legacy:
3891 case Intrinsic::amdgcn_rsq_legacy:
3892 case Intrinsic::amdgcn_rsq_clamp: {
3893 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3894 SDValue Src = N->getOperand(1);
3895 return Src.isUndef() ? Src : SDValue();
3896 }
3897 case Intrinsic::amdgcn_frexp_exp: {
3898 // frexp_exp (fneg x) -> frexp_exp x
3899 // frexp_exp (fabs x) -> frexp_exp x
3900 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3901 SDValue Src = N->getOperand(1);
3902 SDValue PeekSign = peekFPSignOps(Src);
3903 if (PeekSign == Src)
3904 return SDValue();
3905 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3906 0);
3907 }
3908 default:
3909 return SDValue();
3910 }
3911 }
3912
3913 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3914 /// binary operation \p Opc to it with the corresponding constant operands.
3915 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3916 DAGCombinerInfo &DCI, const SDLoc &SL,
3917 unsigned Opc, SDValue LHS,
3918 uint32_t ValLo, uint32_t ValHi) const {
3919 SelectionDAG &DAG = DCI.DAG;
3920 SDValue Lo, Hi;
3921 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3922
3923 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3924 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3925
3926 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3927 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3928
3929 // Re-visit the ands. It's possible we eliminated one of them and it could
3930 // simplify the vector.
3931 DCI.AddToWorklist(Lo.getNode());
3932 DCI.AddToWorklist(Hi.getNode());
3933
3934 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3935 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3936 }
3937
3938 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3939 DAGCombinerInfo &DCI) const {
3940 EVT VT = N->getValueType(0);
3941
3942 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3943 if (!RHS)
3944 return SDValue();
3945
3946 SDValue LHS = N->getOperand(0);
3947 unsigned RHSVal = RHS->getZExtValue();
3948 if (!RHSVal)
3949 return LHS;
3950
3951 SDLoc SL(N);
3952 SelectionDAG &DAG = DCI.DAG;
3953
3954 switch (LHS->getOpcode()) {
3955 default:
3956 break;
3957 case ISD::ZERO_EXTEND:
3958 case ISD::SIGN_EXTEND:
3959 case ISD::ANY_EXTEND: {
3960 SDValue X = LHS->getOperand(0);
3961
3962 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3963 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3964 // Prefer build_vector as the canonical form if packed types are legal.
3965 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3966 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3967 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3968 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3969 }
3970
3971 // shl (ext x) => zext (shl x), if shift does not overflow int
3972 if (VT != MVT::i64)
3973 break;
3974 KnownBits Known = DAG.computeKnownBits(X);
3975 unsigned LZ = Known.countMinLeadingZeros();
3976 if (LZ < RHSVal)
3977 break;
3978 EVT XVT = X.getValueType();
3979 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3980 return DAG.getZExtOrTrunc(Shl, SL, VT);
3981 }
3982 }
3983
3984 if (VT != MVT::i64)
3985 return SDValue();
3986
3987 // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
3988
3989 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3990 // common case, splitting this into a move and a 32-bit shift is faster and
3991 // the same code size.
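// Editorial example: (shl i64:x, 40) becomes build_pair(0, shl(lo32(x), 8)):
// the low 32 result bits are all zero and the high 32 bits are the low half
// of x shifted by 40 - 32 = 8.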
3992 if (RHSVal < 32)
3993 return SDValue();
3994
3995 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3996
3997 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3998 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3999
4000 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4001
4002 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4003 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4004 }
4005
4006 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4007 DAGCombinerInfo &DCI) const {
4008 if (N->getValueType(0) != MVT::i64)
4009 return SDValue();
4010
4011 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4012 if (!RHS)
4013 return SDValue();
4014
4015 SelectionDAG &DAG = DCI.DAG;
4016 SDLoc SL(N);
4017 unsigned RHSVal = RHS->getZExtValue();
4018
4019 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4020 if (RHSVal == 32) {
4021 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4022 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4023 DAG.getConstant(31, SL, MVT::i32));
4024
4025 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4026 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4027 }
4028
4029 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4030 if (RHSVal == 63) {
4031 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4032 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4033 DAG.getConstant(31, SL, MVT::i32));
4034 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4035 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4036 }
4037
4038 return SDValue();
4039 }
4040
4041 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4042 DAGCombinerInfo &DCI) const {
4043 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4044 if (!RHS)
4045 return SDValue();
4046
4047 EVT VT = N->getValueType(0);
4048 SDValue LHS = N->getOperand(0);
4049 unsigned ShiftAmt = RHS->getZExtValue();
4050 SelectionDAG &DAG = DCI.DAG;
4051 SDLoc SL(N);
4052
4053 // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
4054 // this improves the ability to match BFE patterns in isel.
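// Editorial example: (srl (and x, 0xFF00), 8), where 0xFF00 is a shifted
// mask with MaskIdx = 8 = ShiftAmt, becomes (and (srl x, 8), 0xFF), the
// canonical shape the BFE selection patterns look for.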
4055 if (LHS.getOpcode() == ISD::AND) {
4056 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4057 unsigned MaskIdx, MaskLen;
4058 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4059 MaskIdx == ShiftAmt) {
4060 return DAG.getNode(
4061 ISD::AND, SL, VT,
4062 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4063 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4064 }
4065 }
4066 }
4067
4068 if (VT != MVT::i64)
4069 return SDValue();
4070
4071 if (ShiftAmt < 32)
4072 return SDValue();
4073
4074 // srl i64:x, C for C >= 32
4075 // =>
4076 // build_pair (srl hi_32(x), C - 32), 0
4077 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4078
4079 SDValue Hi = getHiHalf64(LHS, DAG);
4080
4081 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4082 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4083
4084 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4085
4086 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4087 }
4088
4089 SDValue AMDGPUTargetLowering::performTruncateCombine(
4090 SDNode *N, DAGCombinerInfo &DCI) const {
4091 SDLoc SL(N);
4092 SelectionDAG &DAG = DCI.DAG;
4093 EVT VT = N->getValueType(0);
4094 SDValue Src = N->getOperand(0);
4095
4096 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4097 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4098 SDValue Vec = Src.getOperand(0);
4099 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4100 SDValue Elt0 = Vec.getOperand(0);
4101 EVT EltVT = Elt0.getValueType();
4102 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4103 if (EltVT.isFloatingPoint()) {
4104 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4105 EltVT.changeTypeToInteger(), Elt0);
4106 }
4107
4108 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4109 }
4110 }
4111 }
4112
4113 // Equivalent of above for accessing the high element of a vector as an
4114 // integer operation.
4115 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4116 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4117 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4118 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4119 SDValue BV = stripBitcast(Src.getOperand(0));
4120 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4121 BV.getValueType().getVectorNumElements() == 2) {
4122 SDValue SrcElt = BV.getOperand(1);
4123 EVT SrcEltVT = SrcElt.getValueType();
4124 if (SrcEltVT.isFloatingPoint()) {
4125 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4126 SrcEltVT.changeTypeToInteger(), SrcElt);
4127 }
4128
4129 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4130 }
4131 }
4132 }
4133 }
4134
4135 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4136 //
4137 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4138 // i16 (trunc (srl (i32 (trunc x), K)))
4139 if (VT.getScalarSizeInBits() < 32) {
4140 EVT SrcVT = Src.getValueType();
4141 if (SrcVT.getScalarSizeInBits() > 32 &&
4142 (Src.getOpcode() == ISD::SRL ||
4143 Src.getOpcode() == ISD::SRA ||
4144 Src.getOpcode() == ISD::SHL)) {
4145 SDValue Amt = Src.getOperand(1);
4146 KnownBits Known = DAG.computeKnownBits(Amt);
4147
4148 // - For left shifts, do the transform as long as the shift
4149 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4150 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4151 // losing information stored in the high bits when truncating.
4152 const unsigned MaxCstSize =
4153 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4154 if (Known.getMaxValue().ule(MaxCstSize)) {
4155 EVT MidVT = VT.isVector() ?
4156 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4157 VT.getVectorNumElements()) : MVT::i32;
4158
4159 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4160 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4161 Src.getOperand(0));
4162 DCI.AddToWorklist(Trunc.getNode());
4163
4164 if (Amt.getValueType() != NewShiftVT) {
4165 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4166 DCI.AddToWorklist(Amt.getNode());
4167 }
4168
4169 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4170 Trunc, Amt);
4171 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4172 }
4173 }
4174 }
4175
4176 return SDValue();
4177 }
4178
4179 // We need to specifically handle i64 mul here to avoid unnecessary conversion
4180 // instructions. If we only match on the legalized i64 mul expansion,
4181 // SimplifyDemandedBits will be unable to remove them because there will be
4182 // multiple uses due to the separate mul + mulh[su].
4183 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4184 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4185 if (Size <= 32) {
4186 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4187 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4188 }
4189
4190 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4191 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4192
4193 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4194 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4195
4196 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4197 }
4198
4199 /// If \p V is an add of a constant 1, returns the other operand. Otherwise
4200 /// return SDValue().
4201 static SDValue getAddOneOp(const SDNode *V) {
4202 if (V->getOpcode() != ISD::ADD)
4203 return SDValue();
4204
4205 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4206 }
4207
4208 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4209 DAGCombinerInfo &DCI) const {
4210 EVT VT = N->getValueType(0);
4211
4212 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4213 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4214 // unnecessarily). isDivergent() is used as an approximation of whether the
4215 // value is in an SGPR.
4216 if (!N->isDivergent())
4217 return SDValue();
4218
4219 unsigned Size = VT.getSizeInBits();
4220 if (VT.isVector() || Size > 64)
4221 return SDValue();
4222
4223 SelectionDAG &DAG = DCI.DAG;
4224 SDLoc DL(N);
4225
4226 SDValue N0 = N->getOperand(0);
4227 SDValue N1 = N->getOperand(1);
4228
4229 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4230 // matching.
4231
4232 // mul x, (add y, 1) -> add (mul x, y), x
4233 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4234 SDValue AddOp = getAddOneOp(V.getNode());
4235 if (!AddOp)
4236 return SDValue();
4237
4238 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4239 return U->getOpcode() == ISD::MUL;
4240 }))
4241 return AddOp;
4242
4243 return SDValue();
4244 };
4245
4246 // FIXME: The selection pattern is not properly checking for commuted
4247 // operands, so we have to place the mul in the LHS
4248 if (SDValue MulOper = IsFoldableAdd(N0)) {
4249 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4250 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4251 }
4252
4253 if (SDValue MulOper = IsFoldableAdd(N1)) {
4254 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4255 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4256 }
4257
4258 // Skip if already mul24.
4259 if (N->getOpcode() != ISD::MUL)
4260 return SDValue();
4261
4262 // There are i16 integer mul/mad.
4263 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4264 return SDValue();
4265
4266 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4267 // in the source into any_extends if the result of the mul is truncated. Since
4268 // we can assume the high bits are whatever we want, use the underlying value
4269 // to keep the unknown high bits from interfering.
4270 if (N0.getOpcode() == ISD::ANY_EXTEND)
4271 N0 = N0.getOperand(0);
4272
4273 if (N1.getOpcode() == ISD::ANY_EXTEND)
4274 N1 = N1.getOperand(0);
4275
4276 SDValue Mul;
4277
4278 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4279 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4280 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4281 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4282 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4283 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4284 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4285 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4286 } else {
4287 return SDValue();
4288 }
4289
4290 // We need to use sext even for MUL_U24, because MUL_U24 is used
4291 // for signed multiply of 8 and 16-bit types.
4292 return DAG.getSExtOrTrunc(Mul, DL, VT);
4293 }
4294
4295 SDValue
4296 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4297 DAGCombinerInfo &DCI) const {
4298 if (N->getValueType(0) != MVT::i32)
4299 return SDValue();
4300
4301 SelectionDAG &DAG = DCI.DAG;
4302 SDLoc DL(N);
4303
4304 SDValue N0 = N->getOperand(0);
4305 SDValue N1 = N->getOperand(1);
4306
4307 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4308 // in the source into any_extends if the result of the mul is truncated. Since
4309 // we can assume the high bits are whatever we want, use the underlying value
4310 // to avoid the unknown high bits from interfering.
4311 if (N0.getOpcode() == ISD::ANY_EXTEND)
4312 N0 = N0.getOperand(0);
4313 if (N1.getOpcode() == ISD::ANY_EXTEND)
4314 N1 = N1.getOperand(0);
4315
4316 // Try to use two fast 24-bit multiplies (one for each half of the result)
4317 // instead of one slow extending multiply.
4318 unsigned LoOpcode, HiOpcode;
4319 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4320 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4321 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4322 LoOpcode = AMDGPUISD::MUL_U24;
4323 HiOpcode = AMDGPUISD::MULHI_U24;
4324 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4325 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4326 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4327 LoOpcode = AMDGPUISD::MUL_I24;
4328 HiOpcode = AMDGPUISD::MULHI_I24;
4329 } else {
4330 return SDValue();
4331 }
4332
4333 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4334 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4335 DCI.CombineTo(N, Lo, Hi);
4336 return SDValue(N, 0);
4337 }
4338
4339 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4340 DAGCombinerInfo &DCI) const {
4341 EVT VT = N->getValueType(0);
4342
4343 if (!Subtarget->hasMulI24() || VT.isVector())
4344 return SDValue();
4345
4346 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4347 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4348 // unnecessarily). isDivergent() is used as an approximation of whether the
4349 // value is in an SGPR.
4350   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4351   // VALU op anyway).
4352 if (Subtarget->hasSMulHi() && !N->isDivergent())
4353 return SDValue();
4354
4355 SelectionDAG &DAG = DCI.DAG;
4356 SDLoc DL(N);
4357
4358 SDValue N0 = N->getOperand(0);
4359 SDValue N1 = N->getOperand(1);
4360
4361 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4362 return SDValue();
4363
4364 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4365 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4366
4367 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4368 DCI.AddToWorklist(Mulhi.getNode());
4369 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4370 }
4371
4372 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4373 DAGCombinerInfo &DCI) const {
4374 EVT VT = N->getValueType(0);
4375
4376 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4377 return SDValue();
4378
4379 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4380 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4381 // unnecessarily). isDivergent() is used as an approximation of whether the
4382 // value is in an SGPR.
4383   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4384   // VALU op anyway).
4385 if (Subtarget->hasSMulHi() && !N->isDivergent())
4386 return SDValue();
4387
4388 SelectionDAG &DAG = DCI.DAG;
4389 SDLoc DL(N);
4390
4391 SDValue N0 = N->getOperand(0);
4392 SDValue N1 = N->getOperand(1);
4393
4394 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4395 return SDValue();
4396
4397 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4398 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4399
4400 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4401 DCI.AddToWorklist(Mulhi.getNode());
4402 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4403 }
4404
4405 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4406 SDValue Op,
4407 const SDLoc &DL,
4408 unsigned Opc) const {
4409 EVT VT = Op.getValueType();
4410 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4411 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4412 LegalVT != MVT::i16))
4413 return SDValue();
4414
4415 if (VT != MVT::i32)
4416 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4417
4418 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4419 if (VT != MVT::i32)
4420 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4421
4422 return FFBX;
4423 }
4424
4425 // The native instructions return -1 on 0 input. Optimize out a select that
4426 // produces -1 on 0.
4427 //
4428 // TODO: If zero is not undef, we could also do this if the output is compared
4429 // against the bitwidth.
4430 //
4431 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4432 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4433 SDValue LHS, SDValue RHS,
4434 DAGCombinerInfo &DCI) const {
4435 if (!isNullConstant(Cond.getOperand(1)))
4436 return SDValue();
4437
4438 SelectionDAG &DAG = DCI.DAG;
4439 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4440 SDValue CmpLHS = Cond.getOperand(0);
4441
4442 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4443 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4444 if (CCOpcode == ISD::SETEQ &&
4445 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4446 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4447 unsigned Opc =
4448 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4449 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4450 }
4451
4452 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4453 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4454 if (CCOpcode == ISD::SETNE &&
4455 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4456 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4457 unsigned Opc =
4458 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4459
4460 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4461 }
4462
4463 return SDValue();
4464 }
4465
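// Move a unary operation above a select when both arms apply it:
// (select c, (op x), (op y)) -> (op (select c, x, y)).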
4466 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4467 unsigned Op,
4468 const SDLoc &SL,
4469 SDValue Cond,
4470 SDValue N1,
4471 SDValue N2) {
4472 SelectionDAG &DAG = DCI.DAG;
4473 EVT VT = N1.getValueType();
4474
4475 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4476 N1.getOperand(0), N2.getOperand(0));
4477 DCI.AddToWorklist(NewSelect.getNode());
4478 return DAG.getNode(Op, SL, VT, NewSelect);
4479 }
4480
4481 // Pull a free FP operation out of a select so it may fold into uses.
4482 //
4483 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4484 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
4485 //
4486 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4487 // select c, (fabs x), +k -> fabs (select c, x, k)
4488 SDValue
4489 AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4490 SDValue N) const {
4491 SelectionDAG &DAG = DCI.DAG;
4492 SDValue Cond = N.getOperand(0);
4493 SDValue LHS = N.getOperand(1);
4494 SDValue RHS = N.getOperand(2);
4495
4496 EVT VT = N.getValueType();
4497 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4498 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4499 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4500 return SDValue();
4501
4502 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4503 SDLoc(N), Cond, LHS, RHS);
4504 }
4505
4506 bool Inv = false;
4507 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4508 std::swap(LHS, RHS);
4509 Inv = true;
4510 }
4511
4512 // TODO: Support vector constants.
4513 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4514 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4515 !selectSupportsSourceMods(N.getNode())) {
4516 SDLoc SL(N);
4517 // If one side is an fneg/fabs and the other is a constant, we can push the
4518 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4519 SDValue NewLHS = LHS.getOperand(0);
4520 SDValue NewRHS = RHS;
4521
4522 // Careful: if the neg can be folded up, don't try to pull it back down.
4523 bool ShouldFoldNeg = true;
4524
4525 if (NewLHS.hasOneUse()) {
4526 unsigned Opc = NewLHS.getOpcode();
4527 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4528 ShouldFoldNeg = false;
4529 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4530 ShouldFoldNeg = false;
4531 }
4532
4533 if (ShouldFoldNeg) {
4534 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4535 return SDValue();
4536
4537 // We're going to be forced to use a source modifier anyway, there's no
4538 // point to pulling the negate out unless we can get a size reduction by
4539 // negating the constant.
4540 //
4541 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4542 // about cheaper constants.
4543 if (NewLHS.getOpcode() == ISD::FABS &&
4544 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4545 return SDValue();
4546
4547 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4548 return SDValue();
4549
4550 if (LHS.getOpcode() == ISD::FNEG)
4551 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4552
4553 if (Inv)
4554 std::swap(NewLHS, NewRHS);
4555
4556 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4557 Cond, NewLHS, NewRHS);
4558 DCI.AddToWorklist(NewSelect.getNode());
4559 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4560 }
4561 }
4562
4563 return SDValue();
4564 }
4565
4566 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4567 DAGCombinerInfo &DCI) const {
4568 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4569 return Folded;
4570
4571 SDValue Cond = N->getOperand(0);
4572 if (Cond.getOpcode() != ISD::SETCC)
4573 return SDValue();
4574
4575 EVT VT = N->getValueType(0);
4576 SDValue LHS = Cond.getOperand(0);
4577 SDValue RHS = Cond.getOperand(1);
4578 SDValue CC = Cond.getOperand(2);
4579
4580 SDValue True = N->getOperand(1);
4581 SDValue False = N->getOperand(2);
4582
4583 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4584 SelectionDAG &DAG = DCI.DAG;
4585 if (DAG.isConstantValueOfAnyType(True) &&
4586 !DAG.isConstantValueOfAnyType(False)) {
4587 // Swap cmp + select pair to move constant to false input.
4588 // This will allow using VOPC cndmasks more often.
4589 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4590
4591 SDLoc SL(N);
4592 ISD::CondCode NewCC =
4593 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4594
4595 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4596 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4597 }
4598
4599 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4600 SDValue MinMax
4601 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4602 // Revisit this node so we can catch min3/max3/med3 patterns.
4603 //DCI.AddToWorklist(MinMax.getNode());
4604 return MinMax;
4605 }
4606 }
4607
4608 // There's no reason to not do this if the condition has other uses.
4609 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4610 }
4611
4612 static bool isInv2Pi(const APFloat &APF) {
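  // Exact bit patterns of 1/(2*pi) (~0.15915494) in half, single, and double
  // precision; only a bitwise match counts as the inline immediate.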
4613 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4614 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4615 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4616
4617 return APF.bitwiseIsEqual(KF16) ||
4618 APF.bitwiseIsEqual(KF32) ||
4619 APF.bitwiseIsEqual(KF64);
4620 }
4621
4622 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4623 // additional cost to negate them.
4624 TargetLowering::NegatibleCost
4625 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4626 if (C->isZero())
4627 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4628
4629 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4630 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4631
4632 return NegatibleCost::Neutral;
4633 }
4634
4635 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4636 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4637 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4638 return false;
4639 }
4640
4641 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4642 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4643 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4644 return false;
4645 }
4646
4647 static unsigned inverseMinMax(unsigned Opc) {
4648 switch (Opc) {
4649 case ISD::FMAXNUM:
4650 return ISD::FMINNUM;
4651 case ISD::FMINNUM:
4652 return ISD::FMAXNUM;
4653 case ISD::FMAXNUM_IEEE:
4654 return ISD::FMINNUM_IEEE;
4655 case ISD::FMINNUM_IEEE:
4656 return ISD::FMAXNUM_IEEE;
4657 case ISD::FMAXIMUM:
4658 return ISD::FMINIMUM;
4659 case ISD::FMINIMUM:
4660 return ISD::FMAXIMUM;
4661 case AMDGPUISD::FMAX_LEGACY:
4662 return AMDGPUISD::FMIN_LEGACY;
4663 case AMDGPUISD::FMIN_LEGACY:
4664 return AMDGPUISD::FMAX_LEGACY;
4665 default:
4666 llvm_unreachable("invalid min/max opcode");
4667 }
4668 }
4669
4670 /// \return true if it's profitable to try to push an fneg into its source
4671 /// instruction.
4672 bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4673 // If the input has multiple uses and we can either fold the negate down, or
4674 // the other uses cannot, give up. This both prevents unprofitable
4675 // transformations and infinite loops: we won't repeatedly try to fold around
4676 // a negate that has no 'good' form.
4677 if (N0.hasOneUse()) {
4678 // This may be able to fold into the source, but at a code size cost. Don't
4679 // fold if the fold into the user is free.
4680 if (allUsesHaveSourceMods(N, 0))
4681 return false;
4682 } else {
4683 if (fnegFoldsIntoOp(N0.getNode()) &&
4684 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4685 return false;
4686 }
4687
4688 return true;
4689 }
4690
4691 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4692 DAGCombinerInfo &DCI) const {
4693 SelectionDAG &DAG = DCI.DAG;
4694 SDValue N0 = N->getOperand(0);
4695 EVT VT = N->getValueType(0);
4696
4697 unsigned Opc = N0.getOpcode();
4698
4699 if (!shouldFoldFNegIntoSrc(N, N0))
4700 return SDValue();
4701
4702 SDLoc SL(N);
4703 switch (Opc) {
4704 case ISD::FADD: {
4705 if (!mayIgnoreSignedZero(N0))
4706 return SDValue();
4707
4708 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4709 SDValue LHS = N0.getOperand(0);
4710 SDValue RHS = N0.getOperand(1);
4711
4712 if (LHS.getOpcode() != ISD::FNEG)
4713 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4714 else
4715 LHS = LHS.getOperand(0);
4716
4717 if (RHS.getOpcode() != ISD::FNEG)
4718 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4719 else
4720 RHS = RHS.getOperand(0);
4721
4722 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4723 if (Res.getOpcode() != ISD::FADD)
4724 return SDValue(); // Op got folded away.
4725 if (!N0.hasOneUse())
4726 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4727 return Res;
4728 }
4729 case ISD::FMUL:
4730 case AMDGPUISD::FMUL_LEGACY: {
4731 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4732 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4733 SDValue LHS = N0.getOperand(0);
4734 SDValue RHS = N0.getOperand(1);
4735
4736 if (LHS.getOpcode() == ISD::FNEG)
4737 LHS = LHS.getOperand(0);
4738 else if (RHS.getOpcode() == ISD::FNEG)
4739 RHS = RHS.getOperand(0);
4740 else
4741 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4742
4743 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4744 if (Res.getOpcode() != Opc)
4745 return SDValue(); // Op got folded away.
4746 if (!N0.hasOneUse())
4747 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4748 return Res;
4749 }
4750 case ISD::FMA:
4751 case ISD::FMAD: {
4752 // TODO: handle llvm.amdgcn.fma.legacy
4753 if (!mayIgnoreSignedZero(N0))
4754 return SDValue();
4755
4756 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4757 SDValue LHS = N0.getOperand(0);
4758 SDValue MHS = N0.getOperand(1);
4759 SDValue RHS = N0.getOperand(2);
4760
4761 if (LHS.getOpcode() == ISD::FNEG)
4762 LHS = LHS.getOperand(0);
4763 else if (MHS.getOpcode() == ISD::FNEG)
4764 MHS = MHS.getOperand(0);
4765 else
4766 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4767
4768 if (RHS.getOpcode() != ISD::FNEG)
4769 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4770 else
4771 RHS = RHS.getOperand(0);
4772
4773 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4774 if (Res.getOpcode() != Opc)
4775 return SDValue(); // Op got folded away.
4776 if (!N0.hasOneUse())
4777 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4778 return Res;
4779 }
4780 case ISD::FMAXNUM:
4781 case ISD::FMINNUM:
4782 case ISD::FMAXNUM_IEEE:
4783 case ISD::FMINNUM_IEEE:
4784 case ISD::FMINIMUM:
4785 case ISD::FMAXIMUM:
4786 case AMDGPUISD::FMAX_LEGACY:
4787 case AMDGPUISD::FMIN_LEGACY: {
4788 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4789 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4790 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4791 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4792
4793 SDValue LHS = N0.getOperand(0);
4794 SDValue RHS = N0.getOperand(1);
4795
4796 // 0 doesn't have a negated inline immediate.
4797 // TODO: This constant check should be generalized to other operations.
4798 if (isConstantCostlierToNegate(RHS))
4799 return SDValue();
4800
4801 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4802 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4803 unsigned Opposite = inverseMinMax(Opc);
4804
4805 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4806 if (Res.getOpcode() != Opposite)
4807 return SDValue(); // Op got folded away.
4808 if (!N0.hasOneUse())
4809 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4810 return Res;
4811 }
4812 case AMDGPUISD::FMED3: {
4813 SDValue Ops[3];
4814 for (unsigned I = 0; I < 3; ++I)
4815 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4816
4817 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4818 if (Res.getOpcode() != AMDGPUISD::FMED3)
4819 return SDValue(); // Op got folded away.
4820
4821 if (!N0.hasOneUse()) {
4822 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4823 DAG.ReplaceAllUsesWith(N0, Neg);
4824
4825 for (SDNode *U : Neg->uses())
4826 DCI.AddToWorklist(U);
4827 }
4828
4829 return Res;
4830 }
4831 case ISD::FP_EXTEND:
4832 case ISD::FTRUNC:
4833 case ISD::FRINT:
4834 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4835 case ISD::FROUNDEVEN:
4836 case ISD::FSIN:
4837 case ISD::FCANONICALIZE:
4838 case AMDGPUISD::RCP:
4839 case AMDGPUISD::RCP_LEGACY:
4840 case AMDGPUISD::RCP_IFLAG:
4841 case AMDGPUISD::SIN_HW: {
4842 SDValue CvtSrc = N0.getOperand(0);
4843 if (CvtSrc.getOpcode() == ISD::FNEG) {
4844 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4845 // (fneg (rcp (fneg x))) -> (rcp x)
4846 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4847 }
4848
4849 if (!N0.hasOneUse())
4850 return SDValue();
4851
4852 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4853 // (fneg (rcp x)) -> (rcp (fneg x))
4854 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4855 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4856 }
4857 case ISD::FP_ROUND: {
4858 SDValue CvtSrc = N0.getOperand(0);
4859
4860 if (CvtSrc.getOpcode() == ISD::FNEG) {
4861 // (fneg (fp_round (fneg x))) -> (fp_round x)
4862 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4863 CvtSrc.getOperand(0), N0.getOperand(1));
4864 }
4865
4866 if (!N0.hasOneUse())
4867 return SDValue();
4868
4869 // (fneg (fp_round x)) -> (fp_round (fneg x))
4870 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4871 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4872 }
4873 case ISD::FP16_TO_FP: {
4874 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4875 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4876 // Put the fneg back as a legal source operation that can be matched later.
4877 SDLoc SL(N);
4878
4879 SDValue Src = N0.getOperand(0);
4880 EVT SrcVT = Src.getValueType();
4881
4882 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4883 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4884 DAG.getConstant(0x8000, SL, SrcVT));
4885 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4886 }
4887 case ISD::SELECT: {
4888 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4889 // TODO: Invert conditions of foldFreeOpFromSelect
4890 return SDValue();
4891 }
4892 case ISD::BITCAST: {
4893 SDLoc SL(N);
4894 SDValue BCSrc = N0.getOperand(0);
4895 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4896 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4897 if (HighBits.getValueType().getSizeInBits() != 32 ||
4898 !fnegFoldsIntoOp(HighBits.getNode()))
4899 return SDValue();
4900
4901       // f64 fneg only really needs to operate on the high half of the
4902 // register, so try to force it to an f32 operation to help make use of
4903 // source modifiers.
4904 //
4905 //
4906 // fneg (f64 (bitcast (build_vector x, y))) ->
4907 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4908 // (fneg (bitcast i32:y to f32)))
4909
4910 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4911 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4912 SDValue CastBack =
4913 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4914
4915 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4916 Ops.back() = CastBack;
4917 DCI.AddToWorklist(NegHi.getNode());
4918 SDValue Build =
4919 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4920 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4921
4922 if (!N0.hasOneUse())
4923 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4924 return Result;
4925 }
4926
4927 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4928 BCSrc.hasOneUse()) {
4929 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4930 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4931
4932 // TODO: Cast back result for multiple uses is beneficial in some cases.
4933
4934 SDValue LHS =
4935 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4936 SDValue RHS =
4937 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4938
4939 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4940 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4941
4942 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4943 NegRHS);
4944 }
4945
4946 return SDValue();
4947 }
4948 default:
4949 return SDValue();
4950 }
4951 }
4952
4953 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4954 DAGCombinerInfo &DCI) const {
4955 SelectionDAG &DAG = DCI.DAG;
4956 SDValue N0 = N->getOperand(0);
4957
4958 if (!N0.hasOneUse())
4959 return SDValue();
4960
4961 switch (N0.getOpcode()) {
4962 case ISD::FP16_TO_FP: {
4963 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4964 SDLoc SL(N);
4965 SDValue Src = N0.getOperand(0);
4966 EVT SrcVT = Src.getValueType();
4967
4968 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4969 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4970 DAG.getConstant(0x7fff, SL, SrcVT));
4971 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4972 }
4973 default:
4974 return SDValue();
4975 }
4976 }
4977
4978 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4979 DAGCombinerInfo &DCI) const {
4980 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4981 if (!CFP)
4982 return SDValue();
4983
4984 // XXX - Should this flush denormals?
4985 const APFloat &Val = CFP->getValueAPF();
4986 APFloat One(Val.getSemantics(), "1.0");
4987 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4988 }
4989
4990 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4991 DAGCombinerInfo &DCI) const {
4992 SelectionDAG &DAG = DCI.DAG;
4993 SDLoc DL(N);
4994
4995 switch(N->getOpcode()) {
4996 default:
4997 break;
4998 case ISD::BITCAST: {
4999 EVT DestVT = N->getValueType(0);
5000
5001 // Push casts through vector builds. This helps avoid emitting a large
5002 // number of copies when materializing floating point vector constants.
5003 //
5004 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5005     // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5006 if (DestVT.isVector()) {
5007 SDValue Src = N->getOperand(0);
5008 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5009 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5010 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5011 EVT SrcVT = Src.getValueType();
5012 unsigned NElts = DestVT.getVectorNumElements();
5013
5014 if (SrcVT.getVectorNumElements() == NElts) {
5015 EVT DestEltVT = DestVT.getVectorElementType();
5016
5017 SmallVector<SDValue, 8> CastedElts;
5018 SDLoc SL(N);
5019 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5020 SDValue Elt = Src.getOperand(I);
5021 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5022 }
5023
5024 return DAG.getBuildVector(DestVT, SL, CastedElts);
5025 }
5026 }
5027 }
5028
5029 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5030 break;
5031
5032 // Fold bitcasts of constants.
5033 //
5034 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5035 // TODO: Generalize and move to DAGCombiner
5036 SDValue Src = N->getOperand(0);
5037 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5038 SDLoc SL(N);
5039 uint64_t CVal = C->getZExtValue();
5040 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5041 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5042 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5043 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5044 }
5045
5046 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5047 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5048 SDLoc SL(N);
5049 uint64_t CVal = Val.getZExtValue();
5050 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5051 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5052 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5053
5054 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5055 }
5056
5057 break;
5058 }
5059 case ISD::SHL: {
5060 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5061 break;
5062
5063 return performShlCombine(N, DCI);
5064 }
5065 case ISD::SRL: {
5066 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5067 break;
5068
5069 return performSrlCombine(N, DCI);
5070 }
5071 case ISD::SRA: {
5072 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5073 break;
5074
5075 return performSraCombine(N, DCI);
5076 }
5077 case ISD::TRUNCATE:
5078 return performTruncateCombine(N, DCI);
5079 case ISD::MUL:
5080 return performMulCombine(N, DCI);
5081 case AMDGPUISD::MUL_U24:
5082 case AMDGPUISD::MUL_I24: {
5083 if (SDValue Simplified = simplifyMul24(N, DCI))
5084 return Simplified;
5085 return performMulCombine(N, DCI);
5086 }
5087 case AMDGPUISD::MULHI_I24:
5088 case AMDGPUISD::MULHI_U24:
5089 return simplifyMul24(N, DCI);
5090 case ISD::SMUL_LOHI:
5091 case ISD::UMUL_LOHI:
5092 return performMulLoHiCombine(N, DCI);
5093 case ISD::MULHS:
5094 return performMulhsCombine(N, DCI);
5095 case ISD::MULHU:
5096 return performMulhuCombine(N, DCI);
5097 case ISD::SELECT:
5098 return performSelectCombine(N, DCI);
5099 case ISD::FNEG:
5100 return performFNegCombine(N, DCI);
5101 case ISD::FABS:
5102 return performFAbsCombine(N, DCI);
5103 case AMDGPUISD::BFE_I32:
5104 case AMDGPUISD::BFE_U32: {
5105 assert(!N->getValueType(0).isVector() &&
5106 "Vector handling of BFE not implemented");
5107 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5108 if (!Width)
5109 break;
5110
5111 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5112 if (WidthVal == 0)
5113 return DAG.getConstant(0, DL, MVT::i32);
5114
5115 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5116 if (!Offset)
5117 break;
5118
5119 SDValue BitsFrom = N->getOperand(0);
5120 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5121
5122 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5123
5124 if (OffsetVal == 0) {
5125 // This is already sign / zero extended, so try to fold away extra BFEs.
5126 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
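      // A signed BFE of width W at offset 0 produces 32 - W + 1 sign bits; if
      // the source already has at least that many, the extract is a no-op.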
5127
5128 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5129 if (OpSignBits >= SignBits)
5130 return BitsFrom;
5131
5132 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5133 if (Signed) {
5134 // This is a sign_extend_inreg. Replace it to take advantage of existing
5135 // DAG Combines. If not eliminated, we will match back to BFE during
5136 // selection.
5137
5138       // TODO: The sext_inreg of extended types ends up as separate operations,
5139       // although we could handle them in a single BFE.
5140 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5141 DAG.getValueType(SmallVT));
5142 }
5143
5144 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5145 }
5146
5147 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5148 if (Signed) {
5149 return constantFoldBFE<int32_t>(DAG,
5150 CVal->getSExtValue(),
5151 OffsetVal,
5152 WidthVal,
5153 DL);
5154 }
5155
5156 return constantFoldBFE<uint32_t>(DAG,
5157 CVal->getZExtValue(),
5158 OffsetVal,
5159 WidthVal,
5160 DL);
5161 }
5162
5163 if ((OffsetVal + WidthVal) >= 32 &&
5164 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5165 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5166 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5167 BitsFrom, ShiftVal);
5168 }
5169
5170 if (BitsFrom.hasOneUse()) {
5171 APInt Demanded = APInt::getBitsSet(32,
5172 OffsetVal,
5173 OffsetVal + WidthVal);
5174
5175 KnownBits Known;
5176 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5177 !DCI.isBeforeLegalizeOps());
5178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5179 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5180 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5181 DCI.CommitTargetLoweringOpt(TLO);
5182 }
5183 }
5184
5185 break;
5186 }
5187 case ISD::LOAD:
5188 return performLoadCombine(N, DCI);
5189 case ISD::STORE:
5190 return performStoreCombine(N, DCI);
5191 case AMDGPUISD::RCP:
5192 case AMDGPUISD::RCP_IFLAG:
5193 return performRcpCombine(N, DCI);
5194 case ISD::AssertZext:
5195 case ISD::AssertSext:
5196 return performAssertSZExtCombine(N, DCI);
5197 case ISD::INTRINSIC_WO_CHAIN:
5198 return performIntrinsicWOChainCombine(N, DCI);
5199 case AMDGPUISD::FMAD_FTZ: {
5200 SDValue N0 = N->getOperand(0);
5201 SDValue N1 = N->getOperand(1);
5202 SDValue N2 = N->getOperand(2);
5203 EVT VT = N->getValueType(0);
5204
5205 // FMAD_FTZ is a FMAD + flush denormals to zero.
5206 // We flush the inputs, the intermediate step, and the output.
5207 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5208 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5209 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5210 if (N0CFP && N1CFP && N2CFP) {
5211 const auto FTZ = [](const APFloat &V) {
5212 if (V.isDenormal()) {
5213 APFloat Zero(V.getSemantics(), 0);
5214 return V.isNegative() ? -Zero : Zero;
5215 }
5216 return V;
5217 };
5218
5219 APFloat V0 = FTZ(N0CFP->getValueAPF());
5220 APFloat V1 = FTZ(N1CFP->getValueAPF());
5221 APFloat V2 = FTZ(N2CFP->getValueAPF());
5222 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5223 V0 = FTZ(V0);
5224 V0.add(V2, APFloat::rmNearestTiesToEven);
5225 return DAG.getConstantFP(FTZ(V0), DL, VT);
5226 }
5227 break;
5228 }
5229 }
5230 return SDValue();
5231 }
5232
5233 //===----------------------------------------------------------------------===//
5234 // Helper functions
5235 //===----------------------------------------------------------------------===//
5236
5237 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5238 const TargetRegisterClass *RC,
5239 Register Reg, EVT VT,
5240 const SDLoc &SL,
5241 bool RawReg) const {
5242 MachineFunction &MF = DAG.getMachineFunction();
5243 MachineRegisterInfo &MRI = MF.getRegInfo();
5244 Register VReg;
5245
5246 if (!MRI.isLiveIn(Reg)) {
5247 VReg = MRI.createVirtualRegister(RC);
5248 MRI.addLiveIn(Reg, VReg);
5249 } else {
5250 VReg = MRI.getLiveInVirtReg(Reg);
5251 }
5252
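  // With RawReg the caller gets the virtual register itself rather than a
  // copy chained to the function entry.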
5253 if (RawReg)
5254 return DAG.getRegister(VReg, VT);
5255
5256 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5257 }
5258
5259 // This may be called multiple times, and nothing prevents creating multiple
5260 // objects at the same offset. See if we already defined this object.
5261 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5262 int64_t Offset) {
5263 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5264 if (MFI.getObjectOffset(I) == Offset) {
5265 assert(MFI.getObjectSize(I) == Size);
5266 return I;
5267 }
5268 }
5269
5270 return MFI.CreateFixedObject(Size, Offset, true);
5271 }
5272
5273 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5274 EVT VT,
5275 const SDLoc &SL,
5276 int64_t Offset) const {
5277 MachineFunction &MF = DAG.getMachineFunction();
5278 MachineFrameInfo &MFI = MF.getFrameInfo();
5279 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5280
5281 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5282 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5283
5284 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5285 MachineMemOperand::MODereferenceable |
5286 MachineMemOperand::MOInvariant);
5287 }
5288
5289 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5290 const SDLoc &SL,
5291 SDValue Chain,
5292 SDValue ArgVal,
5293 int64_t Offset) const {
5294 MachineFunction &MF = DAG.getMachineFunction();
5295 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5296 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5297
5298 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5299 // Stores to the argument stack area are relative to the stack pointer.
5300 SDValue SP =
5301 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5302 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5303 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5304 MachineMemOperand::MODereferenceable);
5305 return Store;
5306 }
5307
5308 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5309 const TargetRegisterClass *RC,
5310 EVT VT, const SDLoc &SL,
5311 const ArgDescriptor &Arg) const {
5312 assert(Arg && "Attempting to load missing argument");
5313
5314 SDValue V = Arg.isRegister() ?
5315 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5316 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5317
5318 if (!Arg.isMasked())
5319 return V;
5320
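  // e.g. with a (hypothetical) mask of 0xFFC00 covering bits [19:10]: shift
  // right by countr_zero(Mask) = 10, then AND with Mask >> 10 = 0x3FF.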
5321 unsigned Mask = Arg.getMask();
5322 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5323 V = DAG.getNode(ISD::SRL, SL, VT, V,
5324 DAG.getShiftAmountConstant(Shift, VT, SL));
5325 return DAG.getNode(ISD::AND, SL, VT, V,
5326 DAG.getConstant(Mask >> Shift, SL, VT));
5327 }
5328
5329 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5330 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5331 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5332 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
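  // Implicit arguments start at the end of the explicit kernarg block, rounded
  // up to the implicit-arg pointer alignment.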
5333 uint64_t ArgOffset =
5334 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5335 switch (Param) {
5336 case FIRST_IMPLICIT:
5337 return ArgOffset;
5338 case PRIVATE_BASE:
5339 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5340 case SHARED_BASE:
5341 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5342 case QUEUE_PTR:
5343 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5344 }
5345 llvm_unreachable("unexpected implicit parameter type");
5346 }
5347
5348 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5349 const MachineFunction &MF, const ImplicitParameter Param) const {
5350 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5351 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5352 }
5353
5354 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5355
5356 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5357 switch ((AMDGPUISD::NodeType)Opcode) {
5358 case AMDGPUISD::FIRST_NUMBER: break;
5359 // AMDIL DAG nodes
5360 NODE_NAME_CASE(UMUL);
5361 NODE_NAME_CASE(BRANCH_COND);
5362
5363 // AMDGPU DAG nodes
5364 NODE_NAME_CASE(IF)
5365 NODE_NAME_CASE(ELSE)
5366 NODE_NAME_CASE(LOOP)
5367 NODE_NAME_CASE(CALL)
5368 NODE_NAME_CASE(TC_RETURN)
5369 NODE_NAME_CASE(TC_RETURN_GFX)
5370 NODE_NAME_CASE(TC_RETURN_CHAIN)
5371 NODE_NAME_CASE(TRAP)
5372 NODE_NAME_CASE(RET_GLUE)
5373 NODE_NAME_CASE(WAVE_ADDRESS)
5374 NODE_NAME_CASE(RETURN_TO_EPILOG)
5375 NODE_NAME_CASE(ENDPGM)
5376 NODE_NAME_CASE(ENDPGM_TRAP)
5377 NODE_NAME_CASE(DWORDADDR)
5378 NODE_NAME_CASE(FRACT)
5379 NODE_NAME_CASE(SETCC)
5380 NODE_NAME_CASE(SETREG)
5381 NODE_NAME_CASE(DENORM_MODE)
5382 NODE_NAME_CASE(FMA_W_CHAIN)
5383 NODE_NAME_CASE(FMUL_W_CHAIN)
5384 NODE_NAME_CASE(CLAMP)
5385 NODE_NAME_CASE(COS_HW)
5386 NODE_NAME_CASE(SIN_HW)
5387 NODE_NAME_CASE(FMAX_LEGACY)
5388 NODE_NAME_CASE(FMIN_LEGACY)
5389 NODE_NAME_CASE(FMAX3)
5390 NODE_NAME_CASE(SMAX3)
5391 NODE_NAME_CASE(UMAX3)
5392 NODE_NAME_CASE(FMIN3)
5393 NODE_NAME_CASE(SMIN3)
5394 NODE_NAME_CASE(UMIN3)
5395 NODE_NAME_CASE(FMED3)
5396 NODE_NAME_CASE(SMED3)
5397 NODE_NAME_CASE(UMED3)
5398 NODE_NAME_CASE(FMAXIMUM3)
5399 NODE_NAME_CASE(FMINIMUM3)
5400 NODE_NAME_CASE(FDOT2)
5401 NODE_NAME_CASE(URECIP)
5402 NODE_NAME_CASE(DIV_SCALE)
5403 NODE_NAME_CASE(DIV_FMAS)
5404 NODE_NAME_CASE(DIV_FIXUP)
5405 NODE_NAME_CASE(FMAD_FTZ)
5406 NODE_NAME_CASE(RCP)
5407 NODE_NAME_CASE(RSQ)
5408 NODE_NAME_CASE(RCP_LEGACY)
5409 NODE_NAME_CASE(RCP_IFLAG)
5410 NODE_NAME_CASE(LOG)
5411 NODE_NAME_CASE(EXP)
5412 NODE_NAME_CASE(FMUL_LEGACY)
5413 NODE_NAME_CASE(RSQ_CLAMP)
5414 NODE_NAME_CASE(FP_CLASS)
5415 NODE_NAME_CASE(DOT4)
5416 NODE_NAME_CASE(CARRY)
5417 NODE_NAME_CASE(BORROW)
5418 NODE_NAME_CASE(BFE_U32)
5419 NODE_NAME_CASE(BFE_I32)
5420 NODE_NAME_CASE(BFI)
5421 NODE_NAME_CASE(BFM)
5422 NODE_NAME_CASE(FFBH_U32)
5423 NODE_NAME_CASE(FFBH_I32)
5424 NODE_NAME_CASE(FFBL_B32)
5425 NODE_NAME_CASE(MUL_U24)
5426 NODE_NAME_CASE(MUL_I24)
5427 NODE_NAME_CASE(MULHI_U24)
5428 NODE_NAME_CASE(MULHI_I24)
5429 NODE_NAME_CASE(MAD_U24)
5430 NODE_NAME_CASE(MAD_I24)
5431 NODE_NAME_CASE(MAD_I64_I32)
5432 NODE_NAME_CASE(MAD_U64_U32)
5433 NODE_NAME_CASE(PERM)
5434 NODE_NAME_CASE(TEXTURE_FETCH)
5435 NODE_NAME_CASE(R600_EXPORT)
5436 NODE_NAME_CASE(CONST_ADDRESS)
5437 NODE_NAME_CASE(REGISTER_LOAD)
5438 NODE_NAME_CASE(REGISTER_STORE)
5439 NODE_NAME_CASE(SAMPLE)
5440 NODE_NAME_CASE(SAMPLEB)
5441 NODE_NAME_CASE(SAMPLED)
5442 NODE_NAME_CASE(SAMPLEL)
5443 NODE_NAME_CASE(CVT_F32_UBYTE0)
5444 NODE_NAME_CASE(CVT_F32_UBYTE1)
5445 NODE_NAME_CASE(CVT_F32_UBYTE2)
5446 NODE_NAME_CASE(CVT_F32_UBYTE3)
5447 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5448 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5449 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5450 NODE_NAME_CASE(CVT_PK_I16_I32)
5451 NODE_NAME_CASE(CVT_PK_U16_U32)
5452 NODE_NAME_CASE(FP_TO_FP16)
5453 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5454 NODE_NAME_CASE(CONST_DATA_PTR)
5455 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5456 NODE_NAME_CASE(LDS)
5457 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5458 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5459 NODE_NAME_CASE(DUMMY_CHAIN)
5460 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5461 NODE_NAME_CASE(LOAD_D16_HI)
5462 NODE_NAME_CASE(LOAD_D16_LO)
5463 NODE_NAME_CASE(LOAD_D16_HI_I8)
5464 NODE_NAME_CASE(LOAD_D16_HI_U8)
5465 NODE_NAME_CASE(LOAD_D16_LO_I8)
5466 NODE_NAME_CASE(LOAD_D16_LO_U8)
5467 NODE_NAME_CASE(STORE_MSKOR)
5468 NODE_NAME_CASE(LOAD_CONSTANT)
5469 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5470 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5471 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5472 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5473 NODE_NAME_CASE(DS_ORDERED_COUNT)
5474 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5475 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5476 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
5477 NODE_NAME_CASE(BUFFER_LOAD)
5478 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5479 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5480 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5481 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5482 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5483 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5484 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5485 NODE_NAME_CASE(SBUFFER_LOAD)
5486 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5487 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5488 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5489 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5490 NODE_NAME_CASE(BUFFER_STORE)
5491 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5492 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5493 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5494 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5495 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5496 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5497 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5498 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5499 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5500 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5501 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5502 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5503 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5504 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5505 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5506 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5507 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5508 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5509 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5510 NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
5511 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5512 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5513 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5514
5515 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5516 }
5517 return nullptr;
5518 }
5519
5520 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5521 SelectionDAG &DAG, int Enabled,
5522 int &RefinementSteps,
5523 bool &UseOneConstNR,
5524 bool Reciprocal) const {
5525 EVT VT = Operand.getValueType();
5526
5527 if (VT == MVT::f32) {
5528 RefinementSteps = 0;
5529 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5530 }
5531
5532   // TODO: There is also an f64 rsq instruction, but the documentation is less
5533 // clear on its precision.
5534
5535 return SDValue();
5536 }
5537
5538 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5539 SelectionDAG &DAG, int Enabled,
5540 int &RefinementSteps) const {
5541 EVT VT = Operand.getValueType();
5542
5543 if (VT == MVT::f32) {
5544 // Reciprocal, < 1 ulp error.
5545 //
5546 // This reciprocal approximation converges to < 0.5 ulp error with one
5547     // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5548
5549 RefinementSteps = 0;
5550 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5551 }
5552
5553   // TODO: There is also an f64 rcp instruction, but the documentation is less
5554 // clear on its precision.
5555
5556 return SDValue();
5557 }
5558
5559 static unsigned workitemIntrinsicDim(unsigned ID) {
5560 switch (ID) {
5561 case Intrinsic::amdgcn_workitem_id_x:
5562 return 0;
5563 case Intrinsic::amdgcn_workitem_id_y:
5564 return 1;
5565 case Intrinsic::amdgcn_workitem_id_z:
5566 return 2;
5567 default:
5568 llvm_unreachable("not a workitem intrinsic");
5569 }
5570 }
5571
5572 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5573 const SDValue Op, KnownBits &Known,
5574 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5575
5576 Known.resetAll(); // Don't know anything.
5577
5578 unsigned Opc = Op.getOpcode();
5579
5580 switch (Opc) {
5581 default:
5582 break;
5583 case AMDGPUISD::CARRY:
5584 case AMDGPUISD::BORROW: {
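    // CARRY and BORROW produce only 0 or 1, so every bit above bit 0 is known
    // zero.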
5585 Known.Zero = APInt::getHighBitsSet(32, 31);
5586 break;
5587 }
5588
5589 case AMDGPUISD::BFE_I32:
5590 case AMDGPUISD::BFE_U32: {
5591 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5592 if (!CWidth)
5593 return;
5594
5595 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5596
5597 if (Opc == AMDGPUISD::BFE_U32)
5598 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5599
5600 break;
5601 }
5602 case AMDGPUISD::FP_TO_FP16: {
5603 unsigned BitWidth = Known.getBitWidth();
5604
5605 // High bits are zero.
5606 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5607 break;
5608 }
5609 case AMDGPUISD::MUL_U24:
5610 case AMDGPUISD::MUL_I24: {
5611 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5612 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5613 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5614 RHSKnown.countMinTrailingZeros();
5615 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5616 // Skip extra check if all bits are known zeros.
5617 if (TrailZ >= 32)
5618 break;
5619
5620 // Truncate to 24 bits.
5621 LHSKnown = LHSKnown.trunc(24);
5622 RHSKnown = RHSKnown.trunc(24);
5623
5624 if (Opc == AMDGPUISD::MUL_I24) {
5625 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5626 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5627 unsigned MaxValBits = LHSValBits + RHSValBits;
5628 if (MaxValBits > 32)
5629 break;
5630 unsigned SignBits = 32 - MaxValBits + 1;
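      // A product of operands with LHSValBits + RHSValBits significant bits
      // sign-extends from bit MaxValBits - 1, so the top SignBits bits are
      // copies of the sign once its value is known.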
5631 bool LHSNegative = LHSKnown.isNegative();
5632 bool LHSNonNegative = LHSKnown.isNonNegative();
5633 bool LHSPositive = LHSKnown.isStrictlyPositive();
5634 bool RHSNegative = RHSKnown.isNegative();
5635 bool RHSNonNegative = RHSKnown.isNonNegative();
5636 bool RHSPositive = RHSKnown.isStrictlyPositive();
5637
5638 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5639 Known.Zero.setHighBits(SignBits);
5640 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5641 Known.One.setHighBits(SignBits);
5642 } else {
5643 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5644 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5645 unsigned MaxValBits = LHSValBits + RHSValBits;
5646 if (MaxValBits >= 32)
5647 break;
5648 Known.Zero.setBitsFrom(MaxValBits);
5649 }
5650 break;
5651 }
5652 case AMDGPUISD::PERM: {
5653 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5654 if (!CMask)
5655 return;
5656
5657 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5658 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5659 unsigned Sel = CMask->getZExtValue();
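    // Each selector byte picks one result byte: values 0-3 take a byte from
    // the second operand, 4-6 from the first, 0x0c yields 0x00, anything above
    // 0x0c yields 0xff; other selector values are left unknown here.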
5660
5661 for (unsigned I = 0; I < 32; I += 8) {
5662 unsigned SelBits = Sel & 0xff;
5663 if (SelBits < 4) {
5664 SelBits *= 8;
5665 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5666 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5667 } else if (SelBits < 7) {
5668 SelBits = (SelBits & 3) * 8;
5669 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5670 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5671 } else if (SelBits == 0x0c) {
5672 Known.Zero |= 0xFFull << I;
5673 } else if (SelBits > 0x0c) {
5674 Known.One |= 0xFFull << I;
5675 }
5676 Sel >>= 8;
5677 }
5678 break;
5679 }
5680 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5681 Known.Zero.setHighBits(24);
5682 break;
5683 }
5684 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5685 Known.Zero.setHighBits(16);
5686 break;
5687 }
5688 case AMDGPUISD::LDS: {
5689 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5690 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5691
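    // An LDS object's address fits in 16 bits, and its alignment clears the
    // low bits.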
5692 Known.Zero.setHighBits(16);
5693 Known.Zero.setLowBits(Log2(Alignment));
5694 break;
5695 }
5696 case AMDGPUISD::SMIN3:
5697 case AMDGPUISD::SMAX3:
5698 case AMDGPUISD::SMED3:
5699 case AMDGPUISD::UMIN3:
5700 case AMDGPUISD::UMAX3:
5701 case AMDGPUISD::UMED3: {
5702 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5703 if (Known2.isUnknown())
5704 break;
5705
5706 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5707 if (Known1.isUnknown())
5708 break;
5709
5710 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5711 if (Known0.isUnknown())
5712 break;
5713
5714 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5715 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5716 Known.One = Known0.One & Known1.One & Known2.One;
5717 break;
5718 }
5719 case ISD::INTRINSIC_WO_CHAIN: {
5720 unsigned IID = Op.getConstantOperandVal(0);
5721 switch (IID) {
5722 case Intrinsic::amdgcn_workitem_id_x:
5723 case Intrinsic::amdgcn_workitem_id_y:
5724 case Intrinsic::amdgcn_workitem_id_z: {
5725 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5726 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5727 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5728 break;
5729 }
5730 default:
5731 break;
5732 }
5733 }
5734 }
5735 }
5736
5737 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5738 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5739 unsigned Depth) const {
5740 switch (Op.getOpcode()) {
5741 case AMDGPUISD::BFE_I32: {
5742 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5743 if (!Width)
5744 return 1;
5745
5746 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5747 if (!isNullConstant(Op.getOperand(1)))
5748 return SignBits;
5749
5750 // TODO: Could probably figure something out with non-0 offsets.
5751 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5752 return std::max(SignBits, Op0SignBits);
5753 }
5754
5755 case AMDGPUISD::BFE_U32: {
5756 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5757 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5758 }
5759
5760 case AMDGPUISD::CARRY:
5761 case AMDGPUISD::BORROW:
5762 return 31;
5763 case AMDGPUISD::BUFFER_LOAD_BYTE:
5764 return 25;
5765 case AMDGPUISD::BUFFER_LOAD_SHORT:
5766 return 17;
5767 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5768 return 24;
5769 case AMDGPUISD::BUFFER_LOAD_USHORT:
5770 return 16;
5771 case AMDGPUISD::FP_TO_FP16:
5772 return 16;
5773 case AMDGPUISD::SMIN3:
5774 case AMDGPUISD::SMAX3:
5775 case AMDGPUISD::SMED3:
5776 case AMDGPUISD::UMIN3:
5777 case AMDGPUISD::UMAX3:
5778 case AMDGPUISD::UMED3: {
5779 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5780 if (Tmp2 == 1)
5781 return 1; // Early out.
5782
5783 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5784 if (Tmp1 == 1)
5785 return 1; // Early out.
5786
5787 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5788 if (Tmp0 == 1)
5789 return 1; // Early out.
5790
5791 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5792 }
5793 default:
5794 return 1;
5795 }
5796 }
5797
5798 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5799 GISelKnownBits &Analysis, Register R,
5800 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5801 unsigned Depth) const {
5802 const MachineInstr *MI = MRI.getVRegDef(R);
5803 if (!MI)
5804 return 1;
5805
5806 // TODO: Check range metadata on MMO.
5807 switch (MI->getOpcode()) {
5808 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5809 return 25;
5810 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5811 return 17;
5812 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5813 return 24;
5814 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5815 return 16;
5816 case AMDGPU::G_AMDGPU_SMED3:
5817 case AMDGPU::G_AMDGPU_UMED3: {
5818 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5819 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5820 if (Tmp2 == 1)
5821 return 1;
5822 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5823 if (Tmp1 == 1)
5824 return 1;
5825 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5826 if (Tmp0 == 1)
5827 return 1;
5828 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5829 }
5830 default:
5831 return 1;
5832 }
5833 }
5834
5835 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5836 const SelectionDAG &DAG,
5837 bool SNaN,
5838 unsigned Depth) const {
5839 unsigned Opcode = Op.getOpcode();
5840 switch (Opcode) {
5841 case AMDGPUISD::FMIN_LEGACY:
5842 case AMDGPUISD::FMAX_LEGACY: {
5843 if (SNaN)
5844 return true;
5845
5846 // TODO: Can check no nans on one of the operands for each one, but which
5847 // one?
5848 return false;
5849 }
5850 case AMDGPUISD::FMUL_LEGACY:
5851 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5852 if (SNaN)
5853 return true;
5854 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5855 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5856 }
5857 case AMDGPUISD::FMED3:
5858 case AMDGPUISD::FMIN3:
5859 case AMDGPUISD::FMAX3:
5860 case AMDGPUISD::FMINIMUM3:
5861 case AMDGPUISD::FMAXIMUM3:
5862 case AMDGPUISD::FMAD_FTZ: {
5863 if (SNaN)
5864 return true;
5865 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5866 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5867 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5868 }
5869 case AMDGPUISD::CVT_F32_UBYTE0:
5870 case AMDGPUISD::CVT_F32_UBYTE1:
5871 case AMDGPUISD::CVT_F32_UBYTE2:
5872 case AMDGPUISD::CVT_F32_UBYTE3:
5873 return true;
5874
5875 case AMDGPUISD::RCP:
5876 case AMDGPUISD::RSQ:
5877 case AMDGPUISD::RCP_LEGACY:
5878 case AMDGPUISD::RSQ_CLAMP: {
5879 if (SNaN)
5880 return true;
5881
5882 // TODO: Need is known positive check.
5883 return false;
5884 }
5885 case ISD::FLDEXP:
5886 case AMDGPUISD::FRACT: {
5887 if (SNaN)
5888 return true;
5889 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5890 }
5891 case AMDGPUISD::DIV_SCALE:
5892 case AMDGPUISD::DIV_FMAS:
5893 case AMDGPUISD::DIV_FIXUP:
5894 // TODO: Refine on operands.
5895 return SNaN;
5896 case AMDGPUISD::SIN_HW:
5897 case AMDGPUISD::COS_HW: {
5898 // TODO: Need check for infinity
5899 return SNaN;
5900 }
5901 case ISD::INTRINSIC_WO_CHAIN: {
5902 unsigned IntrinsicID = Op.getConstantOperandVal(0);
5903 // TODO: Handle more intrinsics
5904 switch (IntrinsicID) {
5905 case Intrinsic::amdgcn_cubeid:
5906 return true;
5907
5908 case Intrinsic::amdgcn_frexp_mant: {
5909 if (SNaN)
5910 return true;
5911 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5912 }
5913 case Intrinsic::amdgcn_cvt_pkrtz: {
5914 if (SNaN)
5915 return true;
5916 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5917 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5918 }
5919 case Intrinsic::amdgcn_rcp:
5920 case Intrinsic::amdgcn_rsq:
5921 case Intrinsic::amdgcn_rcp_legacy:
5922 case Intrinsic::amdgcn_rsq_legacy:
5923 case Intrinsic::amdgcn_rsq_clamp: {
5924 if (SNaN)
5925 return true;
5926
5927 // TODO: Need is known positive check.
5928 return false;
5929 }
5930 case Intrinsic::amdgcn_trig_preop:
5931 case Intrinsic::amdgcn_fdot2:
5932 // TODO: Refine on operand
5933 return SNaN;
5934 case Intrinsic::amdgcn_fma_legacy:
5935 if (SNaN)
5936 return true;
5937 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5938 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5939 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5940 default:
5941 return false;
5942 }
5943 }
5944 default:
5945 return false;
5946 }
5947 }
5948
5949 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
5950 Register N0, Register N1) const {
5951 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
5952 }
5953
5954 TargetLowering::AtomicExpansionKind
5955 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
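  // Operations without a native atomic instruction (NAND, the FP
  // read-modify-writes, and non-32/64-bit integer sizes) are expanded to a
  // cmpxchg loop.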
5956 switch (RMW->getOperation()) {
5957 case AtomicRMWInst::Nand:
5958 case AtomicRMWInst::FAdd:
5959 case AtomicRMWInst::FSub:
5960 case AtomicRMWInst::FMax:
5961 case AtomicRMWInst::FMin:
5962 return AtomicExpansionKind::CmpXChg;
5963 default: {
5964 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
5965 unsigned Size = IntTy->getBitWidth();
5966 if (Size == 32 || Size == 64)
5967 return AtomicExpansionKind::None;
5968 }
5969
5970 return AtomicExpansionKind::CmpXChg;
5971 }
5972 }
5973 }
5974
5975 /// Whether it is profitable to sink the operands of an
5976 /// Instruction I to the basic block of I.
5977 /// This helps use several modifiers (like abs and neg) more often.
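/// e.g. an fneg whose only user is an fma in I's block is worth sinking so
/// selection can fold it as a source modifier.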
5978 bool AMDGPUTargetLowering::shouldSinkOperands(
5979 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5980 using namespace PatternMatch;
5981
5982 for (auto &Op : I->operands()) {
5983 // Ensure we are not already sinking this operand.
5984 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
5985 continue;
5986
5987 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
5988 Ops.push_back(&Op);
5989 }
5990
5991 return !Ops.empty();
5992 }
5993