1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUMachineFunction.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/CodeGen/Analysis.h"
21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/IR/PatternMatch.h"
26 #include "llvm/Support/CommandLine.h"
27 #include "llvm/Support/KnownBits.h"
28 #include "llvm/Target/TargetMachine.h"
29 
30 using namespace llvm;
31 
32 #include "AMDGPUGenCallingConv.inc"
33 
34 static cl::opt<bool> AMDGPUBypassSlowDiv(
35   "amdgpu-bypass-slow-div",
36   cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37   cl::init(true));
38 
39 // Find a larger type to do a load / store of a vector with.
40 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
41   unsigned StoreSize = VT.getStoreSizeInBits();
42   if (StoreSize <= 32)
43     return EVT::getIntegerVT(Ctx, StoreSize);
44 
45   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
46   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47 }
48 
49 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
50   return DAG.computeKnownBits(Op).countMaxActiveBits();
51 }
52 
53 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
54   // In order for this to be a signed 24-bit value, bit 23, must
55   // be a sign bit.
56   return DAG.ComputeMaxSignificantBits(Op);
57 }
58 
59 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
60                                            const AMDGPUSubtarget &STI)
61     : TargetLowering(TM), Subtarget(&STI) {
62   // Lower floating point store/load to integer store/load to reduce the number
63   // of patterns in tablegen.
64   setOperationAction(ISD::LOAD, MVT::f32, Promote);
65   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
66 
67   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
68   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
69 
70   setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
71   AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
72 
73   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
74   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
75 
76   setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
77   AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
78 
79   setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
80   AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
81 
82   setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
83   AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
84 
85   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
86   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
87 
88   setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
89   AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
90 
91   setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
92   AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
93 
94   setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
95   AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
96 
97   setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
98   AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
99 
100   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
101   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
102 
103   setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
104   AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
105 
106   setOperationAction(ISD::LOAD, MVT::i64, Promote);
107   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
108 
109   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
110   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
111 
112   setOperationAction(ISD::LOAD, MVT::f64, Promote);
113   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
114 
115   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
116   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
117 
118   setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
119   AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
120 
121   setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
122   AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
123 
124   setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
125   AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
126 
127   setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
128   AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
129 
130   setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
131   AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
132 
133   setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
134   AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
135 
136   setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
137   AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
138 
139   setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
140   AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
141 
142   setOperationAction(ISD::LOAD, MVT::i128, Promote);
143   AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
144 
145   // There are no 64-bit extloads. These should be done as a 32-bit extload and
146   // an extension to 64-bit.
147   for (MVT VT : MVT::integer_valuetypes())
148     setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
149                      Expand);
150 
151   for (MVT VT : MVT::integer_valuetypes()) {
152     if (VT == MVT::i64)
153       continue;
154 
155     for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
156       setLoadExtAction(Op, VT, MVT::i1, Promote);
157       setLoadExtAction(Op, VT, MVT::i8, Legal);
158       setLoadExtAction(Op, VT, MVT::i16, Legal);
159       setLoadExtAction(Op, VT, MVT::i32, Expand);
160     }
161   }
162 
163   for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
164     for (auto MemVT :
165          {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
166       setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
167                        Expand);
168 
169   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
170   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
171   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
172   setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
173   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
174   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
175   setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
176   setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
177 
178   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
179   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
180   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
181   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
182   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
183   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
184 
185   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
186   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
187   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
188   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
189   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
190   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
191   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
192 
193   setOperationAction(ISD::STORE, MVT::f32, Promote);
194   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
195 
196   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
197   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
198 
199   setOperationAction(ISD::STORE, MVT::v3f32, Promote);
200   AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
201 
202   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
203   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
204 
205   setOperationAction(ISD::STORE, MVT::v5f32, Promote);
206   AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
207 
208   setOperationAction(ISD::STORE, MVT::v6f32, Promote);
209   AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
210 
211   setOperationAction(ISD::STORE, MVT::v7f32, Promote);
212   AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
213 
214   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
215   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
216 
217   setOperationAction(ISD::STORE, MVT::v9f32, Promote);
218   AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
219 
220   setOperationAction(ISD::STORE, MVT::v10f32, Promote);
221   AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
222 
223   setOperationAction(ISD::STORE, MVT::v11f32, Promote);
224   AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
225 
226   setOperationAction(ISD::STORE, MVT::v12f32, Promote);
227   AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
228 
229   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
230   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
231 
232   setOperationAction(ISD::STORE, MVT::v32f32, Promote);
233   AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
234 
235   setOperationAction(ISD::STORE, MVT::i64, Promote);
236   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
237 
238   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
239   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
240 
241   setOperationAction(ISD::STORE, MVT::f64, Promote);
242   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
243 
244   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
245   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
246 
247   setOperationAction(ISD::STORE, MVT::v3i64, Promote);
248   AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
249 
250   setOperationAction(ISD::STORE, MVT::v3f64, Promote);
251   AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
252 
253   setOperationAction(ISD::STORE, MVT::v4i64, Promote);
254   AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
255 
256   setOperationAction(ISD::STORE, MVT::v4f64, Promote);
257   AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
258 
259   setOperationAction(ISD::STORE, MVT::v8i64, Promote);
260   AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
261 
262   setOperationAction(ISD::STORE, MVT::v8f64, Promote);
263   AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
264 
265   setOperationAction(ISD::STORE, MVT::v16i64, Promote);
266   AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
267 
268   setOperationAction(ISD::STORE, MVT::v16f64, Promote);
269   AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
270 
271   setOperationAction(ISD::STORE, MVT::i128, Promote);
272   AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
273 
274   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
275   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
276   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
277   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
278 
279   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
280   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
281   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
282   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
283 
284   setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
285   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
286   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
287   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
288   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
289   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
290   setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
291   setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
292 
293   setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
294   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
295   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
296 
297   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
298   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
299 
300   setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
301   setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
302   setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
303   setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
304 
305   setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
306   setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
307   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
308   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
309 
310   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
311   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
312 
313   setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
314   setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
315   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
316   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
317   setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
318   setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
319   setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
320 
321   setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
322   setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
323 
324   setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
325 
326   // This is totally unsupported, just custom lower to produce an error.
327   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
328 
329   // Library functions.  These default to Expand, but we have instructions
330   // for them.
331   setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, ISD::FRINT,
332                       ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
333                      MVT::f32, Legal);
334 
335   setOperationAction(ISD::FLOG2, MVT::f32, Custom);
336   setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
337 
338   setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2}, MVT::f32,
339                      Custom);
340 
341   setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
342 
343   setOperationAction(ISD::FROUNDEVEN, {MVT::f16, MVT::f32, MVT::f64}, Custom);
344 
345   setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
346 
347   if (Subtarget->has16BitInsts())
348     setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
349   else {
350     setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
351     setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
352   }
353 
354   setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP}, MVT::f16, Custom);
355 
356   // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
357   // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
358   // default unless marked custom/legal.
359   setOperationAction(
360       ISD::IS_FPCLASS,
361       {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
362        MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
363        MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
364       Custom);
365 
366   // Expand to fneg + fadd.
367   setOperationAction(ISD::FSUB, MVT::f64, Expand);
368 
369   setOperationAction(ISD::CONCAT_VECTORS,
370                      {MVT::v3i32,  MVT::v3f32,  MVT::v4i32,  MVT::v4f32,
371                       MVT::v5i32,  MVT::v5f32,  MVT::v6i32,  MVT::v6f32,
372                       MVT::v7i32,  MVT::v7f32,  MVT::v8i32,  MVT::v8f32,
373                       MVT::v9i32,  MVT::v9f32,  MVT::v10i32, MVT::v10f32,
374                       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
375                      Custom);
376   setOperationAction(
377       ISD::EXTRACT_SUBVECTOR,
378       {MVT::v2f16,  MVT::v2i16,  MVT::v4f16,  MVT::v4i16,  MVT::v2f32,
379        MVT::v2i32,  MVT::v3f32,  MVT::v3i32,  MVT::v4f32,  MVT::v4i32,
380        MVT::v5f32,  MVT::v5i32,  MVT::v6f32,  MVT::v6i32,  MVT::v7f32,
381        MVT::v7i32,  MVT::v8f32,  MVT::v8i32,  MVT::v9f32,  MVT::v9i32,
382        MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
383        MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
384        MVT::v32f32, MVT::v32i32, MVT::v2f64,  MVT::v2i64,  MVT::v3f64,
385        MVT::v3i64,  MVT::v4f64,  MVT::v4i64,  MVT::v8f64,  MVT::v8i64,
386        MVT::v16f64, MVT::v16i64},
387       Custom);
388 
389   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
390   setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
391 
392   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
393   for (MVT VT : ScalarIntVTs) {
394     // These should use [SU]DIVREM, so set them to expand
395     setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
396                        Expand);
397 
398     // GPU does not have divrem function for signed or unsigned.
399     setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
400 
401     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
402     setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
403 
404     setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);
405 
406     // AMDGPU uses ADDC/SUBC/ADDE/SUBE
407     setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
408   }
409 
410   // The hardware supports 32-bit FSHR, but not FSHL.
411   setOperationAction(ISD::FSHR, MVT::i32, Legal);
412 
413   // The hardware supports 32-bit ROTR, but not ROTL.
414   setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
415   setOperationAction(ISD::ROTR, MVT::i64, Expand);
416 
417   setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
418 
419   setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
420   setOperationAction(
421       {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
422       MVT::i64, Custom);
423   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
424 
425   setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
426                      Legal);
427 
428   setOperationAction(
429       {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
430       MVT::i64, Custom);
431 
432   static const MVT::SimpleValueType VectorIntTypes[] = {
433       MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
434       MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
435 
436   for (MVT VT : VectorIntTypes) {
437     // Expand the following operations for the current type by default.
438     setOperationAction({ISD::ADD,        ISD::AND,     ISD::FP_TO_SINT,
439                         ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
440                         ISD::MULHS,      ISD::OR,      ISD::SHL,
441                         ISD::SRA,        ISD::SRL,     ISD::ROTL,
442                         ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
443                         ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
444                         ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
445                         ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
446                         ISD::SELECT,     ISD::VSELECT, ISD::SELECT_CC,
447                         ISD::XOR,        ISD::BSWAP,   ISD::CTPOP,
448                         ISD::CTTZ,       ISD::CTLZ,    ISD::VECTOR_SHUFFLE,
449                         ISD::SETCC},
450                        VT, Expand);
451   }
452 
453   static const MVT::SimpleValueType FloatVectorTypes[] = {
454       MVT::v2f32, MVT::v3f32,  MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
455       MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
456 
457   for (MVT VT : FloatVectorTypes) {
458     setOperationAction(
459         {ISD::FABS,    ISD::FMINNUM,      ISD::FMAXNUM,   ISD::FADD,
460          ISD::FCEIL,   ISD::FCOS,         ISD::FDIV,      ISD::FEXP2,
461          ISD::FEXP,    ISD::FLOG2,        ISD::FREM,      ISD::FLOG,
462          ISD::FLOG10,  ISD::FPOW,         ISD::FFLOOR,    ISD::FTRUNC,
463          ISD::FMUL,    ISD::FMA,          ISD::FRINT,     ISD::FNEARBYINT,
464          ISD::FSQRT,   ISD::FSIN,         ISD::FSUB,      ISD::FNEG,
465          ISD::VSELECT, ISD::SELECT_CC,    ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
466          ISD::SETCC,   ISD::FCANONICALIZE},
467         VT, Expand);
468   }
469 
470   // This causes using an unrolled select operation rather than expansion with
471   // bit operations. This is in general better, but the alternative using BFI
472   // instructions may be better if the select sources are SGPRs.
473   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
474   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
475 
476   setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
477   AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
478 
479   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
480   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
481 
482   setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
483   AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
484 
485   setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
486   AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
487 
488   setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
489   AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
490 
491   setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
492   AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
493 
494   setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
495   AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
496 
497   setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
498   AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
499 
500   setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
501   AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
502 
503   // There are no libcalls of any kind.
504   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
505     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
506 
507   setSchedulingPreference(Sched::RegPressure);
508   setJumpIsExpensive(true);
509 
510   // FIXME: This is only partially true. If we have to do vector compares, any
511   // SGPR pair can be a condition register. If we have a uniform condition, we
512   // are better off doing SALU operations, where there is only one SCC. For now,
513   // we don't have a way of knowing during instruction selection if a condition
514   // will be uniform and we always use vector compares. Assume we are using
515   // vector compares until that is fixed.
516   setHasMultipleConditionRegisters(true);
517 
518   setMinCmpXchgSizeInBits(32);
519   setSupportsUnalignedAtomics(false);
520 
521   PredictableSelectIsExpensive = false;
522 
523   // We want to find all load dependencies for long chains of stores to enable
524   // merging into very wide vectors. The problem is with vectors with > 4
525   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
526   // vectors are a legal type, even though we have to split the loads
527   // usually. When we can more precisely specify load legality per address
528   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
529   // smarter so that they can figure out what to do in 2 iterations without all
530   // N > 4 stores on the same chain.
531   GatherAllAliasesMaxDepth = 16;
532 
533   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
534   // about these during lowering.
535   MaxStoresPerMemcpy  = 0xffffffff;
536   MaxStoresPerMemmove = 0xffffffff;
537   MaxStoresPerMemset  = 0xffffffff;
538 
539   // The expansion for 64-bit division is enormous.
540   if (AMDGPUBypassSlowDiv)
541     addBypassSlowDiv(64, 32);
542 
543   setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
544                        ISD::SRA,        ISD::SRL,
545                        ISD::TRUNCATE,   ISD::MUL,
546                        ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
547                        ISD::MULHU,      ISD::MULHS,
548                        ISD::SELECT,     ISD::SELECT_CC,
549                        ISD::STORE,      ISD::FADD,
550                        ISD::FSUB,       ISD::FNEG,
551                        ISD::FABS,       ISD::AssertZext,
552                        ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
553 }
554 
555 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
556   if (getTargetMachine().Options.NoSignedZerosFPMath)
557     return true;
558 
559   const auto Flags = Op.getNode()->getFlags();
560   if (Flags.hasNoSignedZeros())
561     return true;
562 
563   return false;
564 }
565 
566 //===----------------------------------------------------------------------===//
567 // Target Information
568 //===----------------------------------------------------------------------===//
569 
570 LLVM_READNONE
571 static bool fnegFoldsIntoOpcode(unsigned Opc) {
572   switch (Opc) {
573   case ISD::FADD:
574   case ISD::FSUB:
575   case ISD::FMUL:
576   case ISD::FMA:
577   case ISD::FMAD:
578   case ISD::FMINNUM:
579   case ISD::FMAXNUM:
580   case ISD::FMINNUM_IEEE:
581   case ISD::FMAXNUM_IEEE:
582   case ISD::SELECT:
583   case ISD::FSIN:
584   case ISD::FTRUNC:
585   case ISD::FRINT:
586   case ISD::FNEARBYINT:
587   case ISD::FCANONICALIZE:
588   case AMDGPUISD::RCP:
589   case AMDGPUISD::RCP_LEGACY:
590   case AMDGPUISD::RCP_IFLAG:
591   case AMDGPUISD::SIN_HW:
592   case AMDGPUISD::FMUL_LEGACY:
593   case AMDGPUISD::FMIN_LEGACY:
594   case AMDGPUISD::FMAX_LEGACY:
595   case AMDGPUISD::FMED3:
596     // TODO: handle llvm.amdgcn.fma.legacy
597     return true;
598   case ISD::BITCAST:
599     llvm_unreachable("bitcast is special cased");
600   default:
601     return false;
602   }
603 }
604 
605 static bool fnegFoldsIntoOp(const SDNode *N) {
606   unsigned Opc = N->getOpcode();
607   if (Opc == ISD::BITCAST) {
608     // TODO: Is there a benefit to checking the conditions performFNegCombine
609     // does? We don't for the other cases.
610     SDValue BCSrc = N->getOperand(0);
611     if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
612       return BCSrc.getNumOperands() == 2 &&
613              BCSrc.getOperand(1).getValueSizeInBits() == 32;
614     }
615 
616     return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
617   }
618 
619   return fnegFoldsIntoOpcode(Opc);
620 }
621 
622 /// \p returns true if the operation will definitely need to use a 64-bit
623 /// encoding, and thus will use a VOP3 encoding regardless of the source
624 /// modifiers.
625 LLVM_READONLY
626 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
627   return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
628          VT == MVT::f64;
629 }
630 
631 /// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
632 /// type for ISD::SELECT.
633 LLVM_READONLY
634 static bool selectSupportsSourceMods(const SDNode *N) {
635   // TODO: Only applies if select will be vector
636   return N->getValueType(0) == MVT::f32;
637 }
638 
639 // Most FP instructions support source modifiers, but this could be refined
640 // slightly.
641 LLVM_READONLY
642 static bool hasSourceMods(const SDNode *N) {
643   if (isa<MemSDNode>(N))
644     return false;
645 
646   switch (N->getOpcode()) {
647   case ISD::CopyToReg:
648   case ISD::FDIV:
649   case ISD::FREM:
650   case ISD::INLINEASM:
651   case ISD::INLINEASM_BR:
652   case AMDGPUISD::DIV_SCALE:
653   case ISD::INTRINSIC_W_CHAIN:
654 
655   // TODO: Should really be looking at the users of the bitcast. These are
656   // problematic because bitcasts are used to legalize all stores to integer
657   // types.
658   case ISD::BITCAST:
659     return false;
660   case ISD::INTRINSIC_WO_CHAIN: {
661     switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
662     case Intrinsic::amdgcn_interp_p1:
663     case Intrinsic::amdgcn_interp_p2:
664     case Intrinsic::amdgcn_interp_mov:
665     case Intrinsic::amdgcn_interp_p1_f16:
666     case Intrinsic::amdgcn_interp_p2_f16:
667       return false;
668     default:
669       return true;
670     }
671   }
672   case ISD::SELECT:
673     return selectSupportsSourceMods(N);
674   default:
675     return true;
676   }
677 }
678 
679 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
680                                                  unsigned CostThreshold) {
681   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
682   // it is truly free to use a source modifier in all cases. If there are
683   // multiple users but for each one will necessitate using VOP3, there will be
684   // a code size increase. Try to avoid increasing code size unless we know it
685   // will save on the instruction count.
686   unsigned NumMayIncreaseSize = 0;
687   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
688 
689   assert(!N->use_empty());
690 
691   // XXX - Should this limit number of uses to check?
692   for (const SDNode *U : N->uses()) {
693     if (!hasSourceMods(U))
694       return false;
695 
696     if (!opMustUseVOP3Encoding(U, VT)) {
697       if (++NumMayIncreaseSize > CostThreshold)
698         return false;
699     }
700   }
701 
702   return true;
703 }
704 
705 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
706                                               ISD::NodeType ExtendKind) const {
707   assert(!VT.isVector() && "only scalar expected");
708 
709   // Round to the next multiple of 32-bits.
710   unsigned Size = VT.getSizeInBits();
711   if (Size <= 32)
712     return MVT::i32;
713   return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
714 }
715 
716 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
717   return MVT::i32;
718 }
719 
720 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
721   return true;
722 }
723 
724 // The backend supports 32 and 64 bit floating point immediates.
725 // FIXME: Why are we reporting vectors of FP immediates as legal?
726 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
727                                         bool ForCodeSize) const {
728   EVT ScalarVT = VT.getScalarType();
729   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
730          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
731 }
732 
733 // We don't want to shrink f64 / f32 constants.
734 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
735   EVT ScalarVT = VT.getScalarType();
736   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
737 }
738 
739 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
740                                                  ISD::LoadExtType ExtTy,
741                                                  EVT NewVT) const {
742   // TODO: This may be worth removing. Check regression tests for diffs.
743   if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
744     return false;
745 
746   unsigned NewSize = NewVT.getStoreSizeInBits();
747 
748   // If we are reducing to a 32-bit load or a smaller multi-dword load,
749   // this is always better.
750   if (NewSize >= 32)
751     return true;
752 
753   EVT OldVT = N->getValueType(0);
754   unsigned OldSize = OldVT.getStoreSizeInBits();
755 
756   MemSDNode *MN = cast<MemSDNode>(N);
757   unsigned AS = MN->getAddressSpace();
758   // Do not shrink an aligned scalar load to sub-dword.
759   // Scalar engine cannot do sub-dword loads.
760   if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
761       (AS == AMDGPUAS::CONSTANT_ADDRESS ||
762        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
763        (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
764         MN->isInvariant())) &&
765       AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
766     return false;
767 
768   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
769   // extloads, so doing one requires using a buffer_load. In cases where we
770   // still couldn't use a scalar load, using the wider load shouldn't really
771   // hurt anything.
772 
773   // If the old size already had to be an extload, there's no harm in continuing
774   // to reduce the width.
775   return (OldSize < 32);
776 }
777 
778 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
779                                                    const SelectionDAG &DAG,
780                                                    const MachineMemOperand &MMO) const {
781 
782   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
783 
784   if (LoadTy.getScalarType() == MVT::i32)
785     return false;
786 
787   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
788   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
789 
790   if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
791     return false;
792 
793   unsigned Fast = 0;
794   return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
795                                         CastTy, MMO, &Fast) &&
796          Fast;
797 }
798 
799 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
800 // profitable with the expansion for 64-bit since it's generally good to
801 // speculate things.
802 bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
803   return true;
804 }
805 
806 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
807   return true;
808 }
809 
810 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
811   switch (N->getOpcode()) {
812   case ISD::EntryToken:
813   case ISD::TokenFactor:
814     return true;
815   case ISD::INTRINSIC_WO_CHAIN: {
816     unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
817     switch (IntrID) {
818     case Intrinsic::amdgcn_readfirstlane:
819     case Intrinsic::amdgcn_readlane:
820       return true;
821     }
822     return false;
823   }
824   case ISD::LOAD:
825     if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
826         AMDGPUAS::CONSTANT_ADDRESS_32BIT)
827       return true;
828     return false;
829   case AMDGPUISD::SETCC: // ballot-style instruction
830     return true;
831   }
832   return false;
833 }
834 
835 SDValue AMDGPUTargetLowering::getNegatedExpression(
836     SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
837     NegatibleCost &Cost, unsigned Depth) const {
838 
839   switch (Op.getOpcode()) {
840   case ISD::FMA:
841   case ISD::FMAD: {
842     // Negating a fma is not free if it has users without source mods.
843     if (!allUsesHaveSourceMods(Op.getNode()))
844       return SDValue();
845     break;
846   }
847   case AMDGPUISD::RCP: {
848     SDValue Src = Op.getOperand(0);
849     EVT VT = Op.getValueType();
850     SDLoc SL(Op);
851 
852     SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
853                                           ForCodeSize, Cost, Depth + 1);
854     if (NegSrc)
855       return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
856     return SDValue();
857   }
858   default:
859     break;
860   }
861 
862   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
863                                               ForCodeSize, Cost, Depth);
864 }
865 
866 //===---------------------------------------------------------------------===//
867 // Target Properties
868 //===---------------------------------------------------------------------===//
869 
870 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
871   assert(VT.isFloatingPoint());
872 
873   // Packed operations do not have a fabs modifier.
874   return VT == MVT::f32 || VT == MVT::f64 ||
875          (Subtarget->has16BitInsts() && VT == MVT::f16);
876 }
877 
878 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
879   assert(VT.isFloatingPoint());
880   // Report this based on the end legalized type.
881   VT = VT.getScalarType();
882   return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
883 }
884 
885 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
886                                                          unsigned NumElem,
887                                                          unsigned AS) const {
888   return true;
889 }
890 
891 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
892   // There are few operations which truly have vector input operands. Any vector
893   // operation is going to involve operations on each component, and a
894   // build_vector will be a copy per element, so it always makes sense to use a
895   // build_vector input in place of the extracted element to avoid a copy into a
896   // super register.
897   //
898   // We should probably only do this if all users are extracts only, but this
899   // should be the common case.
900   return true;
901 }
902 
903 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
904   // Truncate is just accessing a subregister.
905 
906   unsigned SrcSize = Source.getSizeInBits();
907   unsigned DestSize = Dest.getSizeInBits();
908 
909   return DestSize < SrcSize && DestSize % 32 == 0 ;
910 }
911 
912 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
913   // Truncate is just accessing a subregister.
914 
915   unsigned SrcSize = Source->getScalarSizeInBits();
916   unsigned DestSize = Dest->getScalarSizeInBits();
917 
918   if (DestSize== 16 && Subtarget->has16BitInsts())
919     return SrcSize >= 32;
920 
921   return DestSize < SrcSize && DestSize % 32 == 0;
922 }
923 
924 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
925   unsigned SrcSize = Src->getScalarSizeInBits();
926   unsigned DestSize = Dest->getScalarSizeInBits();
927 
928   if (SrcSize == 16 && Subtarget->has16BitInsts())
929     return DestSize >= 32;
930 
931   return SrcSize == 32 && DestSize == 64;
932 }
933 
934 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
935   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
936   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
937   // this will enable reducing 64-bit operations the 32-bit, which is always
938   // good.
939 
940   if (Src == MVT::i16)
941     return Dest == MVT::i32 ||Dest == MVT::i64 ;
942 
943   return Src == MVT::i32 && Dest == MVT::i64;
944 }
945 
946 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
947   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
948   // limited number of native 64-bit operations. Shrinking an operation to fit
949   // in a single 32-bit register should always be helpful. As currently used,
950   // this is much less general than the name suggests, and is only used in
951   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
952   // not profitable, and may actually be harmful.
953   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
954 }
955 
956 bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
957     const SDNode* N, CombineLevel Level) const {
958   assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
959           N->getOpcode() == ISD::SRL) &&
960          "Expected shift op");
961   // Always commute pre-type legalization and right shifts.
962   // We're looking for shl(or(x,y),z) patterns.
963   if (Level < CombineLevel::AfterLegalizeTypes ||
964       N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
965     return true;
966 
967   // If only user is a i32 right-shift, then don't destroy a BFE pattern.
968   if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
969       (N->use_begin()->getOpcode() == ISD::SRA ||
970        N->use_begin()->getOpcode() == ISD::SRL))
971     return false;
972 
973   // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
974   auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
975     if (LHS.getOpcode() != ISD::SHL)
976       return false;
977     auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
978     auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
979     auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
980     return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
981            LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
982            RHSLd->getExtensionType() == ISD::ZEXTLOAD;
983   };
984   SDValue LHS = N->getOperand(0).getOperand(0);
985   SDValue RHS = N->getOperand(0).getOperand(1);
986   return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
987 }
988 
989 //===---------------------------------------------------------------------===//
990 // TargetLowering Callbacks
991 //===---------------------------------------------------------------------===//
992 
993 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
994                                                   bool IsVarArg) {
995   switch (CC) {
996   case CallingConv::AMDGPU_VS:
997   case CallingConv::AMDGPU_GS:
998   case CallingConv::AMDGPU_PS:
999   case CallingConv::AMDGPU_CS:
1000   case CallingConv::AMDGPU_HS:
1001   case CallingConv::AMDGPU_ES:
1002   case CallingConv::AMDGPU_LS:
1003     return CC_AMDGPU;
1004   case CallingConv::C:
1005   case CallingConv::Fast:
1006   case CallingConv::Cold:
1007     return CC_AMDGPU_Func;
1008   case CallingConv::AMDGPU_Gfx:
1009     return CC_SI_Gfx;
1010   case CallingConv::AMDGPU_KERNEL:
1011   case CallingConv::SPIR_KERNEL:
1012   default:
1013     report_fatal_error("Unsupported calling convention for call");
1014   }
1015 }
1016 
1017 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1018                                                     bool IsVarArg) {
1019   switch (CC) {
1020   case CallingConv::AMDGPU_KERNEL:
1021   case CallingConv::SPIR_KERNEL:
1022     llvm_unreachable("kernels should not be handled here");
1023   case CallingConv::AMDGPU_VS:
1024   case CallingConv::AMDGPU_GS:
1025   case CallingConv::AMDGPU_PS:
1026   case CallingConv::AMDGPU_CS:
1027   case CallingConv::AMDGPU_HS:
1028   case CallingConv::AMDGPU_ES:
1029   case CallingConv::AMDGPU_LS:
1030     return RetCC_SI_Shader;
1031   case CallingConv::AMDGPU_Gfx:
1032     return RetCC_SI_Gfx;
1033   case CallingConv::C:
1034   case CallingConv::Fast:
1035   case CallingConv::Cold:
1036     return RetCC_AMDGPU_Func;
1037   default:
1038     report_fatal_error("Unsupported calling convention.");
1039   }
1040 }
1041 
1042 /// The SelectionDAGBuilder will automatically promote function arguments
1043 /// with illegal types.  However, this does not work for the AMDGPU targets
1044 /// since the function arguments are stored in memory as these illegal types.
1045 /// In order to handle this properly we need to get the original types sizes
1046 /// from the LLVM IR Function and fixup the ISD:InputArg values before
1047 /// passing them to AnalyzeFormalArguments()
1048 
1049 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1050 /// input values across multiple registers.  Each item in the Ins array
1051 /// represents a single value that will be stored in registers.  Ins[x].VT is
1052 /// the value type of the value that will be stored in the register, so
1053 /// whatever SDNode we lower the argument to needs to be this type.
1054 ///
1055 /// In order to correctly lower the arguments we need to know the size of each
1056 /// argument.  Since Ins[x].VT gives us the size of the register that will
1057 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1058 /// for the original function argument so that we can deduce the correct memory
1059 /// type to use for Ins[x].  In most cases the correct memory type will be
1060 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
1061 /// we have a kernel argument of type v8i8, this argument will be split into
1062 /// 8 parts and each part will be represented by its own item in the Ins array.
1063 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1064 /// the argument before it was split.  From this, we deduce that the memory type
1065 /// for each individual part is i8.  We pass the memory type as LocVT to the
1066 /// calling convention analysis function and the register type (Ins[x].VT) as
1067 /// the ValVT.
1068 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1069   CCState &State,
1070   const SmallVectorImpl<ISD::InputArg> &Ins) const {
1071   const MachineFunction &MF = State.getMachineFunction();
1072   const Function &Fn = MF.getFunction();
1073   LLVMContext &Ctx = Fn.getParent()->getContext();
1074   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1075   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1076   CallingConv::ID CC = Fn.getCallingConv();
1077 
1078   Align MaxAlign = Align(1);
1079   uint64_t ExplicitArgOffset = 0;
1080   const DataLayout &DL = Fn.getParent()->getDataLayout();
1081 
1082   unsigned InIndex = 0;
1083 
1084   for (const Argument &Arg : Fn.args()) {
1085     const bool IsByRef = Arg.hasByRefAttr();
1086     Type *BaseArgTy = Arg.getType();
1087     Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1088     Align Alignment = DL.getValueOrABITypeAlignment(
1089         IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1090     MaxAlign = std::max(Alignment, MaxAlign);
1091     uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1092 
1093     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1094     ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1095 
1096     // We're basically throwing away everything passed into us and starting over
1097     // to get accurate in-memory offsets. The "PartOffset" is completely useless
1098     // to us as computed in Ins.
1099     //
1100     // We also need to figure out what type legalization is trying to do to get
1101     // the correct memory offsets.
1102 
1103     SmallVector<EVT, 16> ValueVTs;
1104     SmallVector<uint64_t, 16> Offsets;
1105     ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1106 
1107     for (unsigned Value = 0, NumValues = ValueVTs.size();
1108          Value != NumValues; ++Value) {
1109       uint64_t BasePartOffset = Offsets[Value];
1110 
1111       EVT ArgVT = ValueVTs[Value];
1112       EVT MemVT = ArgVT;
1113       MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1114       unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1115 
1116       if (NumRegs == 1) {
1117         // This argument is not split, so the IR type is the memory type.
1118         if (ArgVT.isExtended()) {
1119           // We have an extended type, like i24, so we should just use the
1120           // register type.
1121           MemVT = RegisterVT;
1122         } else {
1123           MemVT = ArgVT;
1124         }
1125       } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1126                  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1127         assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1128         // We have a vector value which has been split into a vector with
1129         // the same scalar type, but fewer elements.  This should handle
1130         // all the floating-point vector types.
1131         MemVT = RegisterVT;
1132       } else if (ArgVT.isVector() &&
1133                  ArgVT.getVectorNumElements() == NumRegs) {
1134         // This arg has been split so that each element is stored in a separate
1135         // register.
1136         MemVT = ArgVT.getScalarType();
1137       } else if (ArgVT.isExtended()) {
1138         // We have an extended type, like i65.
1139         MemVT = RegisterVT;
1140       } else {
1141         unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1142         assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1143         if (RegisterVT.isInteger()) {
1144           MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1145         } else if (RegisterVT.isVector()) {
1146           assert(!RegisterVT.getScalarType().isFloatingPoint());
1147           unsigned NumElements = RegisterVT.getVectorNumElements();
1148           assert(MemoryBits % NumElements == 0);
1149           // This vector type has been split into another vector type with
1150           // a different elements size.
1151           EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1152                                            MemoryBits / NumElements);
1153           MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1154         } else {
1155           llvm_unreachable("cannot deduce memory type.");
1156         }
1157       }
1158 
1159       // Convert one element vectors to scalar.
1160       if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1161         MemVT = MemVT.getScalarType();
1162 
1163       // Round up vec3/vec5 argument.
1164       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1165         assert(MemVT.getVectorNumElements() == 3 ||
1166                MemVT.getVectorNumElements() == 5 ||
1167                (MemVT.getVectorNumElements() >= 9 &&
1168                 MemVT.getVectorNumElements() <= 12));
1169         MemVT = MemVT.getPow2VectorType(State.getContext());
1170       } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1171         MemVT = MemVT.getRoundIntegerType(State.getContext());
1172       }
1173 
1174       unsigned PartOffset = 0;
1175       for (unsigned i = 0; i != NumRegs; ++i) {
1176         State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1177                                                BasePartOffset + PartOffset,
1178                                                MemVT.getSimpleVT(),
1179                                                CCValAssign::Full));
1180         PartOffset += MemVT.getStoreSize();
1181       }
1182     }
1183   }
1184 }
1185 
1186 SDValue AMDGPUTargetLowering::LowerReturn(
1187   SDValue Chain, CallingConv::ID CallConv,
1188   bool isVarArg,
1189   const SmallVectorImpl<ISD::OutputArg> &Outs,
1190   const SmallVectorImpl<SDValue> &OutVals,
1191   const SDLoc &DL, SelectionDAG &DAG) const {
1192   // FIXME: Fails for r600 tests
1193   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1194   // "wave terminate should not have return values");
1195   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1196 }
1197 
1198 //===---------------------------------------------------------------------===//
1199 // Target specific lowering
1200 //===---------------------------------------------------------------------===//
1201 
1202 /// Selects the correct CCAssignFn for a given CallingConvention value.
1203 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1204                                                     bool IsVarArg) {
1205   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1206 }
1207 
1208 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1209                                                       bool IsVarArg) {
1210   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1211 }
1212 
1213 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1214                                                   SelectionDAG &DAG,
1215                                                   MachineFrameInfo &MFI,
1216                                                   int ClobberedFI) const {
1217   SmallVector<SDValue, 8> ArgChains;
1218   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1219   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1220 
1221   // Include the original chain at the beginning of the list. When this is
1222   // used by target LowerCall hooks, this helps legalize find the
1223   // CALLSEQ_BEGIN node.
1224   ArgChains.push_back(Chain);
1225 
1226   // Add a chain value for each stack argument corresponding
1227   for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1228     if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1229       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1230         if (FI->getIndex() < 0) {
1231           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1232           int64_t InLastByte = InFirstByte;
1233           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1234 
1235           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1236               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1237             ArgChains.push_back(SDValue(L, 1));
1238         }
1239       }
1240     }
1241   }
1242 
1243   // Build a tokenfactor for all the chains.
1244   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1245 }
1246 
1247 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1248                                                  SmallVectorImpl<SDValue> &InVals,
1249                                                  StringRef Reason) const {
1250   SDValue Callee = CLI.Callee;
1251   SelectionDAG &DAG = CLI.DAG;
1252 
1253   const Function &Fn = DAG.getMachineFunction().getFunction();
1254 
1255   StringRef FuncName("<unknown>");
1256 
1257   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1258     FuncName = G->getSymbol();
1259   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1260     FuncName = G->getGlobal()->getName();
1261 
1262   DiagnosticInfoUnsupported NoCalls(
1263     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1264   DAG.getContext()->diagnose(NoCalls);
1265 
1266   if (!CLI.IsTailCall) {
1267     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1268       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1269   }
1270 
1271   return DAG.getEntryNode();
1272 }
1273 
1274 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1275                                         SmallVectorImpl<SDValue> &InVals) const {
1276   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1277 }
1278 
1279 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1280                                                       SelectionDAG &DAG) const {
1281   const Function &Fn = DAG.getMachineFunction().getFunction();
1282 
1283   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1284                                             SDLoc(Op).getDebugLoc());
1285   DAG.getContext()->diagnose(NoDynamicAlloca);
1286   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1287   return DAG.getMergeValues(Ops, SDLoc());
1288 }
1289 
1290 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1291                                              SelectionDAG &DAG) const {
1292   switch (Op.getOpcode()) {
1293   default:
1294     Op->print(errs(), &DAG);
1295     llvm_unreachable("Custom lowering code for this "
1296                      "instruction is not implemented yet!");
1297     break;
1298   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1299   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1300   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1301   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1302   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1303   case ISD::FREM: return LowerFREM(Op, DAG);
1304   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1305   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1306   case ISD::FRINT: return LowerFRINT(Op, DAG);
1307   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1308   case ISD::FROUNDEVEN:
1309     return LowerFROUNDEVEN(Op, DAG);
1310   case ISD::FROUND: return LowerFROUND(Op, DAG);
1311   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1312   case ISD::FLOG2:
1313     return LowerFLOG2(Op, DAG);
1314   case ISD::FLOG:
1315   case ISD::FLOG10:
1316     return LowerFLOGCommon(Op, DAG);
1317   case ISD::FEXP:
1318     return lowerFEXP(Op, DAG);
1319   case ISD::FEXP2:
1320     return lowerFEXP2(Op, DAG);
1321   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1322   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1323   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1324   case ISD::FP_TO_SINT:
1325   case ISD::FP_TO_UINT:
1326     return LowerFP_TO_INT(Op, DAG);
1327   case ISD::CTTZ:
1328   case ISD::CTTZ_ZERO_UNDEF:
1329   case ISD::CTLZ:
1330   case ISD::CTLZ_ZERO_UNDEF:
1331     return LowerCTLZ_CTTZ(Op, DAG);
1332   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1333   }
1334   return Op;
1335 }
1336 
1337 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1338                                               SmallVectorImpl<SDValue> &Results,
1339                                               SelectionDAG &DAG) const {
1340   switch (N->getOpcode()) {
1341   case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization disagree on which type of a
    // sign_extend_inreg should be checked for custom lowering. The type being
    // extended from is what really matters, but some places check the result
    // type instead. That results in ReplaceNodeResults being asked to
    // sext_in_reg to an illegal type, so just do nothing here and let the
    // illegal result integer be handled normally.
1348     return;
1349   case ISD::FLOG2:
1350     if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1351       Results.push_back(Lowered);
1352     return;
1353   case ISD::FLOG:
1354   case ISD::FLOG10:
1355     if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1356       Results.push_back(Lowered);
1357     return;
1358   case ISD::FEXP2:
1359     if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1360       Results.push_back(Lowered);
1361     return;
1362   case ISD::FEXP:
1363     if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1364       Results.push_back(Lowered);
1365     return;
1366   default:
1367     return;
1368   }
1369 }
1370 
1371 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1372                                                  SDValue Op,
1373                                                  SelectionDAG &DAG) const {
1374 
1375   const DataLayout &DL = DAG.getDataLayout();
1376   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1377   const GlobalValue *GV = G->getGlobal();
1378 
1379   if (!MFI->isModuleEntryFunction()) {
1380     if (std::optional<uint32_t> Address =
1381             AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
1382       return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1383     }
1384   }
1385 
1386   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1387       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1388     if (!MFI->isModuleEntryFunction() &&
1389         !GV->getName().equals("llvm.amdgcn.module.lds")) {
1390       SDLoc DL(Op);
1391       const Function &Fn = DAG.getMachineFunction().getFunction();
1392       DiagnosticInfoUnsupported BadLDSDecl(
1393         Fn, "local memory global used by non-kernel function",
1394         DL.getDebugLoc(), DS_Warning);
1395       DAG.getContext()->diagnose(BadLDSDecl);
1396 
1397       // We currently don't have a way to correctly allocate LDS objects that
1398       // aren't directly associated with a kernel. We do force inlining of
1399       // functions that use local objects. However, if these dead functions are
1400       // not eliminated, we don't want a compile time error. Just emit a warning
1401       // and a trap, since there should be no callable path here.
1402       SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1403       SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1404                                         Trap, DAG.getRoot());
1405       DAG.setRoot(OutputChain);
1406       return DAG.getUNDEF(Op.getValueType());
1407     }
1408 
1409     // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");
1412 
    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // Any initializer will be diagnosed as an error during assembly emission
    // anyway.
1416     unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1417     return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1418   }
1419   return SDValue();
1420 }
1421 
1422 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1423                                                   SelectionDAG &DAG) const {
1424   SmallVector<SDValue, 8> Args;
1425   SDLoc SL(Op);
1426 
1427   EVT VT = Op.getValueType();
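  // If the element type is smaller than 32 bits and each operand covers a
  // whole number of 32-bit words, concatenate as i32 instead: bitcast every
  // operand to i32 (or a vector of i32), build the wider i32 vector, and
  // bitcast the result back to the requested type.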
1428   if (VT.getVectorElementType().getSizeInBits() < 32) {
1429     unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1430     if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1431       unsigned NewNumElt = OpBitSize / 32;
1432       EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1433                                       : EVT::getVectorVT(*DAG.getContext(),
1434                                                          MVT::i32, NewNumElt);
1435       for (const SDUse &U : Op->ops()) {
1436         SDValue In = U.get();
1437         SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1438         if (NewNumElt > 1)
1439           DAG.ExtractVectorElements(NewIn, Args);
1440         else
1441           Args.push_back(NewIn);
1442       }
1443 
1444       EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1445                                    NewNumElt * Op.getNumOperands());
1446       SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1447       return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1448     }
1449   }
1450 
1451   for (const SDUse &U : Op->ops())
1452     DAG.ExtractVectorElements(U.get(), Args);
1453 
1454   return DAG.getBuildVector(Op.getValueType(), SL, Args);
1455 }
1456 
1457 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1458                                                      SelectionDAG &DAG) const {
1459   SDLoc SL(Op);
1460   SmallVector<SDValue, 8> Args;
1461   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1462   EVT VT = Op.getValueType();
1463   EVT SrcVT = Op.getOperand(0).getValueType();
1464 
1465   if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1466     unsigned NumElt = VT.getVectorNumElements();
1467     unsigned NumSrcElt = SrcVT.getVectorNumElements();
1468     assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1469 
1470     // Extract 32-bit registers at a time.
1471     EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1472     EVT NewVT = NumElt == 2
1473                     ? MVT::i32
1474                     : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1475     SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1476 
1477     DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1478     if (NumElt == 2)
1479       Tmp = Args[0];
1480     else
1481       Tmp = DAG.getBuildVector(NewVT, SL, Args);
1482 
1483     return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1484   }
1485 
1486   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1487                             VT.getVectorNumElements());
1488 
1489   return DAG.getBuildVector(Op.getValueType(), SL, Args);
1490 }
1491 
1492 // TODO: Handle fabs too
1493 static SDValue peekFNeg(SDValue Val) {
1494   if (Val.getOpcode() == ISD::FNEG)
1495     return Val.getOperand(0);
1496 
1497   return Val;
1498 }
1499 
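// Strip off any fneg, fabs, and fcopysign to return the underlying value.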
1500 static SDValue peekFPSignOps(SDValue Val) {
1501   if (Val.getOpcode() == ISD::FNEG)
1502     Val = Val.getOperand(0);
1503   if (Val.getOpcode() == ISD::FABS)
1504     Val = Val.getOperand(0);
1505   if (Val.getOpcode() == ISD::FCOPYSIGN)
1506     Val = Val.getOperand(0);
1507   return Val;
1508 }
1509 
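// Map a select over a floating-point compare onto FMIN_LEGACY / FMAX_LEGACY,
// permuting the operands as needed to preserve the legacy NaN behavior.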
1510 SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1511     const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1512     SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1513   SelectionDAG &DAG = DCI.DAG;
1514   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1515   switch (CCOpcode) {
1516   case ISD::SETOEQ:
1517   case ISD::SETONE:
1518   case ISD::SETUNE:
1519   case ISD::SETNE:
1520   case ISD::SETUEQ:
1521   case ISD::SETEQ:
1522   case ISD::SETFALSE:
1523   case ISD::SETFALSE2:
1524   case ISD::SETTRUE:
1525   case ISD::SETTRUE2:
1526   case ISD::SETUO:
1527   case ISD::SETO:
1528     break;
1529   case ISD::SETULE:
1530   case ISD::SETULT: {
1531     if (LHS == True)
1532       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1533     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1534   }
1535   case ISD::SETOLE:
1536   case ISD::SETOLT:
1537   case ISD::SETLE:
1538   case ISD::SETLT: {
1539     // Ordered. Assume ordered for undefined.
1540 
1541     // Only do this after legalization to avoid interfering with other combines
1542     // which might occur.
1543     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1544         !DCI.isCalledByLegalizer())
1545       return SDValue();
1546 
1547     // We need to permute the operands to get the correct NaN behavior. The
1548     // selected operand is the second one based on the failing compare with NaN,
1549     // so permute it based on the compare type the hardware uses.
1550     if (LHS == True)
1551       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1552     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1553   }
1554   case ISD::SETUGE:
1555   case ISD::SETUGT: {
1556     if (LHS == True)
1557       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1558     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1559   }
1560   case ISD::SETGT:
1561   case ISD::SETGE:
1562   case ISD::SETOGE:
1563   case ISD::SETOGT: {
1564     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1565         !DCI.isCalledByLegalizer())
1566       return SDValue();
1567 
1568     if (LHS == True)
1569       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1570     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1571   }
1572   case ISD::SETCC_INVALID:
1573     llvm_unreachable("Invalid setcc condcode!");
1574   }
1575   return SDValue();
1576 }
1577 
1578 /// Generate Min/Max node
1579 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1580                                                    SDValue LHS, SDValue RHS,
1581                                                    SDValue True, SDValue False,
1582                                                    SDValue CC,
1583                                                    DAGCombinerInfo &DCI) const {
1584   if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1585     return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1586 
1587   SelectionDAG &DAG = DCI.DAG;
1588 
1589   // If we can't directly match this, try to see if we can fold an fneg to
1590   // match.
1591 
1592   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1593   ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1594   SDValue NegTrue = peekFNeg(True);
1595 
1596   // Undo the combine foldFreeOpFromSelect does if it helps us match the
1597   // fmin/fmax.
1598   //
1599   // select (fcmp olt (lhs, K)), (fneg lhs), -K
1600   // -> fneg (fmin_legacy lhs, K)
1601   //
1602   // TODO: Use getNegatedExpression
1603   if (LHS == NegTrue && CFalse && CRHS) {
1604     APFloat NegRHS = neg(CRHS->getValueAPF());
1605     if (NegRHS == CFalse->getValueAPF()) {
1606       SDValue Combined =
1607           combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1608       if (Combined)
1609         return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1610       return SDValue();
1611     }
1612   }
1613 
1614   return SDValue();
1615 }
1616 
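// Bitcast a 64-bit value to v2i32 and return the low and high 32-bit halves.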
1617 std::pair<SDValue, SDValue>
1618 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1619   SDLoc SL(Op);
1620 
1621   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1622 
1623   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1624   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1625 
1626   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1627   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1628 
1629   return std::pair(Lo, Hi);
1630 }
1631 
1632 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1633   SDLoc SL(Op);
1634 
1635   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1636   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1637   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1638 }
1639 
1640 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1641   SDLoc SL(Op);
1642 
1643   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1644   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1645   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1646 }
1647 
1648 // Split a vector type into two parts. The first part is a power of two vector.
1649 // The second part is whatever is left over, and is a scalar if it would
1650 // otherwise be a 1-vector.
1651 std::pair<EVT, EVT>
1652 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1653   EVT LoVT, HiVT;
1654   EVT EltVT = VT.getVectorElementType();
1655   unsigned NumElts = VT.getVectorNumElements();
1656   unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1657   LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1658   HiVT = NumElts - LoNumElts == 1
1659              ? EltVT
1660              : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1661   return std::pair(LoVT, HiVT);
1662 }
1663 
1664 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1665 // scalar.
1666 std::pair<SDValue, SDValue>
1667 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1668                                   const EVT &LoVT, const EVT &HiVT,
1669                                   SelectionDAG &DAG) const {
1670   assert(LoVT.getVectorNumElements() +
1671                  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1672              N.getValueType().getVectorNumElements() &&
1673          "More vector elements requested than available!");
1674   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1675                            DAG.getVectorIdxConstant(0, DL));
1676   SDValue Hi = DAG.getNode(
1677       HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1678       HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1679   return std::pair(Lo, Hi);
1680 }
1681 
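// Split a vector load into two smaller loads (the second may be a scalar
// load), then merge the loaded values and their chains back together.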
1682 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1683                                               SelectionDAG &DAG) const {
1684   LoadSDNode *Load = cast<LoadSDNode>(Op);
1685   EVT VT = Op.getValueType();
1686   SDLoc SL(Op);
1687 
1689   // If this is a 2 element vector, we really want to scalarize and not create
1690   // weird 1 element vectors.
1691   if (VT.getVectorNumElements() == 2) {
1692     SDValue Ops[2];
1693     std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1694     return DAG.getMergeValues(Ops, SL);
1695   }
1696 
1697   SDValue BasePtr = Load->getBasePtr();
1698   EVT MemVT = Load->getMemoryVT();
1699 
1700   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1701 
1702   EVT LoVT, HiVT;
1703   EVT LoMemVT, HiMemVT;
1704   SDValue Lo, Hi;
1705 
1706   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1707   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1708   std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1709 
1710   unsigned Size = LoMemVT.getStoreSize();
1711   Align BaseAlign = Load->getAlign();
1712   Align HiAlign = commonAlignment(BaseAlign, Size);
1713 
1714   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1715                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1716                                   BaseAlign, Load->getMemOperand()->getFlags());
1717   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1718   SDValue HiLoad =
1719       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1720                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1721                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1722 
1723   SDValue Join;
1724   if (LoVT == HiVT) {
    // This is the case where the vector length is a power of two, so it was
    // evenly split.
1726     Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1727   } else {
1728     Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1729                        DAG.getVectorIdxConstant(0, SL));
1730     Join = DAG.getNode(
1731         HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1732         VT, Join, HiLoad,
1733         DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1734   }
1735 
1736   SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1737                                      LoLoad.getValue(1), HiLoad.getValue(1))};
1738 
1739   return DAG.getMergeValues(Ops, SL);
1740 }
1741 
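// Widen a vec3 load to vec4 when it is sufficiently aligned or known
// dereferenceable; otherwise fall back to splitting the load.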
1742 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1743                                                      SelectionDAG &DAG) const {
1744   LoadSDNode *Load = cast<LoadSDNode>(Op);
1745   EVT VT = Op.getValueType();
1746   SDValue BasePtr = Load->getBasePtr();
1747   EVT MemVT = Load->getMemoryVT();
1748   SDLoc SL(Op);
1749   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1750   Align BaseAlign = Load->getAlign();
1751   unsigned NumElements = MemVT.getVectorNumElements();
1752 
1753   // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1754   // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1755   if (NumElements != 3 ||
1756       (BaseAlign < Align(8) &&
1757        !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1758     return SplitVectorLoad(Op, DAG);
1759 
1760   assert(NumElements == 3);
1761 
1762   EVT WideVT =
1763       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1764   EVT WideMemVT =
1765       EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1766   SDValue WideLoad = DAG.getExtLoad(
1767       Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1768       WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1769   return DAG.getMergeValues(
1770       {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1771                    DAG.getVectorIdxConstant(0, SL)),
1772        WideLoad.getValue(1)},
1773       SL);
1774 }
1775 
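// Split a vector store into two smaller stores of the low and high halves,
// joined by a token factor.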
1776 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1777                                                SelectionDAG &DAG) const {
1778   StoreSDNode *Store = cast<StoreSDNode>(Op);
1779   SDValue Val = Store->getValue();
1780   EVT VT = Val.getValueType();
1781 
1782   // If this is a 2 element vector, we really want to scalarize and not create
1783   // weird 1 element vectors.
1784   if (VT.getVectorNumElements() == 2)
1785     return scalarizeVectorStore(Store, DAG);
1786 
1787   EVT MemVT = Store->getMemoryVT();
1788   SDValue Chain = Store->getChain();
1789   SDValue BasePtr = Store->getBasePtr();
1790   SDLoc SL(Op);
1791 
1792   EVT LoVT, HiVT;
1793   EVT LoMemVT, HiMemVT;
1794   SDValue Lo, Hi;
1795 
1796   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1797   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1798   std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1799 
1800   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1801 
1802   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1803   Align BaseAlign = Store->getAlign();
1804   unsigned Size = LoMemVT.getStoreSize();
1805   Align HiAlign = commonAlignment(BaseAlign, Size);
1806 
1807   SDValue LoStore =
1808       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1809                         Store->getMemOperand()->getFlags());
1810   SDValue HiStore =
1811       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1812                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1813 
1814   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1815 }
1816 
1817 // This is a shortcut for integer division because we have fast i32<->f32
1818 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1819 // float is enough to accurately represent up to a 24-bit signed integer.
1820 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1821                                             bool Sign) const {
1822   SDLoc DL(Op);
1823   EVT VT = Op.getValueType();
1824   SDValue LHS = Op.getOperand(0);
1825   SDValue RHS = Op.getOperand(1);
1826   MVT IntVT = MVT::i32;
1827   MVT FltVT = MVT::f32;
1828 
1829   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1830   if (LHSSignBits < 9)
1831     return SDValue();
1832 
1833   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1834   if (RHSSignBits < 9)
1835     return SDValue();
1836 
1837   unsigned BitSize = VT.getSizeInBits();
1838   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1839   unsigned DivBits = BitSize - SignBits;
1840   if (Sign)
1841     ++DivBits;
1842 
1843   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1844   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1845 
1846   SDValue jq = DAG.getConstant(1, DL, IntVT);
1847 
1848   if (Sign) {
1849     // char|short jq = ia ^ ib;
1850     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1851 
1852     // jq = jq >> (bitsize - 2)
1853     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1854                      DAG.getConstant(BitSize - 2, DL, VT));
1855 
1856     // jq = jq | 0x1
1857     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1858   }
1859 
1860   // int ia = (int)LHS;
1861   SDValue ia = LHS;
1862 
  // int ib = (int)RHS;
1864   SDValue ib = RHS;
1865 
1866   // float fa = (float)ia;
1867   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1868 
1869   // float fb = (float)ib;
1870   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1871 
1872   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1873                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1874 
1875   // fq = trunc(fq);
1876   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1877 
1878   // float fqneg = -fq;
1879   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1880 
1881   MachineFunction &MF = DAG.getMachineFunction();
1882 
1883   bool UseFmadFtz = false;
1884   if (Subtarget->isGCN()) {
1885     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1886     UseFmadFtz =
1887         MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1888   }
1889 
1890   // float fr = mad(fqneg, fb, fa);
1891   unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1892                     : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1893                                  : (unsigned)ISD::FMAD;
1894   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1895 
1896   // int iq = (int)fq;
1897   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1898 
1899   // fr = fabs(fr);
1900   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1901 
1902   // fb = fabs(fb);
1903   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1904 
1905   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1906 
1907   // int cv = fr >= fb;
1908   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1909 
1910   // jq = (cv ? jq : 0);
1911   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1912 
1913   // dst = iq + jq;
1914   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1915 
  // Rem needs compensation; it's easier to recompute it.
1917   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1918   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1919 
  // Truncate the results to the number of bits this divide really needs.
1921   if (Sign) {
1922     SDValue InRegSize
1923       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1924     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1925     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1926   } else {
1927     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1928     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1929     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1930   }
1931 
1932   return DAG.getMergeValues({ Div, Rem }, DL);
1933 }
1934 
1935 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1936                                       SelectionDAG &DAG,
1937                                       SmallVectorImpl<SDValue> &Results) const {
1938   SDLoc DL(Op);
1939   EVT VT = Op.getValueType();
1940 
1941   assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1942 
1943   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1944 
1945   SDValue One = DAG.getConstant(1, DL, HalfVT);
1946   SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1947 
  // HiLo split
1949   SDValue LHS_Lo, LHS_Hi;
1950   SDValue LHS = Op.getOperand(0);
1951   std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
1952 
1953   SDValue RHS_Lo, RHS_Hi;
1954   SDValue RHS = Op.getOperand(1);
1955   std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
1956 
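  // If the upper 32 bits of both operands are known zero, a single 32-bit
  // udivrem of the low halves is enough; zero-extend the results to 64 bits.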
1957   if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1958       DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1959 
1960     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1961                               LHS_Lo, RHS_Lo);
1962 
1963     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1964     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1965 
1966     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1967     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1968     return;
1969   }
1970 
1971   if (isTypeLegal(MVT::i64)) {
1972     // The algorithm here is based on ideas from "Software Integer Division",
1973     // Tom Rodeheffer, August 2008.
1974 
1975     MachineFunction &MF = DAG.getMachineFunction();
1976     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1977 
1978     // Compute denominator reciprocal.
1979     unsigned FMAD =
1980         !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1981         : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
1982             ? (unsigned)ISD::FMAD
1983             : (unsigned)AMDGPUISD::FMAD_FTZ;
1984 
1985     SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1986     SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1987     SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1988       DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1989       Cvt_Lo);
1990     SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1991     SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1992       DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1993     SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1994       DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1995     SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1996     SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1997       DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1998       Mul1);
1999     SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2000     SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2001     SDValue Rcp64 = DAG.getBitcast(VT,
2002                         DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2003 
2004     SDValue Zero64 = DAG.getConstant(0, DL, VT);
2005     SDValue One64  = DAG.getConstant(1, DL, VT);
2006     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2007     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2008 
2009     // First round of UNR (Unsigned integer Newton-Raphson).
2010     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2011     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2012     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2013     SDValue Mulhi1_Lo, Mulhi1_Hi;
2014     std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2015         DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2016     SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2017                                   Mulhi1_Lo, Zero1);
2018     SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2019                                   Mulhi1_Hi, Add1_Lo.getValue(1));
2020     SDValue Add1 = DAG.getBitcast(VT,
2021                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2022 
2023     // Second round of UNR.
2024     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2025     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2026     SDValue Mulhi2_Lo, Mulhi2_Hi;
2027     std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2028         DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2029     SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2030                                   Mulhi2_Lo, Zero1);
2031     SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2032                                   Mulhi2_Hi, Add2_Lo.getValue(1));
2033     SDValue Add2 = DAG.getBitcast(VT,
2034                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2035 
2036     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2037 
2038     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2039 
2040     SDValue Mul3_Lo, Mul3_Hi;
2041     std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2042     SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2043                                   Mul3_Lo, Zero1);
2044     SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2045                                   Mul3_Hi, Sub1_Lo.getValue(1));
2046     SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2047     SDValue Sub1 = DAG.getBitcast(VT,
2048                         DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2049 
2050     SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2051     SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2052                                  ISD::SETUGE);
2053     SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2054                                  ISD::SETUGE);
2055     SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2056 
    // TODO: Here and below, portions of the code could be enclosed in
    // if/endif. Currently control flow is unconditional and we have 4 selects
    // after the potential endif to substitute for PHIs.
2060 
2061     // if C3 != 0 ...
2062     SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2063                                   RHS_Lo, Zero1);
2064     SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2065                                   RHS_Hi, Sub1_Lo.getValue(1));
2066     SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2067                                   Zero, Sub2_Lo.getValue(1));
2068     SDValue Sub2 = DAG.getBitcast(VT,
2069                         DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2070 
2071     SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2072 
2073     SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2074                                  ISD::SETUGE);
2075     SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2076                                  ISD::SETUGE);
2077     SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2078 
2079     // if (C6 != 0)
2080     SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2081 
2082     SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2083                                   RHS_Lo, Zero1);
2084     SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2085                                   RHS_Hi, Sub2_Lo.getValue(1));
2086     SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2087                                   Zero, Sub3_Lo.getValue(1));
2088     SDValue Sub3 = DAG.getBitcast(VT,
2089                         DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2090 
2091     // endif C6
2092     // endif C3
2093 
2094     SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2095     SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2096 
2097     SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2098     SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2099 
2100     Results.push_back(Div);
2101     Results.push_back(Rem);
2102 
2103     return;
2104   }
2105 
  // r600 expansion.
  // Get speculative values.
2108   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2109   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2110 
2111   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2112   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2113   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2114 
2115   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2116   SDValue DIV_Lo = Zero;
2117 
2118   const unsigned halfBitWidth = HalfVT.getSizeInBits();
2119 
2120   for (unsigned i = 0; i < halfBitWidth; ++i) {
2121     const unsigned bitPos = halfBitWidth - i - 1;
2122     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2123     // Get value of high bit
2124     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2125     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2126     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2127 
2128     // Shift
2129     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2130     // Add LHS high bit
2131     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2132 
2133     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2134     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2135 
2136     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2137 
2138     // Update REM
2139     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2140     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2141   }
2142 
2143   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2144   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2145   Results.push_back(DIV);
2146   Results.push_back(REM);
2147 }
2148 
2149 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2150                                            SelectionDAG &DAG) const {
2151   SDLoc DL(Op);
2152   EVT VT = Op.getValueType();
2153 
2154   if (VT == MVT::i64) {
2155     SmallVector<SDValue, 2> Results;
2156     LowerUDIVREM64(Op, DAG, Results);
2157     return DAG.getMergeValues(Results, DL);
2158   }
2159 
2160   if (VT == MVT::i32) {
2161     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2162       return Res;
2163   }
2164 
2165   SDValue X = Op.getOperand(0);
2166   SDValue Y = Op.getOperand(1);
2167 
2168   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2169   // algorithm used here.
2170 
2171   // Initial estimate of inv(y).
2172   SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2173 
2174   // One round of UNR.
2175   SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2176   SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2177   Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2178                   DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2179 
2180   // Quotient/remainder estimate.
2181   SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2182   SDValue R =
2183       DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2184 
2185   // First quotient/remainder refinement.
2186   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2187   SDValue One = DAG.getConstant(1, DL, VT);
2188   SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2189   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2190                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2191   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2192                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2193 
2194   // Second quotient/remainder refinement.
2195   Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2196   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2197                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2198   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2199                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2200 
2201   return DAG.getMergeValues({Q, R}, DL);
2202 }
2203 
2204 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2205                                            SelectionDAG &DAG) const {
2206   SDLoc DL(Op);
2207   EVT VT = Op.getValueType();
2208 
2209   SDValue LHS = Op.getOperand(0);
2210   SDValue RHS = Op.getOperand(1);
2211 
2212   SDValue Zero = DAG.getConstant(0, DL, VT);
2213   SDValue NegOne = DAG.getConstant(-1, DL, VT);
2214 
2215   if (VT == MVT::i32) {
2216     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2217       return Res;
2218   }
2219 
2220   if (VT == MVT::i64 &&
2221       DAG.ComputeNumSignBits(LHS) > 32 &&
2222       DAG.ComputeNumSignBits(RHS) > 32) {
2223     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2224 
    // HiLo split
2226     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2227     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2228     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2229                                  LHS_Lo, RHS_Lo);
2230     SDValue Res[2] = {
2231       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2232       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2233     };
2234     return DAG.getMergeValues(Res, DL);
2235   }
2236 
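  // Compute |LHS| and |RHS| as (x + sign) ^ sign, where sign is 0 or -1, do
  // an unsigned divrem, and then restore the signs the same way: the quotient
  // takes the XOR of the operand signs and the remainder takes the sign of
  // the LHS.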
2237   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2238   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2239   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2240   SDValue RSign = LHSign; // Remainder sign is the same as LHS
2241 
2242   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2243   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2244 
2245   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2246   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2247 
2248   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2249   SDValue Rem = Div.getValue(1);
2250 
2251   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2252   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2253 
2254   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2255   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2256 
2257   SDValue Res[2] = {
2258     Div,
2259     Rem
2260   };
2261   return DAG.getMergeValues(Res, DL);
2262 }
2263 
2264 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2265 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2266   SDLoc SL(Op);
2267   EVT VT = Op.getValueType();
2268   auto Flags = Op->getFlags();
2269   SDValue X = Op.getOperand(0);
2270   SDValue Y = Op.getOperand(1);
2271 
2272   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2273   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2274   SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2275   // TODO: For f32 use FMAD instead if !hasFastFMA32?
2276   return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2277 }
2278 
2279 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2280   SDLoc SL(Op);
2281   SDValue Src = Op.getOperand(0);
2282 
2283   // result = trunc(src)
2284   // if (src > 0.0 && src != result)
2285   //   result += 1.0
2286 
2287   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2288 
2289   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2290   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2291 
2292   EVT SetCCVT =
2293       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2294 
2295   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2296   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2297   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2298 
2299   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2300   // TODO: Should this propagate fast-math-flags?
2301   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2302 }
2303 
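// Extract the 11-bit biased exponent from the high 32 bits of an f64 and
// subtract the bias (1023) to produce the unbiased exponent.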
2304 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2305                                   SelectionDAG &DAG) {
2306   const unsigned FractBits = 52;
2307   const unsigned ExpBits = 11;
2308 
2309   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2310                                 Hi,
2311                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2312                                 DAG.getConstant(ExpBits, SL, MVT::i32));
2313   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2314                             DAG.getConstant(1023, SL, MVT::i32));
2315 
2316   return Exp;
2317 }
2318 
2319 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2320   SDLoc SL(Op);
2321   SDValue Src = Op.getOperand(0);
2322 
2323   assert(Op.getValueType() == MVT::f64);
2324 
2325   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2326 
2327   // Extract the upper half, since this is where we will find the sign and
2328   // exponent.
2329   SDValue Hi = getHiHalf64(Src, DAG);
2330 
2331   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2332 
2333   const unsigned FractBits = 52;
2334 
2335   // Extract the sign bit.
2336   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2337   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2338 
2339   // Extend back to 64-bits.
2340   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2341   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2342 
2343   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2344   const SDValue FractMask
2345     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2346 
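  // Shift the fraction mask right by the exponent, then clear the bits it
  // still covers; this discards the fractional part of the value.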
2347   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2348   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2349   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2350 
2351   EVT SetCCVT =
2352       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2353 
2354   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2355 
2356   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2357   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2358 
2359   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2360   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2361 
2362   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2363 }
2364 
2365 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2366   SDLoc SL(Op);
2367   SDValue Src = Op.getOperand(0);
2368 
2369   assert(Op.getValueType() == MVT::f64);
2370 
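  // Round by adding and then subtracting 2^52, copysigned to the source. The
  // final select passes through inputs whose magnitude exceeds
  // 0x1.fffffffffffffp+51, since those are already integral.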
2371   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2372   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2373   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2374 
2375   // TODO: Should this propagate fast-math-flags?
2376 
2377   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2378   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2379 
2380   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2381 
2382   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2383   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2384 
2385   EVT SetCCVT =
2386       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2387   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2388 
2389   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2390 }
2391 
2392 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2393   // FNEARBYINT and FRINT are the same, except in their handling of FP
2394   // exceptions. Those aren't really meaningful for us, and OpenCL only has
2395   // rint, so just treat them as equivalent.
2396   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2397 }
2398 
2399 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2400                                               SelectionDAG &DAG) const {
2401   auto VT = Op.getValueType();
2402   auto Arg = Op.getOperand(0u);
2403   return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Arg);
2404 }
2405 
2406 // XXX - May require not supporting f32 denormals?
2407 
2408 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2409 // compare and vselect end up producing worse code than scalarizing the whole
2410 // operation.
2411 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2412   SDLoc SL(Op);
2413   SDValue X = Op.getOperand(0);
2414   EVT VT = Op.getValueType();
2415 
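  // round(x) = trunc(x) + (|x - trunc(x)| >= 0.5 ? copysign(1.0, x) : 0.0)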
2416   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2417 
2418   // TODO: Should this propagate fast-math-flags?
2419 
2420   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2421 
2422   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2423 
2424   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2425   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2426   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2427 
2428   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2429 
2430   EVT SetCCVT =
2431       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2432 
2433   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2434 
2435   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2436 
2437   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2438 }
2439 
2440 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2441   SDLoc SL(Op);
2442   SDValue Src = Op.getOperand(0);
2443 
2444   // result = trunc(src);
2445   // if (src < 0.0 && src != result)
2446   //   result += -1.0.
2447 
2448   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2449 
2450   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2451   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2452 
2453   EVT SetCCVT =
2454       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2455 
2456   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2457   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2458   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2459 
2460   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2461   // TODO: Should this propagate fast-math-flags?
2462   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2463 }
2464 
2465 /// Return true if it's known that \p Src can never be an f32 denormal value.
2466 static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2467   switch (Src.getOpcode()) {
2468   case ISD::FP_EXTEND:
2469     return Src.getOperand(0).getValueType() == MVT::f16;
2470   case ISD::FP16_TO_FP:
2471     return true;
2472   default:
2473     return false;
2474   }
2475 
2476   llvm_unreachable("covered opcode switch");
2477 }
2478 
2479 static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags) {
2480   if (Flags.hasApproximateFuncs())
2481     return true;
2482   auto &Options = DAG.getTarget().Options;
2483   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2484 }
2485 
2486 static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src,
2487                                    SDNodeFlags Flags) {
2488   return !valueIsKnownNeverF32Denorm(Src) &&
2489          DAG.getMachineFunction()
2490                  .getDenormalMode(APFloat::IEEEsingle())
2491                  .Input != DenormalMode::PreserveSign;
2492 }
2493 
2494 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2495                                                     SDValue Src,
2496                                                     SDNodeFlags Flags) const {
2497   SDLoc SL(Src);
2498   EVT VT = Src.getValueType();
2499   const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2500   SDValue SmallestNormal =
2501       DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2502 
2503   // Want to scale denormals up, but negatives and 0 work just as well on the
2504   // scaled path.
2505   SDValue IsLtSmallestNormal = DAG.getSetCC(
2506       SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2507       SmallestNormal, ISD::SETOLT);
2508 
2509   return IsLtSmallestNormal;
2510 }
2511 
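// Return a compare that is true when Src is finite, i.e. |Src| < +infinity
// (NaN compares false).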
2512 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2513                                           SDNodeFlags Flags) const {
2514   SDLoc SL(Src);
2515   EVT VT = Src.getValueType();
2516   const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2517   SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2518 
2519   SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2520   SDValue IsFinite = DAG.getSetCC(
2521       SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2522       Inf, ISD::SETOLT);
2523   return IsFinite;
2524 }
2525 
/// If denormal handling is required, return the scaled input to FLOG2 and the
/// check for the denormal range. Otherwise, return null values.
2528 std::pair<SDValue, SDValue>
2529 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2530                                         SDValue Src, SDNodeFlags Flags) const {
2531   if (!needsDenormHandlingF32(DAG, Src, Flags))
2532     return {};
2533 
2534   MVT VT = MVT::f32;
2535   const fltSemantics &Semantics = APFloat::IEEEsingle();
2536   SDValue SmallestNormal =
2537       DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2538 
2539   SDValue IsLtSmallestNormal = DAG.getSetCC(
2540       SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2541       SmallestNormal, ISD::SETOLT);
2542 
2543   SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2544   SDValue One = DAG.getConstantFP(1.0, SL, VT);
2545   SDValue ScaleFactor =
2546       DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2547 
2548   SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2549   return {ScaledInput, IsLtSmallestNormal};
2550 }
2551 
2552 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2553   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2554   // If we have to handle denormals, scale up the input and adjust the result.
2555 
2556   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2557   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2558 
2559   SDLoc SL(Op);
2560   EVT VT = Op.getValueType();
2561   SDValue Src = Op.getOperand(0);
2562   SDNodeFlags Flags = Op->getFlags();
2563 
2564   if (VT == MVT::f16) {
2565     // Nothing in half is a denormal when promoted to f32.
2566     assert(!Subtarget->has16BitInsts());
2567     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2568     SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2569     return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2570                        DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2571   }
2572 
2573   auto [ScaledInput, IsLtSmallestNormal] =
2574       getScaledLogInput(DAG, SL, Src, Flags);
2575   if (!ScaledInput)
2576     return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2577 
2578   SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2579 
2580   SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2581   SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2582   SDValue ResultOffset =
2583       DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2584   return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2585 }
2586 
2587 static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2588                       SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2589   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2590   return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2591 }
2592 
2593 SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2594                                               SelectionDAG &DAG) const {
2595   SDValue X = Op.getOperand(0);
2596   EVT VT = Op.getValueType();
2597   SDNodeFlags Flags = Op->getFlags();
2598   SDLoc DL(Op);
2599 
2600   const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2601   assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2602 
2603   const auto &Options = getTargetMachine().Options;
2604   if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2605       Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2606 
2607     if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2608       // Log and multiply in f32 is good enough for f16.
2609       X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2610     }
2611 
2612     SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2613     if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2614       return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2615                          DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2616     }
2617 
2618     return Lowered;
2619   }
2620 
2621   auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2622   if (ScaledInput)
2623     X = ScaledInput;
2624 
2625   SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2626 
2627   SDValue R;
2628   if (Subtarget->hasFastFMAF32()) {
2629     // c+cc are ln(2)/ln(10) to more than 49 bits
2630     const float c_log10 = 0x1.344134p-2f;
2631     const float cc_log10 = 0x1.09f79ep-26f;
2632 
2633     // c + cc is ln(2) to more than 49 bits
2634     const float c_log = 0x1.62e42ep-1f;
2635     const float cc_log = 0x1.efa39ep-25f;
2636 
2637     SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2638     SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2639 
2640     R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2641     SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2642     SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2643     SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2644     R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2645   } else {
2646     // ch+ct is ln(2)/ln(10) to more than 36 bits
2647     const float ch_log10 = 0x1.344000p-2f;
2648     const float ct_log10 = 0x1.3509f6p-18f;
2649 
2650     // ch + ct is ln(2) to more than 36 bits
2651     const float ch_log = 0x1.62e000p-1f;
2652     const float ct_log = 0x1.0bfbe8p-15f;
2653 
2654     SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2655     SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2656 
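    // Split Y into a high part with the low 12 mantissa bits cleared and a
    // low-order remainder, so the partial products against the split constants
    // below stay (nearly) exact.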
2657     SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2658     SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2659     SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2660     SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2661     SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2662 
2663     SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2664     SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2665     SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2666     R = getMad(DAG, DL, VT, YH, CH, Mad1);
2667   }
2668 
2669   const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2670                             (Flags.hasNoInfs() || Options.NoInfsFPMath);
2671 
2672   // TODO: Check if known finite from source value.
2673   if (!IsFiniteOnly) {
2674     SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2675     R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2676   }
2677 
2678   if (IsScaled) {
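    // If the input was below the smallest normal, getScaledLogInput scaled it
    // up by 2^32; compensate by subtracting 32 * log10(2) or 32 * ln(2) (the
    // ShiftK constants below), selected only when the scaling actually
    // happened.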
2679     SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2680     SDValue ShiftK =
2681         DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2682     SDValue Shift =
2683         DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2684     R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2685   }
2686 
2687   return R;
2688 }
2689 
2690 SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2691   return LowerFLOGCommon(Op, DAG);
2692 }
2693 
// Do f32 fast math expansion for flog or flog10. This is accurate enough for a
// promoted f16 operation.
2696 SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2697                                               SelectionDAG &DAG, bool IsLog10,
2698                                               SDNodeFlags Flags) const {
2699   EVT VT = Src.getValueType();
2700   unsigned LogOp = VT == MVT::f32 ? AMDGPUISD::LOG : ISD::FLOG2;
2701 
2702   double Log2BaseInverted =
2703       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2704 
2705   if (VT == MVT::f32) {
2706     auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2707     if (ScaledInput) {
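      // If the input was below the smallest normal it has been scaled up by
      // 2^32, so log(x) = (log2(2^32 * x) - 32) * Log2BaseInverted; the
      // -32 * Log2BaseInverted correction is selected in via ResultOffset
      // below only when the scaling actually happened.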
2708       SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2709       SDValue ScaledResultOffset =
2710           DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2711 
2712       SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2713 
2714       SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2715                                          ScaledResultOffset, Zero, Flags);
2716 
2717       SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2718 
2719       if (Subtarget->hasFastFMAF32())
2720         return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2721                            Flags);
2722       SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2723       return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2724     }
2725   }
2726 
2727   SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2728   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2729 
2730   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2731                      Flags);
2732 }
2733 
2734 SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2735   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2736   // If we have to handle denormals, scale up the input and adjust the result.
2737 
2738   SDLoc SL(Op);
2739   EVT VT = Op.getValueType();
2740   SDValue Src = Op.getOperand(0);
2741   SDNodeFlags Flags = Op->getFlags();
2742 
2743   if (VT == MVT::f16) {
2744     // Nothing in half is a denormal when promoted to f32.
2745     assert(!Subtarget->has16BitInsts());
2746     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
    SDValue Exp = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Exp,
2749                        DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2750   }
2751 
2752   assert(VT == MVT::f32);
2753 
2754   if (!needsDenormHandlingF32(DAG, Src, Flags))
2755     return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2756 
  // bool s = x < -0x1.f80000p+6f; // needs_scaling
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
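  //
  // For example, for x = -140.0 (whose exp2 underflows into the denormal
  // range), this computes v_exp_f32(-140 + 64) * 2^-64 = 2^-76 * 2^-64 =
  // 2^-140, keeping the v_exp_f32 intermediate within the normal range.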
2759 
2760   // -nextafter(128.0, -1)
2761   SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2762 
2763   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2764 
2765   SDValue NeedsScaling =
2766       DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2767 
2768   SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2769   SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2770 
2771   SDValue AddOffset =
2772       DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2773 
2774   SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2775   SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2776 
2777   SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2778   SDValue One = DAG.getConstantFP(1.0, SL, VT);
2779   SDValue ResultScale =
2780       DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2781 
2782   return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2783 }
2784 
2785 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue Op, const SDLoc &SL,
2786                                               SelectionDAG &DAG,
2787                                               SDNodeFlags Flags) const {
2788   // exp2(M_LOG2E_F * f);
2789   EVT VT = Op.getValueType();
2790   const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2791   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Op, K, Flags);
2792   return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT, Mul,
2793                      Flags);
2794 }
2795 
2796 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2797   EVT VT = Op.getValueType();
2798   SDLoc SL(Op);
2799   SDValue X = Op.getOperand(0);
2800   SDNodeFlags Flags = Op->getFlags();
2801   const bool IsExp10 = false; // TODO: For some reason exp10 is missing
2802 
2803   if (VT.getScalarType() == MVT::f16) {
2804     // v_exp_f16 (fmul x, log2e)
2805     if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2806       return lowerFEXPUnsafe(X, SL, DAG, Flags);
2807 
2808     if (VT.isVector())
2809       return SDValue();
2810 
2811     // exp(f16 x) ->
2812     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2813 
2814     // Nothing in half is a denormal when promoted to f32.
2815     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2816     SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2817     return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2818                        DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2819   }
2820 
2821   assert(VT == MVT::f32);
2822 
2823   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2824   // library behavior. Also, is known-not-daz source sufficient?
2825   if (allowApproxFunc(DAG, Flags) && !needsDenormHandlingF32(DAG, X, Flags)) {
2826     assert(!IsExp10 && "todo exp10 support");
2827     return lowerFEXPUnsafe(X, SL, DAG, Flags);
2828   }
2829 
2830   //    Algorithm:
2831   //
2832   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
2833   //
2834   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
2835   //    n = 64*m + j,   0 <= j < 64
2836   //
2837   //    e^x = 2^((64*m + j + f)/64)
2838   //        = (2^m) * (2^(j/64)) * 2^(f/64)
2839   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
2840   //
2841   //    f = x*(64/ln(2)) - n
2842   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
2843   //
2844   //    e^x = (2^m) * (2^(j/64)) * e^r
2845   //
2846   //    (2^(j/64)) is precomputed
2847   //
2848   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2849   //    e^r = 1 + q
2850   //
2851   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2852   //
2853   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
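  //
  // In terms of the nodes emitted below, the expansion is roughly:
  //
  //    ph + pl ~= x * log2(e)   (a head product plus a tail correction)
  //    n  = rint(ph)
  //    r  = (ph - n) + pl
  //    e^x ~= ldexp(v_exp_f32(r), (int)n)
  //
  // with the final selects clamping the result to 0 on underflow and to +inf
  // on overflow (unless no-infs is in effect).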
2854   SDNodeFlags FlagsNoContract = Flags;
2855   FlagsNoContract.setAllowContract(false);
2856 
2857   SDValue PH, PL;
2858   if (Subtarget->hasFastFMAF32()) {
2859     const float c_exp = numbers::log2ef;
2860     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
2861     const float c_exp10 = 0x1.a934f0p+1f;
2862     const float cc_exp10 = 0x1.2f346ep-24f;
2863 
2864     SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
2865     SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
2866 
2867     PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
2868     SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
2869     SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
2870     PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
2871   } else {
2872     const float ch_exp = 0x1.714000p+0f;
2873     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
2874 
2875     const float ch_exp10 = 0x1.a92000p+1f;
2876     const float cl_exp10 = 0x1.4f0978p-11f;
2877 
2878     SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
2879     SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
2880 
2881     SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
2882     SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
2883     SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
2884     SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
2885     SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
2886 
2887     PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
2888 
2889     SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
2890     SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
2891     PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
2892   }
2893 
2894   SDValue E = DAG.getNode(ISD::FRINT, SL, VT, PH, Flags);
2895 
2896   // It is unsafe to contract this fsub into the PH multiply.
2897   SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
2898 
2899   SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
2900   SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
2901   SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
2902 
2903   SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
2904 
2905   SDValue UnderflowCheckConst =
2906       DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
2907 
2908   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2909   SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2910   SDValue Underflow =
2911       DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
2912 
2913   R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
2914   const auto &Options = getTargetMachine().Options;
2915 
2916   if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
2917     SDValue OverflowCheckConst =
2918         DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
2919     SDValue Overflow =
2920         DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
2921     SDValue Inf =
2922         DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
2923     R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
2924   }
2925 
2926   return R;
2927 }
2928 
2929 static bool isCtlzOpc(unsigned Opc) {
2930   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2931 }
2932 
2933 static bool isCttzOpc(unsigned Opc) {
2934   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2935 }
2936 
2937 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2938   SDLoc SL(Op);
2939   SDValue Src = Op.getOperand(0);
2940 
2941   assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2942   bool Ctlz = isCtlzOpc(Op.getOpcode());
2943   unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2944 
2945   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2946                    Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2947 
2948   if (Src.getValueType() == MVT::i32) {
    // (ctlz src) -> (umin (ffbh src), 32)
    // (cttz src) -> (umin (ffbl src), 32)
2951     // (ctlz_zero_undef src) -> (ffbh src)
2952     // (cttz_zero_undef src) -> (ffbl src)
2953     SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2954     if (!ZeroUndef) {
2955       const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2956       NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2957     }
2958     return NewOpr;
2959   }
2960 
2961   SDValue Lo, Hi;
2962   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2963 
2964   SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2965   SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2966 
2967   // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2968   // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2969   // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2970   // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
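  //
  // The saturating add keeps the all-zero case from wrapping: ffbh/ffbl return
  // -1 for a zero input, and uaddsat(-1, 32) stays at UINT32_MAX, so the final
  // clamp against 64 still yields the correct count. For the *_ZERO_UNDEF
  // forms a plain add is sufficient, since a zero input is undefined there.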
2971 
2972   unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2973   const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2974   if (Ctlz)
2975     OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2976   else
2977     OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2978 
2979   SDValue NewOpr;
2980   NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2981   if (!ZeroUndef) {
2982     const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2983     NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2984   }
2985 
2986   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2987 }
2988 
2989 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2990                                                bool Signed) const {
  // The regular method of converting a 64-bit integer to a float roughly
  // consists of two steps: normalization and rounding. In fact, after
  // normalization, the conversion from a 64-bit integer to a float is
  // essentially the same as the one from a 32-bit integer. The only difference
  // is that it has more trailing bits to be rounded. To leverage the native
  // 32-bit conversion, a 64-bit integer can be preprocessed into a 32-bit
  // integer and then converted into the correct float number. The basic steps
  // for the unsigned conversion are illustrated in the following pseudo code:
2999   //
3000   // f32 uitofp(i64 u) {
3001   //   i32 hi, lo = split(u);
3002   //   // Only count the leading zeros in hi as we have native support of the
3003   //   // conversion from i32 to f32. If hi is all 0s, the conversion is
3004   //   // reduced to a 32-bit one automatically.
3005   //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3006   //   u <<= shamt;
3007   //   hi, lo = split(u);
3008   //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3009   //   // convert it as a 32-bit integer and scale the result back.
3010   //   return uitofp(hi) * 2^(32 - shamt);
3011   // }
3012   //
  // The signed conversion follows the same principle but uses 'ffbh_i32' to
  // count the leading sign bits instead. If 'ffbh_i32' is not available, the
  // absolute value is converted instead, followed by a negation based on the
  // original sign bit.
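  //
  // For example, u = 2^32 gives hi = 1, lo = 0 and shamt = clz(1) = 31; after
  // the shift hi = 0x80000000 and lo = 0, so no rounding adjustment is made,
  // and uitofp(0x80000000) = 2^31 scaled by 2^(32 - 31) yields 2^32.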
3016 
3017   SDLoc SL(Op);
3018   SDValue Src = Op.getOperand(0);
3019 
3020   SDValue Lo, Hi;
3021   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3022   SDValue Sign;
3023   SDValue ShAmt;
3024   if (Signed && Subtarget->isGCN()) {
3025     // We also need to consider the sign bit in Lo if Hi has just sign bits,
3026     // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3027     // account. That is, the maximal shift is
3028     // - 32 if Lo and Hi have opposite signs;
3029     // - 33 if Lo and Hi have the same sign.
3030     //
3031     // Or, MaxShAmt = 33 + OppositeSign, where
3032     //
3033     // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3034     // - -1 if Lo and Hi have opposite signs; and
3035     // -  0 otherwise.
3036     //
3037     // All in all, ShAmt is calculated as
3038     //
3039     //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3040     //
3041     // or
3042     //
3043     //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3044     //
3045     // to reduce the critical path.
3046     SDValue OppositeSign = DAG.getNode(
3047         ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3048         DAG.getConstant(31, SL, MVT::i32));
3049     SDValue MaxShAmt =
3050         DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3051                     OppositeSign);
3052     // Count the leading sign bits.
3053     ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
    // Unlike the unsigned conversion, the shift amount should be one bit less
    // to preserve the sign bit.
3056     ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3057                         DAG.getConstant(1, SL, MVT::i32));
3058     ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3059   } else {
3060     if (Signed) {
      // Without 'ffbh_i32', only leading zeros can be counted. Take the
      // absolute value first.
3063       Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3064                          DAG.getConstant(63, SL, MVT::i64));
3065       SDValue Abs =
3066           DAG.getNode(ISD::XOR, SL, MVT::i64,
3067                       DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3068       std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3069     }
3070     // Count the leading zeros.
3071     ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3072     // The shift amount for signed integers is [0, 32].
3073   }
3074   // Normalize the given 64-bit integer.
3075   SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3076   // Split it again.
3077   std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3078   // Calculate the adjust bit for rounding.
3079   // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3080   SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3081                                DAG.getConstant(1, SL, MVT::i32), Lo);
3082   // Get the 32-bit normalized integer.
3083   Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3084   // Convert the normalized 32-bit integer into f32.
3085   unsigned Opc =
3086       (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3087   SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3088 
  // Finally, scale the converted floating-point number back, as the original
  // 64-bit integer was converted as a 32-bit one.
3091   ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3092                       ShAmt);
3093   // On GCN, use LDEXP directly.
3094   if (Subtarget->isGCN())
3095     return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3096 
  // Otherwise, align 'ShAmt' with the exponent field and add it to the
  // exponent directly to emulate the multiplication by 2^ShAmt. The 8-bit
  // exponent field is wide enough that this addition cannot overflow into the
  // sign bit.
3100   SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3101                             DAG.getConstant(23, SL, MVT::i32));
3102   SDValue IVal =
3103       DAG.getNode(ISD::ADD, SL, MVT::i32,
3104                   DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3105   if (Signed) {
3106     // Set the sign bit.
3107     Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3108                        DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3109                        DAG.getConstant(31, SL, MVT::i32));
3110     IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3111   }
3112   return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3113 }
3114 
3115 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3116                                                bool Signed) const {
3117   SDLoc SL(Op);
3118   SDValue Src = Op.getOperand(0);
3119 
3120   SDValue Lo, Hi;
3121   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3122 
3123   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3124                               SL, MVT::f64, Hi);
3125 
3126   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3127 
3128   SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3129                               DAG.getConstant(32, SL, MVT::i32));
3130   // TODO: Should this propagate fast-math-flags?
3131   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3132 }
3133 
3134 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3135                                                SelectionDAG &DAG) const {
3136   // TODO: Factor out code common with LowerSINT_TO_FP.
3137   EVT DestVT = Op.getValueType();
3138   SDValue Src = Op.getOperand(0);
3139   EVT SrcVT = Src.getValueType();
3140 
3141   if (SrcVT == MVT::i16) {
3142     if (DestVT == MVT::f16)
3143       return Op;
3144     SDLoc DL(Op);
3145 
3146     // Promote src to i32
3147     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3148     return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3149   }
3150 
3151   assert(SrcVT == MVT::i64 && "operation should be legal");
3152 
3153   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3154     SDLoc DL(Op);
3155 
3156     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3157     SDValue FPRoundFlag =
3158         DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3159     SDValue FPRound =
3160         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3161 
3162     return FPRound;
3163   }
3164 
3165   if (DestVT == MVT::f32)
3166     return LowerINT_TO_FP32(Op, DAG, false);
3167 
3168   assert(DestVT == MVT::f64);
3169   return LowerINT_TO_FP64(Op, DAG, false);
3170 }
3171 
3172 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3173                                               SelectionDAG &DAG) const {
3174   EVT DestVT = Op.getValueType();
3175 
3176   SDValue Src = Op.getOperand(0);
3177   EVT SrcVT = Src.getValueType();
3178 
3179   if (SrcVT == MVT::i16) {
3180     if (DestVT == MVT::f16)
3181       return Op;
3182 
3183     SDLoc DL(Op);
3184     // Promote src to i32
3185     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3186     return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3187   }
3188 
3189   assert(SrcVT == MVT::i64 && "operation should be legal");
3190 
3191   // TODO: Factor out code common with LowerUINT_TO_FP.
3192 
3193   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3194     SDLoc DL(Op);
3195     SDValue Src = Op.getOperand(0);
3196 
3197     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3198     SDValue FPRoundFlag =
3199         DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3200     SDValue FPRound =
3201         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3202 
3203     return FPRound;
3204   }
3205 
3206   if (DestVT == MVT::f32)
3207     return LowerINT_TO_FP32(Op, DAG, true);
3208 
3209   assert(DestVT == MVT::f64);
3210   return LowerINT_TO_FP64(Op, DAG, true);
3211 }
3212 
3213 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3214                                                bool Signed) const {
3215   SDLoc SL(Op);
3216 
3217   SDValue Src = Op.getOperand(0);
3218   EVT SrcVT = Src.getValueType();
3219 
3220   assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3221 
3222   // The basic idea of converting a floating point number into a pair of 32-bit
3223   // integers is illustrated as follows:
3224   //
3225   //     tf := trunc(val);
3226   //    hif := floor(tf * 2^-32);
3227   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
3228   //     hi := fptoi(hif);
3229   //     lo := fptoi(lof);
3230   //
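  // For example, val = 2^40 gives tf = 2^40, hif = floor(2^40 * 2^-32) = 2^8,
  // and lof = 2^40 - 2^8 * 2^32 = 0, so hi = 256 and lo = 0, which reassemble
  // to the i64 value 2^40.
  //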
3231   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3232   SDValue Sign;
3233   if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating-point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we take the absolute value
    // after truncating and flip the result back based on the original sign
    // bit.
3239     Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3240                        DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3241                        DAG.getConstant(31, SL, MVT::i32));
3242     Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3243   }
3244 
3245   SDValue K0, K1;
3246   if (SrcVT == MVT::f64) {
3247     K0 = DAG.getConstantFP(
3248         llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3249         SrcVT);
3250     K1 = DAG.getConstantFP(
3251         llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3252         SrcVT);
3253   } else {
3254     K0 = DAG.getConstantFP(
3255         llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3256     K1 = DAG.getConstantFP(
3257         llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3258   }
3259   // TODO: Should this propagate fast-math-flags?
3260   SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3261 
3262   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3263 
3264   SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3265 
3266   SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3267                                                          : ISD::FP_TO_UINT,
3268                            SL, MVT::i32, FloorMul);
3269   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3270 
3271   SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3272                                DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3273 
3274   if (Signed && SrcVT == MVT::f32) {
3275     assert(Sign);
3276     // Flip the result based on the signedness, which is either all 0s or 1s.
3277     Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3278                        DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3279     // r := xor(r, sign) - sign;
3280     Result =
3281         DAG.getNode(ISD::SUB, SL, MVT::i64,
3282                     DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3283   }
3284 
3285   return Result;
3286 }
3287 
3288 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3289   SDLoc DL(Op);
3290   SDValue N0 = Op.getOperand(0);
3291 
3292   // Convert to target node to get known bits
3293   if (N0.getValueType() == MVT::f32)
3294     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3295 
3296   if (getTargetMachine().Options.UnsafeFPMath) {
3297     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3298     return SDValue();
3299   }
3300 
3301   assert(N0.getSimpleValueType() == MVT::f64);
3302 
3303   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3304   const unsigned ExpMask = 0x7ff;
3305   const unsigned ExpBiasf64 = 1023;
3306   const unsigned ExpBiasf16 = 15;
3307   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3308   SDValue One = DAG.getConstant(1, DL, MVT::i32);
3309   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3310   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3311                            DAG.getConstant(32, DL, MVT::i64));
3312   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3313   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3314   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3315                           DAG.getConstant(20, DL, MVT::i64));
3316   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3317                   DAG.getConstant(ExpMask, DL, MVT::i32));
3318   // Subtract the fp64 exponent bias (1023) to get the real exponent and
3319   // add the f16 bias (15) to get the biased exponent for the f16 format.
3320   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3321                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3322 
3323   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3324                           DAG.getConstant(8, DL, MVT::i32));
3325   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3326                   DAG.getConstant(0xffe, DL, MVT::i32));
3327 
3328   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3329                                   DAG.getConstant(0x1ff, DL, MVT::i32));
3330   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3331 
3332   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3333   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3334 
3335   // (M != 0 ? 0x0200 : 0) | 0x7c00;
3336   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3337       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3338                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3339 
3340   // N = M | (E << 12);
3341   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3342       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3343                   DAG.getConstant(12, DL, MVT::i32)));
3344 
3345   // B = clamp(1-E, 0, 13);
3346   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3347                                   One, E);
3348   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3349   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3350                   DAG.getConstant(13, DL, MVT::i32));
3351 
3352   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3353                                    DAG.getConstant(0x1000, DL, MVT::i32));
3354 
3355   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3356   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3357   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3358   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3359 
3360   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3361   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3362                               DAG.getConstant(0x7, DL, MVT::i32));
3363   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3364                   DAG.getConstant(2, DL, MVT::i32));
3365   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3366                                One, Zero, ISD::SETEQ);
3367   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3368                                One, Zero, ISD::SETGT);
3369   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3370   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3371 
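  // Exponents above 30 overflow the f16 range and become infinity (0x7c00).
  // E == 1039 corresponds to a raw f64 exponent of 0x7ff (2047 - 1023 + 15),
  // i.e. the input was Inf or NaN, so the Inf/NaN pattern I computed above is
  // used instead.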
3372   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3373                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3374   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3375                       I, V, ISD::SETEQ);
3376 
3377   // Extract the sign bit.
3378   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3379                             DAG.getConstant(16, DL, MVT::i32));
3380   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3381                      DAG.getConstant(0x8000, DL, MVT::i32));
3382 
3383   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3384   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3385 }
3386 
3387 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3388                                              SelectionDAG &DAG) const {
3389   SDValue Src = Op.getOperand(0);
3390   unsigned OpOpcode = Op.getOpcode();
3391   EVT SrcVT = Src.getValueType();
3392   EVT DestVT = Op.getValueType();
3393 
3394   // Will be selected natively
3395   if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3396     return Op;
3397 
3398   // Promote i16 to i32
3399   if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3400     SDLoc DL(Op);
3401 
3402     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3403     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3404   }
3405 
3406   if (SrcVT == MVT::f16 ||
3407       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3408     SDLoc DL(Op);
3409 
3410     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3411     unsigned Ext =
3412         OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3413     return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3414   }
3415 
3416   if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
3417     return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3418 
3419   return SDValue();
3420 }
3421 
3422 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3423                                                      SelectionDAG &DAG) const {
3424   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3425   MVT VT = Op.getSimpleValueType();
3426   MVT ScalarVT = VT.getScalarType();
3427 
3428   assert(VT.isVector());
3429 
3430   SDValue Src = Op.getOperand(0);
3431   SDLoc DL(Op);
3432 
3433   // TODO: Don't scalarize on Evergreen?
3434   unsigned NElts = VT.getVectorNumElements();
3435   SmallVector<SDValue, 8> Args;
3436   DAG.ExtractVectorElements(Src, Args, 0, NElts);
3437 
3438   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3439   for (unsigned I = 0; I < NElts; ++I)
3440     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3441 
3442   return DAG.getBuildVector(VT, DL, Args);
3443 }
3444 
3445 //===----------------------------------------------------------------------===//
3446 // Custom DAG optimizations
3447 //===----------------------------------------------------------------------===//
3448 
3449 static bool isU24(SDValue Op, SelectionDAG &DAG) {
3450   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3451 }
3452 
3453 static bool isI24(SDValue Op, SelectionDAG &DAG) {
3454   EVT VT = Op.getValueType();
3455   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3456                                      // as unsigned 24-bit values.
3457          AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3458 }
3459 
3460 static SDValue simplifyMul24(SDNode *Node24,
3461                              TargetLowering::DAGCombinerInfo &DCI) {
3462   SelectionDAG &DAG = DCI.DAG;
3463   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3464   bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3465 
3466   SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3467   SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3468   unsigned NewOpcode = Node24->getOpcode();
3469   if (IsIntrin) {
3470     unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
3471     switch (IID) {
3472     case Intrinsic::amdgcn_mul_i24:
3473       NewOpcode = AMDGPUISD::MUL_I24;
3474       break;
3475     case Intrinsic::amdgcn_mul_u24:
3476       NewOpcode = AMDGPUISD::MUL_U24;
3477       break;
3478     case Intrinsic::amdgcn_mulhi_i24:
3479       NewOpcode = AMDGPUISD::MULHI_I24;
3480       break;
3481     case Intrinsic::amdgcn_mulhi_u24:
3482       NewOpcode = AMDGPUISD::MULHI_U24;
3483       break;
3484     default:
3485       llvm_unreachable("Expected 24-bit mul intrinsic");
3486     }
3487   }
3488 
3489   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3490 
3491   // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3492   // the operands to have other uses, but will only perform simplifications that
3493   // involve bypassing some nodes for this user.
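  // For example, (mul_u24 (and x, 0xffffff), y) can use x directly, since only
  // the low 24 bits of each operand are consumed.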
3494   SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3495   SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3496   if (DemandedLHS || DemandedRHS)
3497     return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3498                        DemandedLHS ? DemandedLHS : LHS,
3499                        DemandedRHS ? DemandedRHS : RHS);
3500 
3501   // Now try SimplifyDemandedBits which can simplify the nodes used by our
3502   // operands if this node is the only user.
3503   if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3504     return SDValue(Node24, 0);
3505   if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3506     return SDValue(Node24, 0);
3507 
3508   return SDValue();
3509 }
3510 
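// Constant fold a bitfield extract of 'Width' bits at bit offset 'Offset'
// from 'Src0', sign- or zero-extending the field according to IntTy. For
// example, constantFoldBFE<int32_t>(DAG, 0xff00, 8, 8, DL) extracts the 0xff
// field and sign-extends it to -1.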
3511 template <typename IntTy>
3512 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3513                                uint32_t Width, const SDLoc &DL) {
3514   if (Width + Offset < 32) {
3515     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3516     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3517     return DAG.getConstant(Result, DL, MVT::i32);
3518   }
3519 
3520   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3521 }
3522 
3523 static bool hasVolatileUser(SDNode *Val) {
3524   for (SDNode *U : Val->uses()) {
3525     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3526       if (M->isVolatile())
3527         return true;
3528     }
3529   }
3530 
3531   return false;
3532 }
3533 
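// Return true if a load or store of type VT should be rewritten as an access
// of the equivalent integer type from getEquivalentMemType. i32-based and
// already-legal types are left alone, as are non-byte-sized types and odd
// sizes that have no 32-bit equivalent.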
3534 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3535   // i32 vectors are the canonical memory type.
3536   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3537     return false;
3538 
3539   if (!VT.isByteSized())
3540     return false;
3541 
3542   unsigned Size = VT.getStoreSize();
3543 
3544   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3545     return false;
3546 
3547   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3548     return false;
3549 
3550   return true;
3551 }
3552 
// Replace a load of an illegal type with a load of a bitcast to a friendlier
// type.
3555 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3556                                                  DAGCombinerInfo &DCI) const {
3557   if (!DCI.isBeforeLegalize())
3558     return SDValue();
3559 
3560   LoadSDNode *LN = cast<LoadSDNode>(N);
3561   if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3562     return SDValue();
3563 
3564   SDLoc SL(N);
3565   SelectionDAG &DAG = DCI.DAG;
3566   EVT VT = LN->getMemoryVT();
3567 
3568   unsigned Size = VT.getStoreSize();
3569   Align Alignment = LN->getAlign();
3570   if (Alignment < Size && isTypeLegal(VT)) {
3571     unsigned IsFast;
3572     unsigned AS = LN->getAddressSpace();
3573 
3574     // Expand unaligned loads earlier than legalization. Due to visitation order
3575     // problems during legalization, the emitted instructions to pack and unpack
3576     // the bytes again are not eliminated in the case of an unaligned copy.
3577     if (!allowsMisalignedMemoryAccesses(
3578             VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3579       if (VT.isVector())
3580         return SplitVectorLoad(SDValue(LN, 0), DAG);
3581 
3582       SDValue Ops[2];
3583       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3584 
3585       return DAG.getMergeValues(Ops, SDLoc(N));
3586     }
3587 
3588     if (!IsFast)
3589       return SDValue();
3590   }
3591 
3592   if (!shouldCombineMemoryType(VT))
3593     return SDValue();
3594 
3595   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3596 
3597   SDValue NewLoad
3598     = DAG.getLoad(NewVT, SL, LN->getChain(),
3599                   LN->getBasePtr(), LN->getMemOperand());
3600 
3601   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3602   DCI.CombineTo(N, BC, NewLoad.getValue(1));
3603   return SDValue(N, 0);
3604 }
3605 
3606 // Replace store of an illegal type with a store of a bitcast to a friendlier
3607 // type.
3608 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3609                                                   DAGCombinerInfo &DCI) const {
3610   if (!DCI.isBeforeLegalize())
3611     return SDValue();
3612 
3613   StoreSDNode *SN = cast<StoreSDNode>(N);
3614   if (!SN->isSimple() || !ISD::isNormalStore(SN))
3615     return SDValue();
3616 
3617   EVT VT = SN->getMemoryVT();
3618   unsigned Size = VT.getStoreSize();
3619 
3620   SDLoc SL(N);
3621   SelectionDAG &DAG = DCI.DAG;
3622   Align Alignment = SN->getAlign();
3623   if (Alignment < Size && isTypeLegal(VT)) {
3624     unsigned IsFast;
3625     unsigned AS = SN->getAddressSpace();
3626 
3627     // Expand unaligned stores earlier than legalization. Due to visitation
3628     // order problems during legalization, the emitted instructions to pack and
3629     // unpack the bytes again are not eliminated in the case of an unaligned
3630     // copy.
3631     if (!allowsMisalignedMemoryAccesses(
3632             VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3633       if (VT.isVector())
3634         return SplitVectorStore(SDValue(SN, 0), DAG);
3635 
3636       return expandUnalignedStore(SN, DAG);
3637     }
3638 
3639     if (!IsFast)
3640       return SDValue();
3641   }
3642 
3643   if (!shouldCombineMemoryType(VT))
3644     return SDValue();
3645 
3646   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3647   SDValue Val = SN->getValue();
3648 
3649   //DCI.AddToWorklist(Val.getNode());
3650 
3651   bool OtherUses = !Val.hasOneUse();
3652   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3653   if (OtherUses) {
3654     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3655     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3656   }
3657 
3658   return DAG.getStore(SN->getChain(), SL, CastVal,
3659                       SN->getBasePtr(), SN->getMemOperand());
3660 }
3661 
3662 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3663 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3664 // issues.
3665 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3666                                                         DAGCombinerInfo &DCI) const {
3667   SelectionDAG &DAG = DCI.DAG;
3668   SDValue N0 = N->getOperand(0);
3669 
3670   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3671   //     (vt2 (truncate (assertzext vt0:x, vt1)))
3672   if (N0.getOpcode() == ISD::TRUNCATE) {
3673     SDValue N1 = N->getOperand(1);
3674     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3675     SDLoc SL(N);
3676 
3677     SDValue Src = N0.getOperand(0);
3678     EVT SrcVT = Src.getValueType();
3679     if (SrcVT.bitsGE(ExtVT)) {
3680       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3681       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3682     }
3683   }
3684 
3685   return SDValue();
3686 }
3687 
3688 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3689   SDNode *N, DAGCombinerInfo &DCI) const {
3690   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3691   switch (IID) {
3692   case Intrinsic::amdgcn_mul_i24:
3693   case Intrinsic::amdgcn_mul_u24:
3694   case Intrinsic::amdgcn_mulhi_i24:
3695   case Intrinsic::amdgcn_mulhi_u24:
3696     return simplifyMul24(N, DCI);
3697   case Intrinsic::amdgcn_fract:
3698   case Intrinsic::amdgcn_rsq:
3699   case Intrinsic::amdgcn_rcp_legacy:
3700   case Intrinsic::amdgcn_rsq_legacy:
3701   case Intrinsic::amdgcn_rsq_clamp:
3702   case Intrinsic::amdgcn_ldexp: {
3703     // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3704     SDValue Src = N->getOperand(1);
3705     return Src.isUndef() ? Src : SDValue();
3706   }
3707   case Intrinsic::amdgcn_frexp_exp: {
3708     // frexp_exp (fneg x) -> frexp_exp x
3709     // frexp_exp (fabs x) -> frexp_exp x
3710     // frexp_exp (fneg (fabs x)) -> frexp_exp x
3711     SDValue Src = N->getOperand(1);
3712     SDValue PeekSign = peekFPSignOps(Src);
3713     if (PeekSign == Src)
3714       return SDValue();
3715     return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3716                    0);
3717   }
3718   default:
3719     return SDValue();
3720   }
3721 }
3722 
/// Split the 64-bit value \p LHS into two 32-bit components, and apply the
/// binary operation \p Opc to each half with the corresponding 32-bit constant
/// operand (\p ValLo and \p ValHi).
3725 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3726   DAGCombinerInfo &DCI, const SDLoc &SL,
3727   unsigned Opc, SDValue LHS,
3728   uint32_t ValLo, uint32_t ValHi) const {
3729   SelectionDAG &DAG = DCI.DAG;
3730   SDValue Lo, Hi;
3731   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3732 
3733   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3734   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3735 
3736   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3737   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3738 
3739   // Re-visit the ands. It's possible we eliminated one of them and it could
3740   // simplify the vector.
3741   DCI.AddToWorklist(Lo.getNode());
3742   DCI.AddToWorklist(Hi.getNode());
3743 
3744   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3745   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3746 }
3747 
3748 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3749                                                 DAGCombinerInfo &DCI) const {
3750   EVT VT = N->getValueType(0);
3751 
3752   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3753   if (!RHS)
3754     return SDValue();
3755 
3756   SDValue LHS = N->getOperand(0);
3757   unsigned RHSVal = RHS->getZExtValue();
3758   if (!RHSVal)
3759     return LHS;
3760 
3761   SDLoc SL(N);
3762   SelectionDAG &DAG = DCI.DAG;
3763 
3764   switch (LHS->getOpcode()) {
3765   default:
3766     break;
3767   case ISD::ZERO_EXTEND:
3768   case ISD::SIGN_EXTEND:
3769   case ISD::ANY_EXTEND: {
3770     SDValue X = LHS->getOperand(0);
3771 
3772     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3773         isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3774       // Prefer build_vector as the canonical form if packed types are legal.
      // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3776       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3777        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3778       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3779     }
3780 
3781     // shl (ext x) => zext (shl x), if shift does not overflow int
3782     if (VT != MVT::i64)
3783       break;
3784     KnownBits Known = DAG.computeKnownBits(X);
3785     unsigned LZ = Known.countMinLeadingZeros();
3786     if (LZ < RHSVal)
3787       break;
3788     EVT XVT = X.getValueType();
3789     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3790     return DAG.getZExtOrTrunc(Shl, SL, VT);
3791   }
3792   }
3793 
3794   if (VT != MVT::i64)
3795     return SDValue();
3796 
  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
3798 
3799   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3800   // common case, splitting this into a move and a 32-bit shift is faster and
3801   // the same code size.
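  //
  // For example, (shl i64:x, 40) becomes a build_vector of 0 and
  // (shl (trunc x to i32), 8): the low 32 bits of the result are zero and the
  // high half carries the shifted low bits of x.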
3802   if (RHSVal < 32)
3803     return SDValue();
3804 
3805   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3806 
3807   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3808   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3809 
3810   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3811 
3812   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3813   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3814 }
3815 
3816 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3817                                                 DAGCombinerInfo &DCI) const {
3818   if (N->getValueType(0) != MVT::i64)
3819     return SDValue();
3820 
3821   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3822   if (!RHS)
3823     return SDValue();
3824 
3825   SelectionDAG &DAG = DCI.DAG;
3826   SDLoc SL(N);
3827   unsigned RHSVal = RHS->getZExtValue();
3828 
3829   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3830   if (RHSVal == 32) {
3831     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3832     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3833                                    DAG.getConstant(31, SL, MVT::i32));
3834 
3835     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3836     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3837   }
3838 
3839   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3840   if (RHSVal == 63) {
3841     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3842     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3843                                    DAG.getConstant(31, SL, MVT::i32));
3844     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3845     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3846   }
3847 
3848   return SDValue();
3849 }
3850 
3851 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3852                                                 DAGCombinerInfo &DCI) const {
3853   auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3854   if (!RHS)
3855     return SDValue();
3856 
3857   EVT VT = N->getValueType(0);
3858   SDValue LHS = N->getOperand(0);
3859   unsigned ShiftAmt = RHS->getZExtValue();
3860   SelectionDAG &DAG = DCI.DAG;
3861   SDLoc SL(N);
3862 
  // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
  // This improves the ability to match BFE patterns in isel.
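  // For example, (srl (and x, 0xff0), 4) becomes (and (srl x, 4), 0xff), which
  // matches a BFE of 8 bits at offset 4.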
3865   if (LHS.getOpcode() == ISD::AND) {
3866     if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3867       unsigned MaskIdx, MaskLen;
3868       if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3869           MaskIdx == ShiftAmt) {
3870         return DAG.getNode(
3871             ISD::AND, SL, VT,
3872             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3873             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3874       }
3875     }
3876   }
3877 
3878   if (VT != MVT::i64)
3879     return SDValue();
3880 
3881   if (ShiftAmt < 32)
3882     return SDValue();
3883 
3884   // srl i64:x, C for C >= 32
3885   // =>
3886   //   build_pair (srl hi_32(x), C - 32), 0
3887   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3888 
3889   SDValue Hi = getHiHalf64(LHS, DAG);
3890 
3891   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3892   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3893 
3894   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3895 
3896   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3897 }
3898 
3899 SDValue AMDGPUTargetLowering::performTruncateCombine(
3900   SDNode *N, DAGCombinerInfo &DCI) const {
3901   SDLoc SL(N);
3902   SelectionDAG &DAG = DCI.DAG;
3903   EVT VT = N->getValueType(0);
3904   SDValue Src = N->getOperand(0);
3905 
3906   // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3907   if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3908     SDValue Vec = Src.getOperand(0);
3909     if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3910       SDValue Elt0 = Vec.getOperand(0);
3911       EVT EltVT = Elt0.getValueType();
3912       if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3913         if (EltVT.isFloatingPoint()) {
3914           Elt0 = DAG.getNode(ISD::BITCAST, SL,
3915                              EltVT.changeTypeToInteger(), Elt0);
3916         }
3917 
3918         return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3919       }
3920     }
3921   }
3922 
3923   // Equivalent of above for accessing the high element of a vector as an
3924   // integer operation.
  // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
3926   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3927     if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3928       if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3929         SDValue BV = stripBitcast(Src.getOperand(0));
3930         if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3931             BV.getValueType().getVectorNumElements() == 2) {
3932           SDValue SrcElt = BV.getOperand(1);
3933           EVT SrcEltVT = SrcElt.getValueType();
3934           if (SrcEltVT.isFloatingPoint()) {
3935             SrcElt = DAG.getNode(ISD::BITCAST, SL,
3936                                  SrcEltVT.changeTypeToInteger(), SrcElt);
3937           }
3938 
3939           return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3940         }
3941       }
3942     }
3943   }
3944 
3945   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3946   //
3947   // i16 (trunc (srl i64:x, K)), K <= 16 ->
3948   //     i16 (trunc (srl (i32 (trunc x), K)))
3949   if (VT.getScalarSizeInBits() < 32) {
3950     EVT SrcVT = Src.getValueType();
3951     if (SrcVT.getScalarSizeInBits() > 32 &&
3952         (Src.getOpcode() == ISD::SRL ||
3953          Src.getOpcode() == ISD::SRA ||
3954          Src.getOpcode() == ISD::SHL)) {
3955       SDValue Amt = Src.getOperand(1);
3956       KnownBits Known = DAG.computeKnownBits(Amt);
3957 
3958       // - For left shifts, do the transform as long as the shift
3959       //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
3960       // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
3961       //   losing information stored in the high bits when truncating.
3962       const unsigned MaxCstSize =
3963           (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
3964       if (Known.getMaxValue().ule(MaxCstSize)) {
3965         EVT MidVT = VT.isVector() ?
3966           EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3967                            VT.getVectorNumElements()) : MVT::i32;
3968 
3969         EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3970         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3971                                     Src.getOperand(0));
3972         DCI.AddToWorklist(Trunc.getNode());
3973 
3974         if (Amt.getValueType() != NewShiftVT) {
3975           Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3976           DCI.AddToWorklist(Amt.getNode());
3977         }
3978 
3979         SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3980                                           Trunc, Amt);
3981         return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3982       }
3983     }
3984   }
3985 
3986   return SDValue();
3987 }
3988 
3989 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3990 // instructions. If we only match on the legalized i64 mul expansion,
3991 // SimplifyDemandedBits will be unable to remove them because there will be
3992 // multiple uses due to the separate mul + mulh[su].
3993 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3994                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3995   if (Size <= 32) {
3996     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3997     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3998   }
3999 
4000   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4001   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4002 
4003   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4004   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4005 
4006   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4007 }
4008 
/// If \p V is an add of a constant 1, return the other operand. Otherwise
/// return SDValue().
4011 static SDValue getAddOneOp(const SDNode *V) {
4012   if (V->getOpcode() != ISD::ADD)
4013     return SDValue();
4014 
4015   auto *C = dyn_cast<ConstantSDNode>(V->getOperand(1));
4016   return C && C->isOne() ? V->getOperand(0) : SDValue();
4017 }
4018 
4019 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4020                                                 DAGCombinerInfo &DCI) const {
4021   EVT VT = N->getValueType(0);
4022 
4023   // Don't generate 24-bit multiplies on values that are in SGPRs, since
4024   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4025   // unnecessarily). isDivergent() is used as an approximation of whether the
4026   // value is in an SGPR.
4027   if (!N->isDivergent())
4028     return SDValue();
4029 
4030   unsigned Size = VT.getSizeInBits();
4031   if (VT.isVector() || Size > 64)
4032     return SDValue();
4033 
4034   SelectionDAG &DAG = DCI.DAG;
4035   SDLoc DL(N);
4036 
4037   SDValue N0 = N->getOperand(0);
4038   SDValue N1 = N->getOperand(1);
4039 
4040   // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4041   // matching.
4042 
4043   // mul x, (add y, 1) -> add (mul x, y), x
4044   auto IsFoldableAdd = [](SDValue V) -> SDValue {
4045     SDValue AddOp = getAddOneOp(V.getNode());
4046     if (!AddOp)
4047       return SDValue();
4048 
4049     if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4050           return U->getOpcode() == ISD::MUL;
4051         }))
4052       return AddOp;
4053 
4054     return SDValue();
4055   };
4056 
4057   // FIXME: The selection pattern does not properly check for commuted
4058   // operands, so we have to place the mul in the LHS.
4059   if (SDValue MulOper = IsFoldableAdd(N0)) {
4060     SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4061     return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4062   }
4063 
4064   if (SDValue MulOper = IsFoldableAdd(N1)) {
4065     SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4066     return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4067   }
4068 
4069   // Skip if already mul24.
4070   if (N->getOpcode() != ISD::MUL)
4071     return SDValue();
4072 
4073   // There are native i16 integer mul/mad instructions; don't use mul24 for them.
4074   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4075     return SDValue();
4076 
4077   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4078   // in the source into any_extends if the result of the mul is truncated. Since
4079   // we can assume the high bits are whatever we want, use the underlying value
4080   // to keep the unknown high bits from interfering.
4081   if (N0.getOpcode() == ISD::ANY_EXTEND)
4082     N0 = N0.getOperand(0);
4083 
4084   if (N1.getOpcode() == ISD::ANY_EXTEND)
4085     N1 = N1.getOperand(0);
4086 
4087   SDValue Mul;
4088 
4089   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4090     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4091     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4092     Mul = getMul24(DAG, DL, N0, N1, Size, false);
4093   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4094     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4095     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4096     Mul = getMul24(DAG, DL, N0, N1, Size, true);
4097   } else {
4098     return SDValue();
4099   }
4100 
4101   // We need to use sext even for MUL_U24, because MUL_U24 is used
4102   // for signed multiplies of 8- and 16-bit types.
4103   return DAG.getSExtOrTrunc(Mul, DL, VT);
4104 }
4105 
4106 SDValue
4107 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4108                                             DAGCombinerInfo &DCI) const {
4109   if (N->getValueType(0) != MVT::i32)
4110     return SDValue();
4111 
4112   SelectionDAG &DAG = DCI.DAG;
4113   SDLoc DL(N);
4114 
4115   SDValue N0 = N->getOperand(0);
4116   SDValue N1 = N->getOperand(1);
4117 
4118   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4119   // in the source into any_extends if the result of the mul is truncated. Since
4120   // we can assume the high bits are whatever we want, use the underlying value
4121   // to keep the unknown high bits from interfering.
4122   if (N0.getOpcode() == ISD::ANY_EXTEND)
4123     N0 = N0.getOperand(0);
4124   if (N1.getOpcode() == ISD::ANY_EXTEND)
4125     N1 = N1.getOperand(0);
4126 
4127   // Try to use two fast 24-bit multiplies (one for each half of the result)
4128   // instead of one slow extending multiply.
4129   unsigned LoOpcode, HiOpcode;
4130   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4131     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4132     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4133     LoOpcode = AMDGPUISD::MUL_U24;
4134     HiOpcode = AMDGPUISD::MULHI_U24;
4135   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4136     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4137     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4138     LoOpcode = AMDGPUISD::MUL_I24;
4139     HiOpcode = AMDGPUISD::MULHI_I24;
4140   } else {
4141     return SDValue();
4142   }
4143 
4144   SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4145   SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
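       // Replace both the low and high results of the [SU]MUL_LOHI at once;
       // returning the original node tells the combiner it has been handled.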
4146   DCI.CombineTo(N, Lo, Hi);
4147   return SDValue(N, 0);
4148 }
4149 
4150 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4151                                                   DAGCombinerInfo &DCI) const {
4152   EVT VT = N->getValueType(0);
4153 
4154   if (!Subtarget->hasMulI24() || VT.isVector())
4155     return SDValue();
4156 
4157   // Don't generate 24-bit multiplies on values that are in SGPRs, since
4158   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4159   // unnecessarily). isDivergent() is used as an approximation of whether the
4160   // value is in an SGPR.
4161   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4162   // VALU op anyway).
4163   if (Subtarget->hasSMulHi() && !N->isDivergent())
4164     return SDValue();
4165 
4166   SelectionDAG &DAG = DCI.DAG;
4167   SDLoc DL(N);
4168 
4169   SDValue N0 = N->getOperand(0);
4170   SDValue N1 = N->getOperand(1);
4171 
4172   if (!isI24(N0, DAG) || !isI24(N1, DAG))
4173     return SDValue();
4174 
4175   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4176   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4177 
4178   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4179   DCI.AddToWorklist(Mulhi.getNode());
4180   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4181 }
4182 
4183 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4184                                                   DAGCombinerInfo &DCI) const {
4185   EVT VT = N->getValueType(0);
4186 
4187   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4188     return SDValue();
4189 
4190   // Don't generate 24-bit multiplies on values that are in SGPRs, since
4191   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4192   // unnecessarily). isDivergent() is used as an approximation of whether the
4193   // value is in an SGPR.
4194   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4195   // VALU op anyway).
4196   if (Subtarget->hasSMulHi() && !N->isDivergent())
4197     return SDValue();
4198 
4199   SelectionDAG &DAG = DCI.DAG;
4200   SDLoc DL(N);
4201 
4202   SDValue N0 = N->getOperand(0);
4203   SDValue N1 = N->getOperand(1);
4204 
4205   if (!isU24(N0, DAG) || !isU24(N1, DAG))
4206     return SDValue();
4207 
4208   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4209   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4210 
4211   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4212   DCI.AddToWorklist(Mulhi.getNode());
4213   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4214 }
4215 
4216 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4217                                           SDValue Op,
4218                                           const SDLoc &DL,
4219                                           unsigned Opc) const {
4220   EVT VT = Op.getValueType();
4221   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4222   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4223                               LegalVT != MVT::i16))
4224     return SDValue();
4225 
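       // The FFBH/FFBL nodes operate on i32, so promote a narrower input and
       // truncate the result back to the original type.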
4226   if (VT != MVT::i32)
4227     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4228 
4229   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4230   if (VT != MVT::i32)
4231     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4232 
4233   return FFBX;
4234 }
4235 
4236 // The native instructions return -1 on 0 input. Optimize out a select that
4237 // produces -1 on 0.
4238 //
4239 // TODO: If zero is not undef, we could also do this if the output is compared
4240 // against the bitwidth.
4241 //
4242 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4243 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4244                                                  SDValue LHS, SDValue RHS,
4245                                                  DAGCombinerInfo &DCI) const {
4246   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4247   if (!CmpRhs || !CmpRhs->isZero())
4248     return SDValue();
4249 
4250   SelectionDAG &DAG = DCI.DAG;
4251   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4252   SDValue CmpLHS = Cond.getOperand(0);
4253 
4254   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4255   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4256   if (CCOpcode == ISD::SETEQ &&
4257       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4258       RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4259     unsigned Opc =
4260         isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4261     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4262   }
4263 
4264   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4265   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4266   if (CCOpcode == ISD::SETNE &&
4267       (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4268       LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4269     unsigned Opc =
4270         isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4271 
4272     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4273   }
4274 
4275   return SDValue();
4276 }
4277 
4278 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4279                                          unsigned Op,
4280                                          const SDLoc &SL,
4281                                          SDValue Cond,
4282                                          SDValue N1,
4283                                          SDValue N2) {
4284   SelectionDAG &DAG = DCI.DAG;
4285   EVT VT = N1.getValueType();
4286 
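       // Both select operands carry the same free FP operation, so select
       // between their sources and apply the operation once to the result.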
4287   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4288                                   N1.getOperand(0), N2.getOperand(0));
4289   DCI.AddToWorklist(NewSelect.getNode());
4290   return DAG.getNode(Op, SL, VT, NewSelect);
4291 }
4292 
4293 // Pull a free FP operation out of a select so it may fold into uses.
4294 //
4295 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4296 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
4297 //
4298 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4299 // select c, (fabs x), +k -> fabs (select c, x, k)
4300 SDValue
4301 AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4302                                            SDValue N) const {
4303   SelectionDAG &DAG = DCI.DAG;
4304   SDValue Cond = N.getOperand(0);
4305   SDValue LHS = N.getOperand(1);
4306   SDValue RHS = N.getOperand(2);
4307 
4308   EVT VT = N.getValueType();
4309   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4310       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4311     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4312       return SDValue();
4313 
4314     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4315                                      SDLoc(N), Cond, LHS, RHS);
4316   }
4317 
4318   bool Inv = false;
4319   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4320     std::swap(LHS, RHS);
4321     Inv = true;
4322   }
4323 
4324   // TODO: Support vector constants.
4325   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4326   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4327       !selectSupportsSourceMods(N.getNode())) {
4328     SDLoc SL(N);
4329     // If one side is an fneg/fabs and the other is a constant, we can push the
4330     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4331     SDValue NewLHS = LHS.getOperand(0);
4332     SDValue NewRHS = RHS;
4333 
4334     // Careful: if the neg can be folded up, don't try to pull it back down.
4335     bool ShouldFoldNeg = true;
4336 
4337     if (NewLHS.hasOneUse()) {
4338       unsigned Opc = NewLHS.getOpcode();
4339       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4340         ShouldFoldNeg = false;
4341       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4342         ShouldFoldNeg = false;
4343     }
4344 
4345     if (ShouldFoldNeg) {
4346       if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4347         return SDValue();
4348 
4349       // We're going to be forced to use a source modifier anyway, there's no
4350       // point to pulling the negate out unless we can get a size reduction by
4351       // negating the constant.
4352       //
4353       // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4354       // about cheaper constants.
4355       if (NewLHS.getOpcode() == ISD::FABS &&
4356           getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4357         return SDValue();
4358 
4359       if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4360         return SDValue();
4361 
4362       if (LHS.getOpcode() == ISD::FNEG)
4363         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4364 
4365       if (Inv)
4366         std::swap(NewLHS, NewRHS);
4367 
4368       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4369                                       Cond, NewLHS, NewRHS);
4370       DCI.AddToWorklist(NewSelect.getNode());
4371       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4372     }
4373   }
4374 
4375   return SDValue();
4376 }
4377 
4378 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4379                                                    DAGCombinerInfo &DCI) const {
4380   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4381     return Folded;
4382 
4383   SDValue Cond = N->getOperand(0);
4384   if (Cond.getOpcode() != ISD::SETCC)
4385     return SDValue();
4386 
4387   EVT VT = N->getValueType(0);
4388   SDValue LHS = Cond.getOperand(0);
4389   SDValue RHS = Cond.getOperand(1);
4390   SDValue CC = Cond.getOperand(2);
4391 
4392   SDValue True = N->getOperand(1);
4393   SDValue False = N->getOperand(2);
4394 
4395   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4396     SelectionDAG &DAG = DCI.DAG;
4397     if (DAG.isConstantValueOfAnyType(True) &&
4398         !DAG.isConstantValueOfAnyType(False)) {
4399       // Swap cmp + select pair to move constant to false input.
4400       // This will allow using VOPC cndmasks more often.
4401       // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4402 
4403       SDLoc SL(N);
4404       ISD::CondCode NewCC =
4405           getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4406 
4407       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4408       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4409     }
4410 
4411     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4412       SDValue MinMax
4413         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4414       // Revisit this node so we can catch min3/max3/med3 patterns.
4415       //DCI.AddToWorklist(MinMax.getNode());
4416       return MinMax;
4417     }
4418   }
4419 
4420   // There's no reason to not do this if the condition has other uses.
4421   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4422 }
4423 
4424 static bool isInv2Pi(const APFloat &APF) {
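       // Bit patterns of 1.0 / (2.0 * pi) in half, single and double precision.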
4425   static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4426   static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4427   static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4428 
4429   return APF.bitwiseIsEqual(KF16) ||
4430          APF.bitwiseIsEqual(KF32) ||
4431          APF.bitwiseIsEqual(KF64);
4432 }
4433 
4434 // The negations of 0 and 1.0 / (2.0 * pi) do not have inline immediates, so
4435 // there is an additional cost to negate those values.
4436 TargetLowering::NegatibleCost
4437 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4438   if (C->isZero())
4439     return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4440 
4441   if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4442     return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4443 
4444   return NegatibleCost::Neutral;
4445 }
4446 
4447 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4448   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4449     return getConstantNegateCost(C) == NegatibleCost::Expensive;
4450   return false;
4451 }
4452 
4453 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4454   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4455     return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4456   return false;
4457 }
4458 
4459 static unsigned inverseMinMax(unsigned Opc) {
4460   switch (Opc) {
4461   case ISD::FMAXNUM:
4462     return ISD::FMINNUM;
4463   case ISD::FMINNUM:
4464     return ISD::FMAXNUM;
4465   case ISD::FMAXNUM_IEEE:
4466     return ISD::FMINNUM_IEEE;
4467   case ISD::FMINNUM_IEEE:
4468     return ISD::FMAXNUM_IEEE;
4469   case AMDGPUISD::FMAX_LEGACY:
4470     return AMDGPUISD::FMIN_LEGACY;
4471   case AMDGPUISD::FMIN_LEGACY:
4472     return AMDGPUISD::FMAX_LEGACY;
4473   default:
4474     llvm_unreachable("invalid min/max opcode");
4475   }
4476 }
4477 
4478 /// \return true if it's profitable to try to push an fneg into its source
4479 /// instruction.
4480 bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4481   // If the input has multiple uses and we can either fold the negate down, or
4482   // the other uses cannot, give up. This both prevents unprofitable
4483   // transformations and infinite loops: we won't repeatedly try to fold around
4484   // a negate that has no 'good' form.
4485   if (N0.hasOneUse()) {
4486     // This may be able to fold into the source, but at a code size cost. Don't
4487     // fold if the fold into the user is free.
4488     if (allUsesHaveSourceMods(N, 0))
4489       return false;
4490   } else {
4491     if (fnegFoldsIntoOp(N0.getNode()) &&
4492         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4493       return false;
4494   }
4495 
4496   return true;
4497 }
4498 
4499 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4500                                                  DAGCombinerInfo &DCI) const {
4501   SelectionDAG &DAG = DCI.DAG;
4502   SDValue N0 = N->getOperand(0);
4503   EVT VT = N->getValueType(0);
4504 
4505   unsigned Opc = N0.getOpcode();
4506 
4507   if (!shouldFoldFNegIntoSrc(N, N0))
4508     return SDValue();
4509 
4510   SDLoc SL(N);
4511   switch (Opc) {
4512   case ISD::FADD: {
4513     if (!mayIgnoreSignedZero(N0))
4514       return SDValue();
4515 
4516     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4517     SDValue LHS = N0.getOperand(0);
4518     SDValue RHS = N0.getOperand(1);
4519 
4520     if (LHS.getOpcode() != ISD::FNEG)
4521       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4522     else
4523       LHS = LHS.getOperand(0);
4524 
4525     if (RHS.getOpcode() != ISD::FNEG)
4526       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4527     else
4528       RHS = RHS.getOperand(0);
4529 
4530     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4531     if (Res.getOpcode() != ISD::FADD)
4532       return SDValue(); // Op got folded away.
4533     if (!N0.hasOneUse())
4534       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4535     return Res;
4536   }
4537   case ISD::FMUL:
4538   case AMDGPUISD::FMUL_LEGACY: {
4539     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4540     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4541     SDValue LHS = N0.getOperand(0);
4542     SDValue RHS = N0.getOperand(1);
4543 
4544     if (LHS.getOpcode() == ISD::FNEG)
4545       LHS = LHS.getOperand(0);
4546     else if (RHS.getOpcode() == ISD::FNEG)
4547       RHS = RHS.getOperand(0);
4548     else
4549       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4550 
4551     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4552     if (Res.getOpcode() != Opc)
4553       return SDValue(); // Op got folded away.
4554     if (!N0.hasOneUse())
4555       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4556     return Res;
4557   }
4558   case ISD::FMA:
4559   case ISD::FMAD: {
4560     // TODO: handle llvm.amdgcn.fma.legacy
4561     if (!mayIgnoreSignedZero(N0))
4562       return SDValue();
4563 
4564     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4565     SDValue LHS = N0.getOperand(0);
4566     SDValue MHS = N0.getOperand(1);
4567     SDValue RHS = N0.getOperand(2);
4568 
4569     if (LHS.getOpcode() == ISD::FNEG)
4570       LHS = LHS.getOperand(0);
4571     else if (MHS.getOpcode() == ISD::FNEG)
4572       MHS = MHS.getOperand(0);
4573     else
4574       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4575 
4576     if (RHS.getOpcode() != ISD::FNEG)
4577       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4578     else
4579       RHS = RHS.getOperand(0);
4580 
4581     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4582     if (Res.getOpcode() != Opc)
4583       return SDValue(); // Op got folded away.
4584     if (!N0.hasOneUse())
4585       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4586     return Res;
4587   }
4588   case ISD::FMAXNUM:
4589   case ISD::FMINNUM:
4590   case ISD::FMAXNUM_IEEE:
4591   case ISD::FMINNUM_IEEE:
4592   case AMDGPUISD::FMAX_LEGACY:
4593   case AMDGPUISD::FMIN_LEGACY: {
4594     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4595     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4596     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4597     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4598 
4599     SDValue LHS = N0.getOperand(0);
4600     SDValue RHS = N0.getOperand(1);
4601 
4602     // 0 doesn't have a negated inline immediate.
4603     // TODO: This constant check should be generalized to other operations.
4604     if (isConstantCostlierToNegate(RHS))
4605       return SDValue();
4606 
4607     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4608     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4609     unsigned Opposite = inverseMinMax(Opc);
4610 
4611     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4612     if (Res.getOpcode() != Opposite)
4613       return SDValue(); // Op got folded away.
4614     if (!N0.hasOneUse())
4615       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4616     return Res;
4617   }
4618   case AMDGPUISD::FMED3: {
4619     SDValue Ops[3];
4620     for (unsigned I = 0; I < 3; ++I)
4621       Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4622 
4623     SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4624     if (Res.getOpcode() != AMDGPUISD::FMED3)
4625       return SDValue(); // Op got folded away.
4626 
4627     if (!N0.hasOneUse()) {
4628       SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4629       DAG.ReplaceAllUsesWith(N0, Neg);
4630 
4631       for (SDNode *U : Neg->uses())
4632         DCI.AddToWorklist(U);
4633     }
4634 
4635     return Res;
4636   }
4637   case ISD::FP_EXTEND:
4638   case ISD::FTRUNC:
4639   case ISD::FRINT:
4640   case ISD::FNEARBYINT: // XXX - Should fround be handled?
4641   case ISD::FSIN:
4642   case ISD::FCANONICALIZE:
4643   case AMDGPUISD::RCP:
4644   case AMDGPUISD::RCP_LEGACY:
4645   case AMDGPUISD::RCP_IFLAG:
4646   case AMDGPUISD::SIN_HW: {
4647     SDValue CvtSrc = N0.getOperand(0);
4648     if (CvtSrc.getOpcode() == ISD::FNEG) {
4649       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4650       // (fneg (rcp (fneg x))) -> (rcp x)
4651       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4652     }
4653 
4654     if (!N0.hasOneUse())
4655       return SDValue();
4656 
4657     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4658     // (fneg (rcp x)) -> (rcp (fneg x))
4659     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4660     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4661   }
4662   case ISD::FP_ROUND: {
4663     SDValue CvtSrc = N0.getOperand(0);
4664 
4665     if (CvtSrc.getOpcode() == ISD::FNEG) {
4666       // (fneg (fp_round (fneg x))) -> (fp_round x)
4667       return DAG.getNode(ISD::FP_ROUND, SL, VT,
4668                          CvtSrc.getOperand(0), N0.getOperand(1));
4669     }
4670 
4671     if (!N0.hasOneUse())
4672       return SDValue();
4673 
4674     // (fneg (fp_round x)) -> (fp_round (fneg x))
4675     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4676     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4677   }
4678   case ISD::FP16_TO_FP: {
4679     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4680     // f16, but legalization of f16 fneg ends up pulling it out of the source.
4681     // Put the fneg back as a legal source operation that can be matched later.
4682     SDLoc SL(N);
4683 
4684     SDValue Src = N0.getOperand(0);
4685     EVT SrcVT = Src.getValueType();
4686 
4687     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4688     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4689                                   DAG.getConstant(0x8000, SL, SrcVT));
4690     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4691   }
4692   case ISD::SELECT: {
4693     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4694     // TODO: Invert conditions of foldFreeOpFromSelect
4695     return SDValue();
4696   }
4697   case ISD::BITCAST: {
4698     SDLoc SL(N);
4699     SDValue BCSrc = N0.getOperand(0);
4700     if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4701       SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4702       if (HighBits.getValueType().getSizeInBits() != 32 ||
4703           !fnegFoldsIntoOp(HighBits.getNode()))
4704         return SDValue();
4705 
4706       // f64 fneg only really needs to operate on the high half of the
4707       // register, so try to force it to an f32 operation to help make use of
4708       // source modifiers.
4709       //
4710       //
4711       // fneg (f64 (bitcast (build_vector x, y))) ->
4712       // f64 (bitcast (build_vector (bitcast i32:x to f32),
4713       //                            (fneg (bitcast i32:y to f32))))
4714 
4715       SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4716       SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4717       SDValue CastBack =
4718           DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4719 
4720       SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4721       Ops.back() = CastBack;
4722       DCI.AddToWorklist(NegHi.getNode());
4723       SDValue Build =
4724           DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4725       SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4726 
4727       if (!N0.hasOneUse())
4728         DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4729       return Result;
4730     }
4731 
4732     if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4733         BCSrc.hasOneUse()) {
4734       // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4735       //   select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4736 
4737       // TODO: Cast back result for multiple uses is beneficial in some cases.
4738 
4739       SDValue LHS =
4740           DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4741       SDValue RHS =
4742           DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4743 
4744       SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4745       SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4746 
4747       return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4748                          NegRHS);
4749     }
4750 
4751     return SDValue();
4752   }
4753   default:
4754     return SDValue();
4755   }
4756 }
4757 
4758 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4759                                                  DAGCombinerInfo &DCI) const {
4760   SelectionDAG &DAG = DCI.DAG;
4761   SDValue N0 = N->getOperand(0);
4762 
4763   if (!N0.hasOneUse())
4764     return SDValue();
4765 
4766   switch (N0.getOpcode()) {
4767   case ISD::FP16_TO_FP: {
4768     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4769     SDLoc SL(N);
4770     SDValue Src = N0.getOperand(0);
4771     EVT SrcVT = Src.getValueType();
4772 
4773     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4774     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4775                                   DAG.getConstant(0x7fff, SL, SrcVT));
4776     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4777   }
4778   default:
4779     return SDValue();
4780   }
4781 }
4782 
4783 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4784                                                 DAGCombinerInfo &DCI) const {
4785   const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4786   if (!CFP)
4787     return SDValue();
4788 
4789   // XXX - Should this flush denormals?
4790   const APFloat &Val = CFP->getValueAPF();
4791   APFloat One(Val.getSemantics(), "1.0");
4792   return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4793 }
4794 
4795 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4796                                                 DAGCombinerInfo &DCI) const {
4797   SelectionDAG &DAG = DCI.DAG;
4798   SDLoc DL(N);
4799 
4800   switch(N->getOpcode()) {
4801   default:
4802     break;
4803   case ISD::BITCAST: {
4804     EVT DestVT = N->getValueType(0);
4805 
4806     // Push casts through vector builds. This helps avoid emitting a large
4807     // number of copies when materializing floating point vector constants.
4808     //
4809     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
4810     //   vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
4811     if (DestVT.isVector()) {
4812       SDValue Src = N->getOperand(0);
4813       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
4814         EVT SrcVT = Src.getValueType();
4815         unsigned NElts = DestVT.getVectorNumElements();
4816 
4817         if (SrcVT.getVectorNumElements() == NElts) {
4818           EVT DestEltVT = DestVT.getVectorElementType();
4819 
4820           SmallVector<SDValue, 8> CastedElts;
4821           SDLoc SL(N);
4822           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
4823             SDValue Elt = Src.getOperand(I);
4824             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
4825           }
4826 
4827           return DAG.getBuildVector(DestVT, SL, CastedElts);
4828         }
4829       }
4830     }
4831 
4832     if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
4833       break;
4834 
4835     // Fold bitcasts of constants.
4836     //
4837     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4838     // TODO: Generalize and move to DAGCombiner
4839     SDValue Src = N->getOperand(0);
4840     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4841       SDLoc SL(N);
4842       uint64_t CVal = C->getZExtValue();
4843       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4844                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4845                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4846       return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4847     }
4848 
4849     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4850       const APInt &Val = C->getValueAPF().bitcastToAPInt();
4851       SDLoc SL(N);
4852       uint64_t CVal = Val.getZExtValue();
4853       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4854                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4855                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4856 
4857       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4858     }
4859 
4860     break;
4861   }
4862   case ISD::SHL: {
4863     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4864       break;
4865 
4866     return performShlCombine(N, DCI);
4867   }
4868   case ISD::SRL: {
4869     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4870       break;
4871 
4872     return performSrlCombine(N, DCI);
4873   }
4874   case ISD::SRA: {
4875     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4876       break;
4877 
4878     return performSraCombine(N, DCI);
4879   }
4880   case ISD::TRUNCATE:
4881     return performTruncateCombine(N, DCI);
4882   case ISD::MUL:
4883     return performMulCombine(N, DCI);
4884   case AMDGPUISD::MUL_U24:
4885   case AMDGPUISD::MUL_I24: {
4886     if (SDValue Simplified = simplifyMul24(N, DCI))
4887       return Simplified;
4888     return performMulCombine(N, DCI);
4889   }
4890   case AMDGPUISD::MULHI_I24:
4891   case AMDGPUISD::MULHI_U24:
4892     return simplifyMul24(N, DCI);
4893   case ISD::SMUL_LOHI:
4894   case ISD::UMUL_LOHI:
4895     return performMulLoHiCombine(N, DCI);
4896   case ISD::MULHS:
4897     return performMulhsCombine(N, DCI);
4898   case ISD::MULHU:
4899     return performMulhuCombine(N, DCI);
4900   case ISD::SELECT:
4901     return performSelectCombine(N, DCI);
4902   case ISD::FNEG:
4903     return performFNegCombine(N, DCI);
4904   case ISD::FABS:
4905     return performFAbsCombine(N, DCI);
4906   case AMDGPUISD::BFE_I32:
4907   case AMDGPUISD::BFE_U32: {
4908     assert(!N->getValueType(0).isVector() &&
4909            "Vector handling of BFE not implemented");
4910     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4911     if (!Width)
4912       break;
4913 
4914     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4915     if (WidthVal == 0)
4916       return DAG.getConstant(0, DL, MVT::i32);
4917 
4918     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4919     if (!Offset)
4920       break;
4921 
4922     SDValue BitsFrom = N->getOperand(0);
4923     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4924 
4925     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4926 
4927     if (OffsetVal == 0) {
4928       // This is already sign / zero extended, so try to fold away extra BFEs.
4929       unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4930 
4931       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4932       if (OpSignBits >= SignBits)
4933         return BitsFrom;
4934 
4935       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4936       if (Signed) {
4937         // This is a sign_extend_inreg. Replace it to take advantage of existing
4938         // DAG Combines. If not eliminated, we will match back to BFE during
4939         // selection.
4940 
4941         // TODO: The sext_inreg of extended types ends up split into multiple
4942         // operations, although we could handle them in a single BFE.
4943         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4944                            DAG.getValueType(SmallVT));
4945       }
4946 
4947       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4948     }
4949 
4950     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4951       if (Signed) {
4952         return constantFoldBFE<int32_t>(DAG,
4953                                         CVal->getSExtValue(),
4954                                         OffsetVal,
4955                                         WidthVal,
4956                                         DL);
4957       }
4958 
4959       return constantFoldBFE<uint32_t>(DAG,
4960                                        CVal->getZExtValue(),
4961                                        OffsetVal,
4962                                        WidthVal,
4963                                        DL);
4964     }
4965 
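         // If the extracted field extends to (or past) bit 31, the BFE reduces
         // to a single right shift, except for the 16-bit extract at offset 16
         // on targets with SDWA.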
4966     if ((OffsetVal + WidthVal) >= 32 &&
4967         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4968       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4969       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4970                          BitsFrom, ShiftVal);
4971     }
4972 
4973     if (BitsFrom.hasOneUse()) {
4974       APInt Demanded = APInt::getBitsSet(32,
4975                                          OffsetVal,
4976                                          OffsetVal + WidthVal);
4977 
4978       KnownBits Known;
4979       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4980                                             !DCI.isBeforeLegalizeOps());
4981       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4982       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4983           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4984         DCI.CommitTargetLoweringOpt(TLO);
4985       }
4986     }
4987 
4988     break;
4989   }
4990   case ISD::LOAD:
4991     return performLoadCombine(N, DCI);
4992   case ISD::STORE:
4993     return performStoreCombine(N, DCI);
4994   case AMDGPUISD::RCP:
4995   case AMDGPUISD::RCP_IFLAG:
4996     return performRcpCombine(N, DCI);
4997   case ISD::AssertZext:
4998   case ISD::AssertSext:
4999     return performAssertSZExtCombine(N, DCI);
5000   case ISD::INTRINSIC_WO_CHAIN:
5001     return performIntrinsicWOChainCombine(N, DCI);
5002   }
5003   return SDValue();
5004 }
5005 
5006 //===----------------------------------------------------------------------===//
5007 // Helper functions
5008 //===----------------------------------------------------------------------===//
5009 
5010 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5011                                                    const TargetRegisterClass *RC,
5012                                                    Register Reg, EVT VT,
5013                                                    const SDLoc &SL,
5014                                                    bool RawReg) const {
5015   MachineFunction &MF = DAG.getMachineFunction();
5016   MachineRegisterInfo &MRI = MF.getRegInfo();
5017   Register VReg;
5018 
5019   if (!MRI.isLiveIn(Reg)) {
5020     VReg = MRI.createVirtualRegister(RC);
5021     MRI.addLiveIn(Reg, VReg);
5022   } else {
5023     VReg = MRI.getLiveInVirtReg(Reg);
5024   }
5025 
5026   if (RawReg)
5027     return DAG.getRegister(VReg, VT);
5028 
5029   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5030 }
5031 
5032 // This may be called multiple times, and nothing prevents creating multiple
5033 // objects at the same offset. See if we already defined this object.
5034 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5035                                        int64_t Offset) {
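       // Fixed stack objects use negative frame indices, so scan from
       // getObjectIndexBegin() up to (but not including) zero.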
5036   for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5037     if (MFI.getObjectOffset(I) == Offset) {
5038       assert(MFI.getObjectSize(I) == Size);
5039       return I;
5040     }
5041   }
5042 
5043   return MFI.CreateFixedObject(Size, Offset, true);
5044 }
5045 
5046 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5047                                                   EVT VT,
5048                                                   const SDLoc &SL,
5049                                                   int64_t Offset) const {
5050   MachineFunction &MF = DAG.getMachineFunction();
5051   MachineFrameInfo &MFI = MF.getFrameInfo();
5052   int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5053 
5054   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5055   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5056 
5057   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5058                      MachineMemOperand::MODereferenceable |
5059                          MachineMemOperand::MOInvariant);
5060 }
5061 
5062 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5063                                                    const SDLoc &SL,
5064                                                    SDValue Chain,
5065                                                    SDValue ArgVal,
5066                                                    int64_t Offset) const {
5067   MachineFunction &MF = DAG.getMachineFunction();
5068   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5069   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5070 
5071   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5072   // Stores to the argument stack area are relative to the stack pointer.
5073   SDValue SP =
5074       DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5075   Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5076   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5077                                MachineMemOperand::MODereferenceable);
5078   return Store;
5079 }
5080 
5081 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5082                                              const TargetRegisterClass *RC,
5083                                              EVT VT, const SDLoc &SL,
5084                                              const ArgDescriptor &Arg) const {
5085   assert(Arg && "Attempting to load missing argument");
5086 
5087   SDValue V = Arg.isRegister() ?
5088     CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5089     loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5090 
5091   if (!Arg.isMasked())
5092     return V;
5093 
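       // The value is packed together with other arguments in the register;
       // shift it down to bit 0 and mask off the bits outside its field.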
5094   unsigned Mask = Arg.getMask();
5095   unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5096   V = DAG.getNode(ISD::SRL, SL, VT, V,
5097                   DAG.getShiftAmountConstant(Shift, VT, SL));
5098   return DAG.getNode(ISD::AND, SL, VT, V,
5099                      DAG.getConstant(Mask >> Shift, SL, VT));
5100 }
5101 
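     // Implicit kernel arguments are laid out immediately after the explicit
     // kernel arguments, aligned as required for the implicit argument pointer.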
5102 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5103     uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5104   unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5105   const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5106   uint64_t ArgOffset =
5107       alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5108   switch (Param) {
5109   case FIRST_IMPLICIT:
5110     return ArgOffset;
5111   case PRIVATE_BASE:
5112     return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5113   case SHARED_BASE:
5114     return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5115   case QUEUE_PTR:
5116     return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5117   }
5118   llvm_unreachable("unexpected implicit parameter type");
5119 }
5120 
5121 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5122     const MachineFunction &MF, const ImplicitParameter Param) const {
5123   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5124   return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5125 }
5126 
5127 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5128 
5129 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5130   switch ((AMDGPUISD::NodeType)Opcode) {
5131   case AMDGPUISD::FIRST_NUMBER: break;
5132   // AMDIL DAG nodes
5133   NODE_NAME_CASE(UMUL);
5134   NODE_NAME_CASE(BRANCH_COND);
5135 
5136   // AMDGPU DAG nodes
5137   NODE_NAME_CASE(IF)
5138   NODE_NAME_CASE(ELSE)
5139   NODE_NAME_CASE(LOOP)
5140   NODE_NAME_CASE(CALL)
5141   NODE_NAME_CASE(TC_RETURN)
5142   NODE_NAME_CASE(TC_RETURN_GFX)
5143   NODE_NAME_CASE(TRAP)
5144   NODE_NAME_CASE(RET_GLUE)
5145   NODE_NAME_CASE(RETURN_TO_EPILOG)
5146   NODE_NAME_CASE(ENDPGM)
5147   NODE_NAME_CASE(ENDPGM_TRAP)
5148   NODE_NAME_CASE(DWORDADDR)
5149   NODE_NAME_CASE(FRACT)
5150   NODE_NAME_CASE(SETCC)
5151   NODE_NAME_CASE(SETREG)
5152   NODE_NAME_CASE(DENORM_MODE)
5153   NODE_NAME_CASE(FMA_W_CHAIN)
5154   NODE_NAME_CASE(FMUL_W_CHAIN)
5155   NODE_NAME_CASE(CLAMP)
5156   NODE_NAME_CASE(COS_HW)
5157   NODE_NAME_CASE(SIN_HW)
5158   NODE_NAME_CASE(FMAX_LEGACY)
5159   NODE_NAME_CASE(FMIN_LEGACY)
5160   NODE_NAME_CASE(FMAX3)
5161   NODE_NAME_CASE(SMAX3)
5162   NODE_NAME_CASE(UMAX3)
5163   NODE_NAME_CASE(FMIN3)
5164   NODE_NAME_CASE(SMIN3)
5165   NODE_NAME_CASE(UMIN3)
5166   NODE_NAME_CASE(FMED3)
5167   NODE_NAME_CASE(SMED3)
5168   NODE_NAME_CASE(UMED3)
5169   NODE_NAME_CASE(FDOT2)
5170   NODE_NAME_CASE(URECIP)
5171   NODE_NAME_CASE(DIV_SCALE)
5172   NODE_NAME_CASE(DIV_FMAS)
5173   NODE_NAME_CASE(DIV_FIXUP)
5174   NODE_NAME_CASE(FMAD_FTZ)
5175   NODE_NAME_CASE(RCP)
5176   NODE_NAME_CASE(RSQ)
5177   NODE_NAME_CASE(RCP_LEGACY)
5178   NODE_NAME_CASE(RCP_IFLAG)
5179   NODE_NAME_CASE(LOG)
5180   NODE_NAME_CASE(EXP)
5181   NODE_NAME_CASE(FMUL_LEGACY)
5182   NODE_NAME_CASE(RSQ_CLAMP)
5183   NODE_NAME_CASE(FP_CLASS)
5184   NODE_NAME_CASE(DOT4)
5185   NODE_NAME_CASE(CARRY)
5186   NODE_NAME_CASE(BORROW)
5187   NODE_NAME_CASE(BFE_U32)
5188   NODE_NAME_CASE(BFE_I32)
5189   NODE_NAME_CASE(BFI)
5190   NODE_NAME_CASE(BFM)
5191   NODE_NAME_CASE(FFBH_U32)
5192   NODE_NAME_CASE(FFBH_I32)
5193   NODE_NAME_CASE(FFBL_B32)
5194   NODE_NAME_CASE(MUL_U24)
5195   NODE_NAME_CASE(MUL_I24)
5196   NODE_NAME_CASE(MULHI_U24)
5197   NODE_NAME_CASE(MULHI_I24)
5198   NODE_NAME_CASE(MAD_U24)
5199   NODE_NAME_CASE(MAD_I24)
5200   NODE_NAME_CASE(MAD_I64_I32)
5201   NODE_NAME_CASE(MAD_U64_U32)
5202   NODE_NAME_CASE(PERM)
5203   NODE_NAME_CASE(TEXTURE_FETCH)
5204   NODE_NAME_CASE(R600_EXPORT)
5205   NODE_NAME_CASE(CONST_ADDRESS)
5206   NODE_NAME_CASE(REGISTER_LOAD)
5207   NODE_NAME_CASE(REGISTER_STORE)
5208   NODE_NAME_CASE(SAMPLE)
5209   NODE_NAME_CASE(SAMPLEB)
5210   NODE_NAME_CASE(SAMPLED)
5211   NODE_NAME_CASE(SAMPLEL)
5212   NODE_NAME_CASE(CVT_F32_UBYTE0)
5213   NODE_NAME_CASE(CVT_F32_UBYTE1)
5214   NODE_NAME_CASE(CVT_F32_UBYTE2)
5215   NODE_NAME_CASE(CVT_F32_UBYTE3)
5216   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5217   NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5218   NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5219   NODE_NAME_CASE(CVT_PK_I16_I32)
5220   NODE_NAME_CASE(CVT_PK_U16_U32)
5221   NODE_NAME_CASE(FP_TO_FP16)
5222   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5223   NODE_NAME_CASE(CONST_DATA_PTR)
5224   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5225   NODE_NAME_CASE(LDS)
5226   NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5227   NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5228   NODE_NAME_CASE(DUMMY_CHAIN)
5229   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5230   NODE_NAME_CASE(LOAD_D16_HI)
5231   NODE_NAME_CASE(LOAD_D16_LO)
5232   NODE_NAME_CASE(LOAD_D16_HI_I8)
5233   NODE_NAME_CASE(LOAD_D16_HI_U8)
5234   NODE_NAME_CASE(LOAD_D16_LO_I8)
5235   NODE_NAME_CASE(LOAD_D16_LO_U8)
5236   NODE_NAME_CASE(STORE_MSKOR)
5237   NODE_NAME_CASE(LOAD_CONSTANT)
5238   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5239   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5240   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5241   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5242   NODE_NAME_CASE(DS_ORDERED_COUNT)
5243   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5244   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5245   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
5246   NODE_NAME_CASE(BUFFER_LOAD)
5247   NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5248   NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5249   NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5250   NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5251   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5252   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5253   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5254   NODE_NAME_CASE(SBUFFER_LOAD)
5255   NODE_NAME_CASE(BUFFER_STORE)
5256   NODE_NAME_CASE(BUFFER_STORE_BYTE)
5257   NODE_NAME_CASE(BUFFER_STORE_SHORT)
5258   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5259   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5260   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5261   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5262   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5263   NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5264   NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5265   NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5266   NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5267   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5268   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5269   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5270   NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5271   NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5272   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5273   NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5274   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5275   NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5276   NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5277 
5278   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5279   }
5280   return nullptr;
5281 }
5282 
5283 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5284                                               SelectionDAG &DAG, int Enabled,
5285                                               int &RefinementSteps,
5286                                               bool &UseOneConstNR,
5287                                               bool Reciprocal) const {
5288   EVT VT = Operand.getValueType();
5289 
5290   if (VT == MVT::f32) {
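         // Use the hardware reciprocal square root directly; no Newton-Raphson
         // refinement steps are requested.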
5291     RefinementSteps = 0;
5292     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5293   }
5294 
5295   // TODO: There is also an f64 rsq instruction, but the documentation is less
5296   // clear on its precision.
5297 
5298   return SDValue();
5299 }
5300 
5301 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5302                                                SelectionDAG &DAG, int Enabled,
5303                                                int &RefinementSteps) const {
5304   EVT VT = Operand.getValueType();
5305 
5306   if (VT == MVT::f32) {
5307     // Reciprocal, < 1 ulp error.
5308     //
5309     // This reciprocal approximation converges to < 0.5 ulp error with one
5310     // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5311 
5312     RefinementSteps = 0;
5313     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5314   }
5315 
5316   // TODO: There is also an f64 rcp instruction, but the documentation is less
5317   // clear on its precision.
5318 
5319   return SDValue();
5320 }
5321 
5322 static unsigned workitemIntrinsicDim(unsigned ID) {
5323   switch (ID) {
5324   case Intrinsic::amdgcn_workitem_id_x:
5325     return 0;
5326   case Intrinsic::amdgcn_workitem_id_y:
5327     return 1;
5328   case Intrinsic::amdgcn_workitem_id_z:
5329     return 2;
5330   default:
5331     llvm_unreachable("not a workitem intrinsic");
5332   }
5333 }
5334 
5335 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5336     const SDValue Op, KnownBits &Known,
5337     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5338 
5339   Known.resetAll(); // Don't know anything.
5340 
5341   unsigned Opc = Op.getOpcode();
5342 
5343   switch (Opc) {
5344   default:
5345     break;
5346   case AMDGPUISD::CARRY:
5347   case AMDGPUISD::BORROW: {
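         // Carry / borrow results are 0 or 1, so only the low bit can be set.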
5348     Known.Zero = APInt::getHighBitsSet(32, 31);
5349     break;
5350   }
5351 
5352   case AMDGPUISD::BFE_I32:
5353   case AMDGPUISD::BFE_U32: {
5354     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5355     if (!CWidth)
5356       return;
5357 
5358     uint32_t Width = CWidth->getZExtValue() & 0x1f;
5359 
5360     if (Opc == AMDGPUISD::BFE_U32)
5361       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5362 
5363     break;
5364   }
5365   case AMDGPUISD::FP_TO_FP16: {
5366     unsigned BitWidth = Known.getBitWidth();
5367 
5368     // High bits are zero.
5369     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5370     break;
5371   }
5372   case AMDGPUISD::MUL_U24:
5373   case AMDGPUISD::MUL_I24: {
5374     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5375     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5376     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5377                       RHSKnown.countMinTrailingZeros();
5378     Known.Zero.setLowBits(std::min(TrailZ, 32u));
5379     // Skip extra check if all bits are known zeros.
5380     if (TrailZ >= 32)
5381       break;
5382 
5383     // Truncate to 24 bits.
5384     LHSKnown = LHSKnown.trunc(24);
5385     RHSKnown = RHSKnown.trunc(24);
5386 
5387     if (Opc == AMDGPUISD::MUL_I24) {
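           // Bound the number of significant bits in the product; any bits above
           // that bound are copies of the sign bit, which is known when both
           // operand signs are known.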
5388       unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5389       unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5390       unsigned MaxValBits = LHSValBits + RHSValBits;
5391       if (MaxValBits > 32)
5392         break;
5393       unsigned SignBits = 32 - MaxValBits + 1;
5394       bool LHSNegative = LHSKnown.isNegative();
5395       bool LHSNonNegative = LHSKnown.isNonNegative();
5396       bool LHSPositive = LHSKnown.isStrictlyPositive();
5397       bool RHSNegative = RHSKnown.isNegative();
5398       bool RHSNonNegative = RHSKnown.isNonNegative();
5399       bool RHSPositive = RHSKnown.isStrictlyPositive();
5400 
5401       if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5402         Known.Zero.setHighBits(SignBits);
5403       else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5404         Known.One.setHighBits(SignBits);
5405     } else {
5406       unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5407       unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5408       unsigned MaxValBits = LHSValBits + RHSValBits;
5409       if (MaxValBits >= 32)
5410         break;
5411       Known.Zero.setBitsFrom(MaxValBits);
5412     }
5413     break;
5414   }
5415   case AMDGPUISD::PERM: {
5416     ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5417     if (!CMask)
5418       return;
5419 
5420     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5421     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5422     unsigned Sel = CMask->getZExtValue();
5423 
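         // Decode each byte of the selector: values 0-3 take a byte from the
         // second operand, 4-6 take a byte from the first operand, 0x0c produces
         // 0x00, and values above 0x0c produce 0xff; other values stay unknown.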
5424     for (unsigned I = 0; I < 32; I += 8) {
5425       unsigned SelBits = Sel & 0xff;
5426       if (SelBits < 4) {
5427         SelBits *= 8;
5428         Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5429         Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5430       } else if (SelBits < 7) {
5431         SelBits = (SelBits & 3) * 8;
5432         Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5433         Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5434       } else if (SelBits == 0x0c) {
5435         Known.Zero |= 0xFFull << I;
5436       } else if (SelBits > 0x0c) {
5437         Known.One |= 0xFFull << I;
5438       }
5439       Sel >>= 8;
5440     }
5441     break;
5442   }
5443   case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5444     Known.Zero.setHighBits(24);
5445     break;
5446   }
5447   case AMDGPUISD::BUFFER_LOAD_USHORT: {
5448     Known.Zero.setHighBits(16);
5449     break;
5450   }
5451   case AMDGPUISD::LDS: {
5452     auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5453     Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5454 
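         // An LDS address fits in 16 bits, and its low bits are zero according
         // to the global's alignment.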
    Known.Zero.setHighBits(16);
    Known.Zero.setLowBits(Log2(Alignment));
    break;
  }
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
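    // The result is always one of the three operands, so only bits known in
    // all of them are known in the result.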
    KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
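      // The workitem ID is bounded by the subtarget's maximum workitem ID for
      // this dimension, so all bits above that bound are known zero.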
      unsigned MaxValue = Subtarget->getMaxWorkitemID(
          DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
      Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
      break;
    }
    default:
      break;
    }
  }
  }
}

unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!Width)
      return 1;

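    // A signed bitfield extract of Width bits is sign-extended to 32 bits, so
    // at least 32 - Width + 1 of the result bits match the sign bit.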
    unsigned SignBits = 32 - Width->getZExtValue() + 1;
    if (!isNullConstant(Op.getOperand(1)))
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    return std::max(SignBits, Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
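    // An unsigned extract of Width bits clears the upper 32 - Width bits, so
    // those bits are all copies of the (zero) sign bit.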
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW:
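    // The result is either 0 or 1, so the top 31 bits all match the sign bit.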
    return 31;
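  // Buffer loads sign- or zero-extend from 8/16 bits, which fixes how many of
  // the high bits of the 32-bit result repeat the sign bit.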
  case AMDGPUISD::BUFFER_LOAD_BYTE:
    return 25;
  case AMDGPUISD::BUFFER_LOAD_SHORT:
    return 17;
  case AMDGPUISD::BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPUISD::BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPUISD::FP_TO_FP16:
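    // The f16 result only occupies the low 16 bits of the 32-bit result.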
    return 16;
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
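    // The result is always one of the three operands, so it has at least as
    // many sign bits as the operand with the fewest.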
    unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
    if (Tmp2 == 1)
      return 1; // Early out.

    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
    if (Tmp1 == 1)
      return 1; // Early out.

    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.

    return std::min(Tmp0, std::min(Tmp1, Tmp2));
  }
  default:
    return 1;
  }
}

unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
  GISelKnownBits &Analysis, Register R,
  const APInt &DemandedElts, const MachineRegisterInfo &MRI,
  unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (!MI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (MI->getOpcode()) {
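  // These buffer-load cases mirror the SelectionDAG handling above: signed
  // loads sign-extend from 8/16 bits and unsigned loads zero-extend.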
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min(Tmp0, std::min(Tmp1, Tmp2));
  }
  default:
    return 1;
  }
}

bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                        const SelectionDAG &DAG,
                                                        bool SNaN,
                                                        unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (SNaN)
      return true;

    // TODO: Could check no-NaNs on just one of the operands for each of
    // these, but which one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMAD_FTZ: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
  }
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
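    // Converting an unsigned byte to f32 always produces a small finite
    // value, never a NaN.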
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Need an is-known-positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need a check for infinity.
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp: {
      if (SNaN)
        return true;

      // TODO: Need an is-known-positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}

TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
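  // Nand and the floating-point operations are always expanded to a
  // compare-exchange loop here; other operations are left alone only when
  // they operate on 32- or 64-bit integers.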
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    return AtomicExpansionKind::CmpXChg;
  default: {
    if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
      unsigned Size = IntTy->getBitWidth();
      if (Size == 32 || Size == 64)
        return AtomicExpansionKind::None;
    }

    return AtomicExpansionKind::CmpXChg;
  }
  }
}

bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
    unsigned Opc, LLT Ty1, LLT Ty2) const {
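  // Allow constant unsigned bitfield extracts only on 32- or 64-bit values
  // whose offset/width operands are 32-bit.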
  return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
         Ty2 == LLT::scalar(32);
}

/// Whether it is profitable to sink the operands of an instruction I into the
/// basic block of I.
/// This allows source modifiers such as abs and neg to be used more often.
bool AMDGPUTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      Ops.push_back(&Op);
  }

  return !Ops.empty();
}