1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
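// Match a scalar or vector whose element size is a multiple of 32 bits and
// whose total size is at most MaxSize bits.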
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
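// Match vectors with sub-32-bit elements, an odd element count, and a total
// size that is not a multiple of 32 bits, e.g. <3 x s16>. These get one more
// element added via oneMoreElement wherever this predicate is used below.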
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
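// Match vectors of 16-bit elements with more than two elements, i.e. wider
// than v2s16.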
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
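// Mutation: add a single vector element, e.g. <3 x s16> -> <4 x s16>.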
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
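// Mutation: reduce the element count so the result is at most 64 bits wide,
// e.g. <4 x s32> (128 bits) becomes <2 x s32>. Used below together with
// vectorWiderThan to break wide vectors into 64-bit pieces.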
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
// Vectors with 32, 64, 128 or 256-bit elements, vectors that are a multiple of
// v2s16, and scalars that are a multiple of 32 bits up to 1024 bits.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal); // VCC branches
248   setAction({G_BRCOND, S32}, Legal); // SCC branches
249 
250   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
251   // elements for v3s16
252   getActionDefinitionsBuilder(G_PHI)
253     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
254     .legalFor(AllS32Vectors)
255     .legalFor(AllS64Vectors)
256     .legalFor(AddrSpaces64)
257     .legalFor(AddrSpaces32)
258     .clampScalar(0, S32, S256)
259     .widenScalarToNextPow2(0, 32)
260     .clampMaxNumElements(0, S32, 16)
261     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
262     .legalIf(isPointer(0));
263 
264   if (ST.has16BitInsts()) {
265     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
266       .legalFor({S32, S16})
267       .clampScalar(0, S16, S32)
268       .scalarize(0);
269   } else {
270     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271       .legalFor({S32})
272       .clampScalar(0, S32, S32)
273       .scalarize(0);
274   }
275 
276   // FIXME: Not really legal. Placeholder for custom lowering.
277   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
278     .legalFor({S32, S64})
279     .clampScalar(0, S32, S64)
280     .widenScalarToNextPow2(0, 32)
281     .scalarize(0);
282 
283   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
284     .legalFor({S32})
285     .clampScalar(0, S32, S32)
286     .scalarize(0);
287 
288   // Report legal for any types we can handle anywhere. For the cases only legal
289   // on the SALU, RegBankSelect will be able to re-legalize.
290   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
291     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
292     .clampScalar(0, S32, S64)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
295     .widenScalarToNextPow2(0)
296     .scalarize(0);
297 
298   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
299                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
300     .legalFor({{S32, S1}, {S32, S32}})
301     .clampScalar(0, S32, S32)
302     .scalarize(0); // TODO: Implement.
303 
304   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
305     .lower();
306 
307   getActionDefinitionsBuilder(G_BITCAST)
308     // Don't worry about the size constraint.
309     .legalIf(all(isRegisterType(0), isRegisterType(1)))
310     // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});
312 
313   getActionDefinitionsBuilder(G_FCONSTANT)
314     .legalFor({S32, S64, S16})
315     .clampScalar(0, S16, S64);
316 
317   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
318     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
319                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
320     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
321     .clampScalarOrElt(0, S32, S1024)
322     .legalIf(isMultiple32(0))
323     .widenScalarToNextPow2(0, 32)
324     .clampMaxNumElements(0, S32, 16);
325 
326 
327   // FIXME: i1 operands to intrinsics should always be legal, but other i1
328   // values may not be legal.  We need to figure out how to distinguish
329   // between these two scenarios.
330   getActionDefinitionsBuilder(G_CONSTANT)
331     .legalFor({S1, S32, S64, S16, GlobalPtr,
332                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
333     .clampScalar(0, S32, S64)
334     .widenScalarToNextPow2(0)
335     .legalIf(isPointer(0));
336 
337   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340 
341 
342   auto &FPOpActions = getActionDefinitionsBuilder(
343     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344     .legalFor({S32, S64});
345   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346     .customFor({S32, S64});
347   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348     .customFor({S32, S64});
349 
350   if (ST.has16BitInsts()) {
351     if (ST.hasVOP3PInsts())
352       FPOpActions.legalFor({S16, V2S16});
353     else
354       FPOpActions.legalFor({S16});
355 
356     TrigActions.customFor({S16});
357     FDIVActions.customFor({S16});
358   }
359 
360   auto &MinNumMaxNum = getActionDefinitionsBuilder({
361       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362 
363   if (ST.hasVOP3PInsts()) {
364     MinNumMaxNum.customFor(FPTypesPK16)
365       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366       .clampMaxNumElements(0, S16, 2)
367       .clampScalar(0, S16, S64)
368       .scalarize(0);
369   } else if (ST.has16BitInsts()) {
370     MinNumMaxNum.customFor(FPTypes16)
371       .clampScalar(0, S16, S64)
372       .scalarize(0);
373   } else {
374     MinNumMaxNum.customFor(FPTypesBase)
375       .clampScalar(0, S32, S64)
376       .scalarize(0);
377   }
378 
379   if (ST.hasVOP3PInsts())
380     FPOpActions.clampMaxNumElements(0, S16, 2);
381 
382   FPOpActions
383     .scalarize(0)
384     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 
386   TrigActions
387     .scalarize(0)
388     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389 
390   FDIVActions
391     .scalarize(0)
392     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393 
394   getActionDefinitionsBuilder({G_FNEG, G_FABS})
395     .legalFor(FPTypesPK16)
396     .clampMaxNumElements(0, S16, 2)
397     .scalarize(0)
398     .clampScalar(0, S16, S64);
399 
400   // TODO: Implement
401   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
402 
403   if (ST.has16BitInsts()) {
404     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
405       .legalFor({S32, S64, S16})
406       .scalarize(0)
407       .clampScalar(0, S16, S64);
408   } else {
409     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
410       .legalFor({S32, S64})
411       .scalarize(0)
412       .clampScalar(0, S32, S64);
413   }
414 
415   getActionDefinitionsBuilder(G_FPTRUNC)
416     .legalFor({{S32, S64}, {S16, S32}})
417     .scalarize(0);
418 
419   getActionDefinitionsBuilder(G_FPEXT)
420     .legalFor({{S64, S32}, {S32, S16}})
421     .lowerFor({{S64, S16}}) // FIXME: Implement
422     .scalarize(0);
423 
424   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
425   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
426 
427   getActionDefinitionsBuilder(G_FSUB)
428       // Use actual fsub instruction
429       .legalFor({S32})
430       // Must use fadd + fneg
431       .lowerFor({S64, S16, V2S16})
432       .scalarize(0)
433       .clampScalar(0, S32, S64);
434 
435   // Whether this is legal depends on the floating point mode for the function.
436   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
437   if (ST.hasMadF16())
438     FMad.customFor({S32, S16});
439   else
440     FMad.customFor({S32});
441   FMad.scalarize(0)
442       .lower();
443 
444   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
445     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
446                {S32, S1}, {S64, S1}, {S16, S1},
447                {S96, S32},
448                // FIXME: Hack
449                {S64, LLT::scalar(33)},
450                {S32, S8}, {S32, LLT::scalar(24)}})
451     .scalarize(0)
452     .clampScalar(0, S32, S64);
453 
454   // TODO: Split s1->s64 during regbankselect for VALU.
455   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
456     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
457     .lowerFor({{S32, S64}})
458     .lowerIf(typeIs(1, S1))
459     .customFor({{S64, S64}});
460   if (ST.has16BitInsts())
461     IToFP.legalFor({{S16, S16}});
462   IToFP.clampScalar(1, S32, S64)
463        .scalarize(0);
464 
465   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
466     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
467   if (ST.has16BitInsts())
468     FPToI.legalFor({{S16, S16}});
469   else
470     FPToI.minScalar(1, S32);
471 
472   FPToI.minScalar(0, S32)
473        .scalarize(0);
474 
475   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
476     .scalarize(0)
477     .lower();
478 
479   if (ST.has16BitInsts()) {
480     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
481       .legalFor({S16, S32, S64})
482       .clampScalar(0, S16, S64)
483       .scalarize(0);
484   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
485     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
486       .legalFor({S32, S64})
487       .clampScalar(0, S32, S64)
488       .scalarize(0);
489   } else {
490     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
491       .legalFor({S32})
492       .customFor({S64})
493       .clampScalar(0, S32, S64)
494       .scalarize(0);
495   }
496 
497   getActionDefinitionsBuilder(G_PTR_ADD)
498     .legalForCartesianProduct(AddrSpaces64, {S64})
499     .legalForCartesianProduct(AddrSpaces32, {S32})
500     .scalarize(0);
501 
502   getActionDefinitionsBuilder(G_PTR_MASK)
503     .scalarize(0)
504     .alwaysLegal();
505 
506   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
507 
508   auto &CmpBuilder =
509     getActionDefinitionsBuilder(G_ICMP)
510     // The compare output type differs based on the register bank of the output,
511     // so make both s1 and s32 legal.
512     //
513     // Scalar compares producing output in scc will be promoted to s32, as that
514     // is the allocatable register type that will be needed for the copy from
515     // scc. This will be promoted during RegBankSelect, and we assume something
516     // before that won't try to use s32 result types.
517     //
518     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
519     // bank.
520     .legalForCartesianProduct(
521       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
522     .legalForCartesianProduct(
523       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
524   if (ST.has16BitInsts()) {
525     CmpBuilder.legalFor({{S1, S16}});
526   }
527 
528   CmpBuilder
529     .widenScalarToNextPow2(1)
530     .clampScalar(1, S32, S64)
531     .scalarize(0)
532     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
533 
534   getActionDefinitionsBuilder(G_FCMP)
535     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
536     .widenScalarToNextPow2(1)
537     .clampScalar(1, S32, S64)
538     .scalarize(0);
539 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
541   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
542                                G_FLOG, G_FLOG2, G_FLOG10})
543     .legalFor({S32})
544     .scalarize(0);
545 
546   // The 64-bit versions produce 32-bit results, but only on the SALU.
547   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
548                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
549                                G_CTPOP})
550     .legalFor({{S32, S32}, {S32, S64}})
551     .clampScalar(0, S32, S32)
552     .clampScalar(1, S32, S64)
553     .scalarize(0)
554     .widenScalarToNextPow2(0, 32)
555     .widenScalarToNextPow2(1, 32);
556 
557   // TODO: Expand for > s32
558   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
559     .legalFor({S32})
560     .clampScalar(0, S32, S32)
561     .scalarize(0);
562 
563   if (ST.has16BitInsts()) {
564     if (ST.hasVOP3PInsts()) {
565       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
566         .legalFor({S32, S16, V2S16})
567         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
568         .clampMaxNumElements(0, S16, 2)
569         .clampScalar(0, S16, S32)
570         .widenScalarToNextPow2(0)
571         .scalarize(0);
572     } else {
573       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
574         .legalFor({S32, S16})
575         .widenScalarToNextPow2(0)
576         .clampScalar(0, S16, S32)
577         .scalarize(0);
578     }
579   } else {
580     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
581       .legalFor({S32})
582       .clampScalar(0, S32, S32)
583       .widenScalarToNextPow2(0)
584       .scalarize(0);
585   }
586 
587   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
588     return [=](const LegalityQuery &Query) {
589       return Query.Types[TypeIdx0].getSizeInBits() <
590              Query.Types[TypeIdx1].getSizeInBits();
591     };
592   };
593 
594   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
595     return [=](const LegalityQuery &Query) {
596       return Query.Types[TypeIdx0].getSizeInBits() >
597              Query.Types[TypeIdx1].getSizeInBits();
598     };
599   };
600 
601   getActionDefinitionsBuilder(G_INTTOPTR)
602     // List the common cases
603     .legalForCartesianProduct(AddrSpaces64, {S64})
604     .legalForCartesianProduct(AddrSpaces32, {S32})
605     .scalarize(0)
606     // Accept any address space as long as the size matches
607     .legalIf(sameSize(0, 1))
608     .widenScalarIf(smallerThan(1, 0),
609       [](const LegalityQuery &Query) {
610         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
611       })
612     .narrowScalarIf(greaterThan(1, 0),
613       [](const LegalityQuery &Query) {
614         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
615       });
616 
617   getActionDefinitionsBuilder(G_PTRTOINT)
618     // List the common cases
619     .legalForCartesianProduct(AddrSpaces64, {S64})
620     .legalForCartesianProduct(AddrSpaces32, {S32})
621     .scalarize(0)
622     // Accept any address space as long as the size matches
623     .legalIf(sameSize(0, 1))
624     .widenScalarIf(smallerThan(0, 1),
625       [](const LegalityQuery &Query) {
626         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
627       })
628     .narrowScalarIf(
629       greaterThan(0, 1),
630       [](const LegalityQuery &Query) {
631         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
632       });
633 
634   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
635     .scalarize(0)
636     .custom();
637 
  // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
639   // handle some operations by just promoting the register during
640   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
641   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
642     switch (AS) {
643     // FIXME: Private element size.
644     case AMDGPUAS::PRIVATE_ADDRESS:
645       return 32;
646     // FIXME: Check subtarget
647     case AMDGPUAS::LOCAL_ADDRESS:
648       return ST.useDS128() ? 128 : 64;
649 
650     // Treat constant and global as identical. SMRD loads are sometimes usable
651     // for global loads (ideally constant address space should be eliminated)
652     // depending on the context. Legality cannot be context dependent, but
653     // RegBankSelect can split the load as necessary depending on the pointer
654     // register bank/uniformity and if the memory is invariant or not written in
655     // a kernel.
656     case AMDGPUAS::CONSTANT_ADDRESS:
657     case AMDGPUAS::GLOBAL_ADDRESS:
658       return 512;
659     default:
660       return 128;
661     }
662   };
663 
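  // Decide whether a load or store must be split: vector extloads, accesses
  // wider than the address space supports, dwordx3 accesses on subtargets
  // without them, and under-aligned accesses the target cannot handle.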
664   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
665     const LLT DstTy = Query.Types[0];
666 
667     // Split vector extloads.
668     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
669     unsigned Align = Query.MMODescrs[0].AlignInBits;
670 
671     if (MemSize < DstTy.getSizeInBits())
672       MemSize = std::max(MemSize, Align);
673 
674     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
675       return true;
676 
677     const LLT PtrTy = Query.Types[1];
678     unsigned AS = PtrTy.getAddressSpace();
679     if (MemSize > maxSizeForAddrSpace(AS))
680       return true;
681 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
    // TODO: May be able to widen depending on alignment etc.
684     unsigned NumRegs = MemSize / 32;
685     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
686       return true;
687 
688     if (Align < MemSize) {
689       const SITargetLowering *TLI = ST.getTargetLowering();
690       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
691     }
692 
693     return false;
694   };
695 
696   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
697   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
698   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
699 
700   // TODO: Refine based on subtargets which support unaligned access or 128-bit
701   // LDS
702   // TODO: Unsupported flat for SI.
703 
704   for (unsigned Op : {G_LOAD, G_STORE}) {
705     const bool IsStore = Op == G_STORE;
706 
707     auto &Actions = getActionDefinitionsBuilder(Op);
708     // Whitelist the common cases.
709     // TODO: Pointer loads
710     // TODO: Wide constant loads
711     // TODO: Only CI+ has 3x loads
712     // TODO: Loads to s16 on gfx9
713     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
714                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
715                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
716                                       {S96, GlobalPtr, 96, GlobalAlign32},
717                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
718                                       {S128, GlobalPtr, 128, GlobalAlign32},
719                                       {S64, GlobalPtr, 64, GlobalAlign32},
720                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
721                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
722                                       {S32, GlobalPtr, 8, GlobalAlign8},
723                                       {S32, GlobalPtr, 16, GlobalAlign16},
724 
725                                       {S32, LocalPtr, 32, 32},
726                                       {S64, LocalPtr, 64, 32},
727                                       {V2S32, LocalPtr, 64, 32},
728                                       {S32, LocalPtr, 8, 8},
729                                       {S32, LocalPtr, 16, 16},
730                                       {V2S16, LocalPtr, 32, 32},
731 
732                                       {S32, PrivatePtr, 32, 32},
733                                       {S32, PrivatePtr, 8, 8},
734                                       {S32, PrivatePtr, 16, 16},
735                                       {V2S16, PrivatePtr, 32, 32},
736 
737                                       {S32, FlatPtr, 32, GlobalAlign32},
738                                       {S32, FlatPtr, 16, GlobalAlign16},
739                                       {S32, FlatPtr, 8, GlobalAlign8},
740                                       {V2S16, FlatPtr, 32, GlobalAlign32},
741 
742                                       {S32, ConstantPtr, 32, GlobalAlign32},
743                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
744                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
745                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
746                                       {S64, ConstantPtr, 64, GlobalAlign32},
747                                       {S128, ConstantPtr, 128, GlobalAlign32},
748                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
749     Actions
750         .customIf(typeIs(1, Constant32Ptr))
751         .narrowScalarIf(
752             [=](const LegalityQuery &Query) -> bool {
753               return !Query.Types[0].isVector() && needToSplitLoad(Query);
754             },
755             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
756               const LLT DstTy = Query.Types[0];
757               const LLT PtrTy = Query.Types[1];
758 
759               const unsigned DstSize = DstTy.getSizeInBits();
760               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
761 
762               // Split extloads.
763               if (DstSize > MemSize)
764                 return std::make_pair(0, LLT::scalar(MemSize));
765 
766               if (DstSize > 32 && (DstSize % 32 != 0)) {
767                 // FIXME: Need a way to specify non-extload of larger size if
768                 // suitably aligned.
769                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
770               }
771 
772               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
773               if (MemSize > MaxSize)
774                 return std::make_pair(0, LLT::scalar(MaxSize));
775 
776               unsigned Align = Query.MMODescrs[0].AlignInBits;
777               return std::make_pair(0, LLT::scalar(Align));
778             })
779         .fewerElementsIf(
780             [=](const LegalityQuery &Query) -> bool {
781               return Query.Types[0].isVector() && needToSplitLoad(Query);
782             },
783             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
784               const LLT DstTy = Query.Types[0];
785               const LLT PtrTy = Query.Types[1];
786 
787               LLT EltTy = DstTy.getElementType();
788               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
789 
790               // Split if it's too large for the address space.
791               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
792                 unsigned NumElts = DstTy.getNumElements();
793                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
794 
795                 // FIXME: Refine when odd breakdowns handled
796                 // The scalars will need to be re-legalized.
797                 if (NumPieces == 1 || NumPieces >= NumElts ||
798                     NumElts % NumPieces != 0)
799                   return std::make_pair(0, EltTy);
800 
801                 return std::make_pair(0,
802                                       LLT::vector(NumElts / NumPieces, EltTy));
803               }
804 
805               // Need to split because of alignment.
806               unsigned Align = Query.MMODescrs[0].AlignInBits;
807               unsigned EltSize = EltTy.getSizeInBits();
808               if (EltSize > Align &&
809                   (EltSize / Align < DstTy.getNumElements())) {
810                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
811               }
812 
813               // May need relegalization for the scalars.
814               return std::make_pair(0, EltTy);
815             })
816         .minScalar(0, S32);
817 
818     if (IsStore)
819       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
820 
821     // TODO: Need a bitcast lower option?
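    // 8 and 16-bit memory accesses are only legal as extending/truncating
    // accesses on a 32-bit register; 32/64/128/256/512-bit accesses are legal,
    // and 96-bit accesses only on subtargets with dwordx3 load/stores.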
822     Actions
823         .legalIf([=](const LegalityQuery &Query) {
824           const LLT Ty0 = Query.Types[0];
825           unsigned Size = Ty0.getSizeInBits();
826           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
827           unsigned Align = Query.MMODescrs[0].AlignInBits;
828 
829           // FIXME: Widening store from alignment not valid.
830           if (MemSize < Size)
831             MemSize = std::max(MemSize, Align);
832 
833           // No extending vector loads.
834           if (Size > MemSize && Ty0.isVector())
835             return false;
836 
837           switch (MemSize) {
838           case 8:
839           case 16:
840             return Size == 32;
841           case 32:
842           case 64:
843           case 128:
844             return true;
845           case 96:
846             return ST.hasDwordx3LoadStores();
847           case 256:
848           case 512:
849             return true;
850           default:
851             return false;
852           }
853         })
854         .widenScalarToNextPow2(0)
855         // TODO: v3s32->v4s32 with alignment
856         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
857   }
858 
859   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
860                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
861                                                   {S32, GlobalPtr, 16, 2 * 8},
862                                                   {S32, LocalPtr, 8, 8},
863                                                   {S32, LocalPtr, 16, 16},
864                                                   {S32, PrivatePtr, 8, 8},
865                                                   {S32, PrivatePtr, 16, 16},
866                                                   {S32, ConstantPtr, 8, 8},
867                                                   {S32, ConstantPtr, 16, 2 * 8}});
868   if (ST.hasFlatAddressSpace()) {
869     ExtLoads.legalForTypesWithMemDesc(
870         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
871   }
872 
873   ExtLoads.clampScalar(0, S32, S32)
874           .widenScalarToNextPow2(0)
875           .unsupportedIfMemSizeNotPow2()
876           .lower();
877 
878   auto &Atomics = getActionDefinitionsBuilder(
879     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
880      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
881      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
882      G_ATOMICRMW_UMIN})
883     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
884                {S64, GlobalPtr}, {S64, LocalPtr}});
885   if (ST.hasFlatAddressSpace()) {
886     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
887   }
888 
889   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
890     .legalFor({{S32, LocalPtr}});
891 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
894   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
895     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
896                 {S32, FlatPtr}, {S64, FlatPtr}})
897     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
898                {S32, RegionPtr}, {S64, RegionPtr}});
899 
900   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
901     .lower();
902 
903   // TODO: Pointer types, any 32-bit or 64-bit vector
904 
905   // Condition should be s32 for scalar, s1 for vector.
906   getActionDefinitionsBuilder(G_SELECT)
907     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
908           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
909           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
910     .clampScalar(0, S16, S64)
911     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
912     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
913     .scalarize(1)
914     .clampMaxNumElements(0, S32, 2)
915     .clampMaxNumElements(0, LocalPtr, 2)
916     .clampMaxNumElements(0, PrivatePtr, 2)
917     .scalarize(0)
918     .widenScalarToNextPow2(0)
919     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
920 
921   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
922   // be more flexible with the shift amount type.
923   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
924     .legalFor({{S32, S32}, {S64, S32}});
925   if (ST.has16BitInsts()) {
926     if (ST.hasVOP3PInsts()) {
927       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
928             .clampMaxNumElements(0, S16, 2);
929     } else
930       Shifts.legalFor({{S16, S32}, {S16, S16}});
931 
932     // TODO: Support 16-bit shift amounts
933     Shifts.clampScalar(1, S32, S32);
934     Shifts.clampScalar(0, S16, S64);
935     Shifts.widenScalarToNextPow2(0, 16);
936   } else {
937     // Make sure we legalize the shift amount type first, as the general
938     // expansion for the shifted type will produce much worse code if it hasn't
939     // been truncated already.
940     Shifts.clampScalar(1, S32, S32);
941     Shifts.clampScalar(0, S32, S64);
942     Shifts.widenScalarToNextPow2(0, 32);
943   }
944   Shifts.scalarize(0);
945 
946   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
947     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
948     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
949     unsigned IdxTypeIdx = 2;
950 
951     getActionDefinitionsBuilder(Op)
952       .customIf([=](const LegalityQuery &Query) {
953           const LLT EltTy = Query.Types[EltTypeIdx];
954           const LLT VecTy = Query.Types[VecTypeIdx];
955           const LLT IdxTy = Query.Types[IdxTypeIdx];
956           return (EltTy.getSizeInBits() == 16 ||
957                   EltTy.getSizeInBits() % 32 == 0) &&
958                  VecTy.getSizeInBits() % 32 == 0 &&
959                  VecTy.getSizeInBits() <= 1024 &&
960                  IdxTy.getSizeInBits() == 32;
961         })
962       .clampScalar(EltTypeIdx, S32, S64)
963       .clampScalar(VecTypeIdx, S32, S64)
964       .clampScalar(IdxTypeIdx, S32, S32);
965   }
966 
967   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
968     .unsupportedIf([=](const LegalityQuery &Query) {
969         const LLT &EltTy = Query.Types[1].getElementType();
970         return Query.Types[0] != EltTy;
971       });
972 
973   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
974     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
975     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
976 
977     // FIXME: Doesn't handle extract of illegal sizes.
978     getActionDefinitionsBuilder(Op)
979       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
980       // FIXME: Multiples of 16 should not be legal.
981       .legalIf([=](const LegalityQuery &Query) {
982           const LLT BigTy = Query.Types[BigTyIdx];
983           const LLT LitTy = Query.Types[LitTyIdx];
984           return (BigTy.getSizeInBits() % 32 == 0) &&
985                  (LitTy.getSizeInBits() % 16 == 0);
986         })
987       .widenScalarIf(
988         [=](const LegalityQuery &Query) {
989           const LLT BigTy = Query.Types[BigTyIdx];
990           return (BigTy.getScalarSizeInBits() < 16);
991         },
992         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
993       .widenScalarIf(
994         [=](const LegalityQuery &Query) {
995           const LLT LitTy = Query.Types[LitTyIdx];
996           return (LitTy.getScalarSizeInBits() < 16);
997         },
998         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
999       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1000       .widenScalarToNextPow2(BigTyIdx, 32);
1001 
1002   }
1003 
1004   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1005     .legalForCartesianProduct(AllS32Vectors, {S32})
1006     .legalForCartesianProduct(AllS64Vectors, {S64})
1007     .clampNumElements(0, V16S32, V32S32)
1008     .clampNumElements(0, V2S64, V16S64)
1009     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1010 
1011   if (ST.hasScalarPackInsts())
1012     BuildVector.legalFor({V2S16, S32});
1013 
1014   BuildVector
1015     .minScalarSameAs(1, 0)
1016     .legalIf(isRegisterType(0))
1017     .minScalarOrElt(0, S32);
1018 
1019   if (ST.hasScalarPackInsts()) {
1020     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1021       .legalFor({V2S16, S32})
1022       .lower();
1023   } else {
1024     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1025       .lower();
1026   }
1027 
1028   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1029     .legalIf(isRegisterType(0));
1030 
1031   // TODO: Don't fully scalarize v2s16 pieces
1032   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1033 
1034   // Merge/Unmerge
1035   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1036     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1037     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1038 
1039     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1040       const LLT &Ty = Query.Types[TypeIdx];
1041       if (Ty.isVector()) {
1042         const LLT &EltTy = Ty.getElementType();
1043         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1044           return true;
1045         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1046           return true;
1047       }
1048       return false;
1049     };
1050 
1051     auto &Builder = getActionDefinitionsBuilder(Op)
1052       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1056       .clampScalar(LitTyIdx, S16, S256)
1057       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1058       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1059       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1060                            elementTypeIs(1, S16)),
1061                        changeTo(1, V2S16))
1062       // Break up vectors with weird elements into scalars
1063       .fewerElementsIf(
1064         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1065         scalarize(0))
1066       .fewerElementsIf(
1067         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1068         scalarize(1))
1069       .clampScalar(BigTyIdx, S32, S1024)
1070       .lowerFor({{S16, V2S16}});
1071 
1072     if (Op == G_MERGE_VALUES) {
1073       Builder.widenScalarIf(
1074         // TODO: Use 16-bit shifts if legal for 8-bit values?
1075         [=](const LegalityQuery &Query) {
1076           const LLT Ty = Query.Types[LitTyIdx];
1077           return Ty.getSizeInBits() < 32;
1078         },
1079         changeTo(LitTyIdx, S32));
1080     }
1081 
1082     Builder.widenScalarIf(
1083       [=](const LegalityQuery &Query) {
1084         const LLT Ty = Query.Types[BigTyIdx];
1085         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1086           Ty.getSizeInBits() % 16 != 0;
1087       },
1088       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1091         const LLT &Ty = Query.Types[BigTyIdx];
1092         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1093         if (NewSizeInBits >= 256) {
1094           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1095           if (RoundedTo < NewSizeInBits)
1096             NewSizeInBits = RoundedTo;
1097         }
1098         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1099       })
1100       .legalIf([=](const LegalityQuery &Query) {
1101           const LLT &BigTy = Query.Types[BigTyIdx];
1102           const LLT &LitTy = Query.Types[LitTyIdx];
1103 
1104           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1105             return false;
1106           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1107             return false;
1108 
1109           return BigTy.getSizeInBits() % 16 == 0 &&
1110                  LitTy.getSizeInBits() % 16 == 0 &&
1111                  BigTy.getSizeInBits() <= 1024;
1112         })
1113       // Any vectors left are the wrong size. Scalarize them.
1114       .scalarize(0)
1115       .scalarize(1);
1116   }
1117 
1118   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1119 
1120   getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
1121 
1122   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1123     .legalFor({S64});
1124 
1125   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1126         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1127         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1128     .unsupported();
1129 
1130   computeTables();
1131   verify(*ST.getInstrInfo());
1132 }
1133 
1134 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1135                                          MachineRegisterInfo &MRI,
1136                                          MachineIRBuilder &B,
1137                                          GISelChangeObserver &Observer) const {
1138   switch (MI.getOpcode()) {
1139   case TargetOpcode::G_ADDRSPACE_CAST:
1140     return legalizeAddrSpaceCast(MI, MRI, B);
1141   case TargetOpcode::G_FRINT:
1142     return legalizeFrint(MI, MRI, B);
1143   case TargetOpcode::G_FCEIL:
1144     return legalizeFceil(MI, MRI, B);
1145   case TargetOpcode::G_INTRINSIC_TRUNC:
1146     return legalizeIntrinsicTrunc(MI, MRI, B);
1147   case TargetOpcode::G_SITOFP:
1148     return legalizeITOFP(MI, MRI, B, true);
1149   case TargetOpcode::G_UITOFP:
1150     return legalizeITOFP(MI, MRI, B, false);
1151   case TargetOpcode::G_FMINNUM:
1152   case TargetOpcode::G_FMAXNUM:
1153   case TargetOpcode::G_FMINNUM_IEEE:
1154   case TargetOpcode::G_FMAXNUM_IEEE:
1155     return legalizeMinNumMaxNum(MI, MRI, B);
1156   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1157     return legalizeExtractVectorElt(MI, MRI, B);
1158   case TargetOpcode::G_INSERT_VECTOR_ELT:
1159     return legalizeInsertVectorElt(MI, MRI, B);
1160   case TargetOpcode::G_FSIN:
1161   case TargetOpcode::G_FCOS:
1162     return legalizeSinCos(MI, MRI, B);
1163   case TargetOpcode::G_GLOBAL_VALUE:
1164     return legalizeGlobalValue(MI, MRI, B);
1165   case TargetOpcode::G_LOAD:
1166     return legalizeLoad(MI, MRI, B, Observer);
1167   case TargetOpcode::G_FMAD:
1168     return legalizeFMad(MI, MRI, B);
1169   case TargetOpcode::G_FDIV:
1170     return legalizeFDIV(MI, MRI, B);
1171   case TargetOpcode::G_ATOMIC_CMPXCHG:
1172     return legalizeAtomicCmpXChg(MI, MRI, B);
1173   default:
1174     return false;
1175   }
1176 
1177   llvm_unreachable("expected switch to return");
1178 }
1179 
1180 Register AMDGPULegalizerInfo::getSegmentAperture(
1181   unsigned AS,
1182   MachineRegisterInfo &MRI,
1183   MachineIRBuilder &B) const {
1184   MachineFunction &MF = B.getMF();
1185   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1186   const LLT S32 = LLT::scalar(32);
1187 
1188   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1189 
1190   if (ST.hasApertureRegs()) {
1191     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1192     // getreg.
1193     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1194         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1195         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1196     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1197         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1198         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
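    // Pack the hwreg id, bit offset and field width-1 into the s_getreg_b32
    // immediate so the aperture base field can be read from the MEM_BASES
    // hardware register.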
1199     unsigned Encoding =
1200         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1201         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1202         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1203 
1204     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1205     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1206 
1207     B.buildInstr(AMDGPU::S_GETREG_B32)
1208       .addDef(GetReg)
1209       .addImm(Encoding);
1210     MRI.setType(GetReg, S32);
1211 
1212     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1213     B.buildInstr(TargetOpcode::G_SHL)
1214       .addDef(ApertureReg)
1215       .addUse(GetReg)
1216       .addUse(ShiftAmt.getReg(0));
1217 
1218     return ApertureReg;
1219   }
1220 
1221   Register QueuePtr = MRI.createGenericVirtualRegister(
1222     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1223 
1224   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1225   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1226     return Register();
1227 
1228   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1229   // private_segment_aperture_base_hi.
1230   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1231 
1232   // TODO: can we be smarter about machine pointer info?
1233   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1234   MachineMemOperand *MMO = MF.getMachineMemOperand(
1235     PtrInfo,
1236     MachineMemOperand::MOLoad |
1237     MachineMemOperand::MODereferenceable |
1238     MachineMemOperand::MOInvariant,
1239     4,
1240     MinAlign(64, StructOffset));
1241 
1242   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1243   Register LoadAddr;
1244 
1245   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1246   B.buildLoad(LoadResult, LoadAddr, *MMO);
1247   return LoadResult;
1248 }
1249 
1250 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1251   MachineInstr &MI, MachineRegisterInfo &MRI,
1252   MachineIRBuilder &B) const {
1253   MachineFunction &MF = B.getMF();
1254 
1255   B.setInstr(MI);
1256 
1257   const LLT S32 = LLT::scalar(32);
1258   Register Dst = MI.getOperand(0).getReg();
1259   Register Src = MI.getOperand(1).getReg();
1260 
1261   LLT DstTy = MRI.getType(Dst);
1262   LLT SrcTy = MRI.getType(Src);
1263   unsigned DestAS = DstTy.getAddressSpace();
1264   unsigned SrcAS = SrcTy.getAddressSpace();
1265 
1266   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1267   // vector element.
1268   assert(!DstTy.isVector());
1269 
1270   const AMDGPUTargetMachine &TM
1271     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1272 
1273   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1274   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1275     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1276     return true;
1277   }
1278 
1279   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1280     // Truncate.
1281     B.buildExtract(Dst, Src, 0);
1282     MI.eraseFromParent();
1283     return true;
1284   }
1285 
1286   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1287     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1288     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1289 
    // FIXME: This is a bit ugly due to creating a merge of two pointers to
    // produce another pointer. Merge operands are required to be the same
    // type, but creating an extra ptrtoint would be kind of pointless.
1293     auto HighAddr = B.buildConstant(
1294       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1295     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1296     MI.eraseFromParent();
1297     return true;
1298   }
1299 
1300   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1301     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1302            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1303     unsigned NullVal = TM.getNullPointerValue(DestAS);
1304 
1305     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1306     auto FlatNull = B.buildConstant(SrcTy, 0);
1307 
1308     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1309 
1310     // Extract low 32-bits of the pointer.
1311     B.buildExtract(PtrLo32, Src, 0);
1312 
1313     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1314     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1315     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1316 
1317     MI.eraseFromParent();
1318     return true;
1319   }
1320 
1321   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1322     return false;
1323 
1324   if (!ST.hasFlatAddressSpace())
1325     return false;
1326 
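  // Cast a group/private pointer to a flat pointer by placing the 32-bit
  // segment offset in the low half and the aperture base in the high half,
  // mapping the segment null value to the flat null pointer.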
1327   auto SegmentNull =
1328       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1329   auto FlatNull =
1330       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1331 
1332   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1333   if (!ApertureReg.isValid())
1334     return false;
1335 
1336   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1337   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1338 
1339   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1340 
1341   // Coerce the type of the low half of the result so we can use merge_values.
1342   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1343   B.buildInstr(TargetOpcode::G_PTRTOINT)
1344     .addDef(SrcAsInt)
1345     .addUse(Src);
1346 
1347   // TODO: Should we allow mismatched types but matching sizes in merges to
1348   // avoid the ptrtoint?
1349   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1350   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1351 
1352   MI.eraseFromParent();
1353   return true;
1354 }
1355 
1356 bool AMDGPULegalizerInfo::legalizeFrint(
1357   MachineInstr &MI, MachineRegisterInfo &MRI,
1358   MachineIRBuilder &B) const {
1359   B.setInstr(MI);
1360 
1361   Register Src = MI.getOperand(1).getReg();
1362   LLT Ty = MRI.getType(Src);
1363   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1364 
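  // The standard rint trick: adding and then subtracting 2^52 (with the sign
  // of the source) rounds the value to an integer in the addition. Inputs with
  // a magnitude of 2^52 or more are already integral and are passed through by
  // the final select.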
1365   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1366   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1367 
1368   auto C1 = B.buildFConstant(Ty, C1Val);
1369   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1370 
1371   // TODO: Should this propagate fast-math-flags?
1372   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1373   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1374 
1375   auto C2 = B.buildFConstant(Ty, C2Val);
1376   auto Fabs = B.buildFAbs(Ty, Src);
1377 
1378   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1381 }
1382 
1383 bool AMDGPULegalizerInfo::legalizeFceil(
1384   MachineInstr &MI, MachineRegisterInfo &MRI,
1385   MachineIRBuilder &B) const {
1386   B.setInstr(MI);
1387 
1388   const LLT S1 = LLT::scalar(1);
1389   const LLT S64 = LLT::scalar(64);
1390 
1391   Register Src = MI.getOperand(1).getReg();
1392   assert(MRI.getType(Src) == S64);
1393 
1394   // result = trunc(src)
1395   // if (src > 0.0 && src != result)
1396   //   result += 1.0
1397 
1398   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1399 
1400   const auto Zero = B.buildFConstant(S64, 0.0);
1401   const auto One = B.buildFConstant(S64, 1.0);
1402   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1403   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1404   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1405   auto Add = B.buildSelect(S64, And, One, Zero);
1406 
1407   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1410 }
1411 
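// Extract the 11-bit biased exponent from the high 32 bits of an f64 value and
// subtract the exponent bias (1023).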
1412 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1413                                               MachineIRBuilder &B) {
1414   const unsigned FractBits = 52;
1415   const unsigned ExpBits = 11;
1416   LLT S32 = LLT::scalar(32);
1417 
1418   auto Const0 = B.buildConstant(S32, FractBits - 32);
1419   auto Const1 = B.buildConstant(S32, ExpBits);
1420 
  // Bit-field extract: llvm.amdgcn.ubfe(src, offset, width).
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1424 
1425   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1426 }
1427 
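// Lower f64 G_INTRINSIC_TRUNC by clearing the fraction bits that lie below the
// unbiased exponent. Exponents below zero truncate to a signed zero, and
// exponents above 51 mean the value is already an integer.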
1428 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1429   MachineInstr &MI, MachineRegisterInfo &MRI,
1430   MachineIRBuilder &B) const {
1431   B.setInstr(MI);
1432 
1433   const LLT S1 = LLT::scalar(1);
1434   const LLT S32 = LLT::scalar(32);
1435   const LLT S64 = LLT::scalar(64);
1436 
1437   Register Src = MI.getOperand(1).getReg();
1438   assert(MRI.getType(Src) == S64);
1439 
1440   // TODO: Should this use extract since the low half is unused?
1441   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1442   Register Hi = Unmerge.getReg(1);
1443 
1444   // Extract the upper half, since this is where we will find the sign and
1445   // exponent.
1446   auto Exp = extractF64Exponent(Hi, B);
1447 
1448   const unsigned FractBits = 52;
1449 
1450   // Extract the sign bit.
1451   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1452   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1453 
1454   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1455 
1456   const auto Zero32 = B.buildConstant(S32, 0);
1457 
1458   // Extend back to 64-bits.
1459   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1460 
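  // For an exponent e in [0, 51] the low (52 - e) bits of the significand are
  // fractional. Shifting the fraction mask right by e and inverting it yields
  // a mask that clears exactly those bits, i.e. truncates toward zero.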
1461   auto Shr = B.buildAShr(S64, FractMask, Exp);
1462   auto Not = B.buildNot(S64, Shr);
1463   auto Tmp0 = B.buildAnd(S64, Src, Not);
1464   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1465 
1466   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1467   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1468 
1469   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1472 }
1473 
1474 bool AMDGPULegalizerInfo::legalizeITOFP(
1475   MachineInstr &MI, MachineRegisterInfo &MRI,
1476   MachineIRBuilder &B, bool Signed) const {
1477   B.setInstr(MI);
1478 
1479   Register Dst = MI.getOperand(0).getReg();
1480   Register Src = MI.getOperand(1).getReg();
1481 
1482   const LLT S64 = LLT::scalar(64);
1483   const LLT S32 = LLT::scalar(32);
1484 
1485   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1486 
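  // Convert the two 32-bit halves separately: the high half with the
  // signedness of the original operation and the low half as unsigned, then
  // recombine as hi * 2^32 + lo, using ldexp for the scaling.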
1487   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1488 
1489   auto CvtHi = Signed ?
1490     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1491     B.buildUITOFP(S64, Unmerge.getReg(1));
1492 
1493   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1494 
1495   auto ThirtyTwo = B.buildConstant(S32, 32);
1496   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1497     .addUse(CvtHi.getReg(0))
1498     .addUse(ThirtyTwo.getReg(0));
1499 
1500   // TODO: Should this propagate fast-math-flags?
1501   B.buildFAdd(Dst, LdExp, CvtLo);
1502   MI.eraseFromParent();
1503   return true;
1504 }
1505 
1506 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1507   MachineInstr &MI, MachineRegisterInfo &MRI,
1508   MachineIRBuilder &B) const {
1509   MachineFunction &MF = B.getMF();
1510   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1511 
1512   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1513                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1514 
1515   // With ieee_mode disabled, the instructions have the correct behavior
1516   // already for G_FMINNUM/G_FMAXNUM
1517   if (!MFI->getMode().IEEE)
1518     return !IsIEEEOp;
1519 
1520   if (IsIEEEOp)
1521     return true;
1522 
1523   MachineIRBuilder HelperBuilder(MI);
1524   GISelObserverWrapper DummyObserver;
1525   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1526   HelperBuilder.setInstr(MI);
1527   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1528 }
1529 
1530 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1531   MachineInstr &MI, MachineRegisterInfo &MRI,
1532   MachineIRBuilder &B) const {
1533   // TODO: Should move some of this into LegalizerHelper.
1534 
1535   // TODO: Promote dynamic indexing of s16 to s32
1536   // TODO: Dynamic s64 indexing is only legal for SGPR.
1537   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1538   if (!IdxVal) // Dynamic case will be selected to register indexing.
1539     return true;
1540 
1541   Register Dst = MI.getOperand(0).getReg();
1542   Register Vec = MI.getOperand(1).getReg();
1543 
1544   LLT VecTy = MRI.getType(Vec);
1545   LLT EltTy = VecTy.getElementType();
1546   assert(EltTy == MRI.getType(Dst));
1547 
1548   B.setInstr(MI);
1549 
1550   if (IdxVal.getValue() < VecTy.getNumElements())
1551     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1552   else
1553     B.buildUndef(Dst);
1554 
1555   MI.eraseFromParent();
1556   return true;
1557 }
1558 
1559 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1560   MachineInstr &MI, MachineRegisterInfo &MRI,
1561   MachineIRBuilder &B) const {
1562   // TODO: Should move some of this into LegalizerHelper.
1563 
1564   // TODO: Promote dynamic indexing of s16 to s32
1565   // TODO: Dynamic s64 indexing is only legal for SGPR.
1566   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1567   if (!IdxVal) // Dynamic case will be selected to register indexing.
1568     return true;
1569 
1570   Register Dst = MI.getOperand(0).getReg();
1571   Register Vec = MI.getOperand(1).getReg();
1572   Register Ins = MI.getOperand(2).getReg();
1573 
1574   LLT VecTy = MRI.getType(Vec);
1575   LLT EltTy = VecTy.getElementType();
1576   assert(EltTy == MRI.getType(Ins));
1577 
1578   B.setInstr(MI);
1579 
1580   if (IdxVal.getValue() < VecTy.getNumElements())
1581     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1582   else
1583     B.buildUndef(Dst);
1584 
1585   MI.eraseFromParent();
1586   return true;
1587 }
1588 
1589 bool AMDGPULegalizerInfo::legalizeSinCos(
1590   MachineInstr &MI, MachineRegisterInfo &MRI,
1591   MachineIRBuilder &B) const {
1592   B.setInstr(MI);
1593 
1594   Register DstReg = MI.getOperand(0).getReg();
1595   Register SrcReg = MI.getOperand(1).getReg();
1596   LLT Ty = MRI.getType(DstReg);
1597   unsigned Flags = MI.getFlags();
1598 
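  // The hardware sin/cos instructions take their input in units of full
  // revolutions rather than radians, so scale by 1/(2*pi) first. On
  // subtargets with a reduced trig input range, also take the fractional
  // part so the operand stays within the range the instructions accept.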
1599   Register TrigVal;
1600   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1601   if (ST.hasTrigReducedRange()) {
1602     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1603     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1604       .addUse(MulVal.getReg(0))
1605       .setMIFlags(Flags).getReg(0);
1606   } else
1607     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1608 
1609   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1610     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1611   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1612     .addUse(TrigVal)
1613     .setMIFlags(Flags);
1614   MI.eraseFromParent();
1615   return true;
1616 }
1617 
1618 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1619   Register DstReg, LLT PtrTy,
1620   MachineIRBuilder &B, const GlobalValue *GV,
1621   unsigned Offset, unsigned GAFlags) const {
1622   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1623   // to the following code sequence:
1624   //
1625   // For constant address space:
1626   //   s_getpc_b64 s[0:1]
1627   //   s_add_u32 s0, s0, $symbol
1628   //   s_addc_u32 s1, s1, 0
1629   //
1630   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1631   //   a fixup or relocation is emitted to replace $symbol with a literal
1632   //   constant, which is a pc-relative offset from the encoding of the $symbol
1633   //   operand to the global variable.
1634   //
1635   // For global address space:
1636   //   s_getpc_b64 s[0:1]
1637   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1638   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1639   //
1640   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1641   //   fixups or relocations are emitted to replace $symbol@*@lo and
1642   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1643   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1644   //   operand to the global variable.
1645   //
1646   // What we want here is an offset from the value returned by s_getpc
1647   // (which is the address of the s_add_u32 instruction) to the global
1648   // variable, but since the encoding of $symbol starts 4 bytes after the start
1649   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1650   // small. This requires us to add 4 to the global variable offset in order to
1651   // compute the correct address.
1652 
1653   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1654 
1655   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1656     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1657 
1658   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1659     .addDef(PCReg);
1660 
1661   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1662   if (GAFlags == SIInstrInfo::MO_NONE)
1663     MIB.addImm(0);
1664   else
1665     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1666 
1667   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1668 
1669   if (PtrTy.getSizeInBits() == 32)
1670     B.buildExtract(DstReg, PCReg, 0);
1671   return true;
1672  }
1673 
1674 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1675   MachineInstr &MI, MachineRegisterInfo &MRI,
1676   MachineIRBuilder &B) const {
1677   Register DstReg = MI.getOperand(0).getReg();
1678   LLT Ty = MRI.getType(DstReg);
1679   unsigned AS = Ty.getAddressSpace();
1680 
1681   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1682   MachineFunction &MF = B.getMF();
1683   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1684   B.setInstr(MI);
1685 
1686   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1687     if (!MFI->isEntryFunction()) {
1688       const Function &Fn = MF.getFunction();
1689       DiagnosticInfoUnsupported BadLDSDecl(
1690         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1691       Fn.getContext().diagnose(BadLDSDecl);
1692     }
1693 
1694     // TODO: We could emit code to handle the initialization somewhere.
1695     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
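      // An LDS global's "address" is simply its constant offset within the
      // kernel's LDS allocation.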
1696       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1697       MI.eraseFromParent();
1698       return true;
1699     }
1700 
1701     const Function &Fn = MF.getFunction();
1702     DiagnosticInfoUnsupported BadInit(
1703       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1704     Fn.getContext().diagnose(BadInit);
1705     return true;
1706   }
1707 
1708   const SITargetLowering *TLI = ST.getTargetLowering();
1709 
1710   if (TLI->shouldEmitFixup(GV)) {
1711     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1712     MI.eraseFromParent();
1713     return true;
1714   }
1715 
1716   if (TLI->shouldEmitPCReloc(GV)) {
1717     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1718     MI.eraseFromParent();
1719     return true;
1720   }
1721 
1722   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1723   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1724 
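  // Otherwise go through the GOT: materialize the address of the GOT entry
  // pc-relatively, then load the 64-bit address of the global from it.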
1725   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1726     MachinePointerInfo::getGOT(MF),
1727     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1728     MachineMemOperand::MOInvariant,
1729     8 /*Size*/, 8 /*Align*/);
1730 
1731   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1732 
1733   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1735     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1736     B.buildExtract(DstReg, Load, 0);
1737   } else
1738     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1739 
1740   MI.eraseFromParent();
1741   return true;
1742 }
1743 
1744 bool AMDGPULegalizerInfo::legalizeLoad(
1745   MachineInstr &MI, MachineRegisterInfo &MRI,
1746   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1747   B.setInstr(MI);
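  // This custom action is presumably only reached for loads through a 32-bit
  // constant address space pointer; widen the pointer to the 64-bit constant
  // address space so the ordinary load handling applies.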
1748   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1749   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1750   Observer.changingInstr(MI);
1751   MI.getOperand(1).setReg(Cast.getReg(0));
1752   Observer.changedInstr(MI);
1753   return true;
1754 }
1755 
1756 bool AMDGPULegalizerInfo::legalizeFMad(
1757   MachineInstr &MI, MachineRegisterInfo &MRI,
1758   MachineIRBuilder &B) const {
1759   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1760   assert(Ty.isScalar());
1761 
1762   MachineFunction &MF = B.getMF();
1763   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1764 
1765   // TODO: Always legal with future ftz flag.
1766   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1767     return true;
1768   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

1772   MachineIRBuilder HelperBuilder(MI);
1773   GISelObserverWrapper DummyObserver;
1774   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1775   HelperBuilder.setMBB(*MI.getParent());
1776   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1777 }
1778 
1779 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1780   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1781   Register DstReg = MI.getOperand(0).getReg();
1782   Register PtrReg = MI.getOperand(1).getReg();
1783   Register CmpVal = MI.getOperand(2).getReg();
1784   Register NewVal = MI.getOperand(3).getReg();
1785 
1786   assert(SITargetLowering::isFlatGlobalAddrSpace(
1787            MRI.getType(PtrReg).getAddressSpace()) &&
1788          "this should not have been custom lowered");
1789 
1790   LLT ValTy = MRI.getType(CmpVal);
1791   LLT VecTy = LLT::vector(2, ValTy);
1792 
1793   B.setInstr(MI);
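  // The target cmpxchg pseudo takes the new value and the compare value
  // packed together as a single 2-element vector operand.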
1794   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1795 
1796   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1797     .addDef(DstReg)
1798     .addUse(PtrReg)
1799     .addUse(PackedVal)
1800     .setMemRefs(MI.memoperands());
1801 
1802   MI.eraseFromParent();
1803   return true;
1804 }
1805 
// Return the use branch instruction, or null if the usage is invalid.
1807 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1808                                        MachineRegisterInfo &MRI,
1809                                        MachineInstr *&Br) {
1810   Register CondDef = MI.getOperand(0).getReg();
1811   if (!MRI.hasOneNonDBGUse(CondDef))
1812     return nullptr;
1813 
1814   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1815   if (UseMI.getParent() != MI.getParent() ||
1816       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1817     return nullptr;
1818 
1819   // Make sure the cond br is followed by a G_BR
1820   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1821   if (Next != MI.getParent()->end()) {
1822     if (Next->getOpcode() != AMDGPU::G_BR)
1823       return nullptr;
1824     Br = &*Next;
1825   }
1826 
1827   return &UseMI;
1828 }
1829 
1830 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1831                                                 Register Reg, LLT Ty) const {
1832   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1833   if (LiveIn)
1834     return LiveIn;
1835 
1836   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1837   MRI.addLiveIn(Reg, NewReg);
1838   return NewReg;
1839 }
1840 
1841 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1842                                          const ArgDescriptor *Arg) const {
1843   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1844     return false; // TODO: Handle these
1845 
1846   assert(Arg->getRegister().isPhysical());
1847 
1848   MachineRegisterInfo &MRI = *B.getMRI();
1849 
1850   LLT Ty = MRI.getType(DstReg);
1851   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1852 
1853   if (Arg->isMasked()) {
1854     // TODO: Should we try to emit this once in the entry block?
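    // The value is packed into a wider 32-bit register (for example, the
    // packed work-item IDs share one register), so shift the field down and
    // mask off the rest. This assumes the mask covers a contiguous bit range.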
1855     const LLT S32 = LLT::scalar(32);
1856     const unsigned Mask = Arg->getMask();
1857     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1858 
1859     Register AndMaskSrc = LiveIn;
1860 
1861     if (Shift != 0) {
1862       auto ShiftAmt = B.buildConstant(S32, Shift);
1863       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1864     }
1865 
1866     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1867   } else
1868     B.buildCopy(DstReg, LiveIn);
1869 
  // Insert the argument copy if it doesn't already exist.
1871   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1872   if (!MRI.getVRegDef(LiveIn)) {
1873     // FIXME: Should have scoped insert pt
1874     MachineBasicBlock &OrigInsBB = B.getMBB();
1875     auto OrigInsPt = B.getInsertPt();
1876 
1877     MachineBasicBlock &EntryMBB = B.getMF().front();
1878     EntryMBB.addLiveIn(Arg->getRegister());
1879     B.setInsertPt(EntryMBB, EntryMBB.begin());
1880     B.buildCopy(LiveIn, Arg->getRegister());
1881 
1882     B.setInsertPt(OrigInsBB, OrigInsPt);
1883   }
1884 
1885   return true;
1886 }
1887 
1888 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1889   MachineInstr &MI,
1890   MachineRegisterInfo &MRI,
1891   MachineIRBuilder &B,
1892   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1893   B.setInstr(MI);
1894 
1895   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1896 
1897   const ArgDescriptor *Arg;
1898   const TargetRegisterClass *RC;
1899   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1900   if (!Arg) {
1901     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1902     return false;
1903   }
1904 
1905   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1906     MI.eraseFromParent();
1907     return true;
1908   }
1909 
1910   return false;
1911 }
1912 
1913 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1914                                        MachineRegisterInfo &MRI,
1915                                        MachineIRBuilder &B) const {
1916   B.setInstr(MI);
1917   Register Dst = MI.getOperand(0).getReg();
1918   LLT DstTy = MRI.getType(Dst);
1919   LLT S16 = LLT::scalar(16);
1920   LLT S32 = LLT::scalar(32);
1921   LLT S64 = LLT::scalar(64);
1922 
1923   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1924     return true;
1925 
1926   if (DstTy == S16)
1927     return legalizeFDIV16(MI, MRI, B);
1928   if (DstTy == S32)
1929     return legalizeFDIV32(MI, MRI, B);
1930   if (DstTy == S64)
1931     return legalizeFDIV64(MI, MRI, B);
1932 
1933   return false;
1934 }
1935 
1936 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1937                                                  MachineRegisterInfo &MRI,
1938                                                  MachineIRBuilder &B) const {
1939   Register Res = MI.getOperand(0).getReg();
1940   Register LHS = MI.getOperand(1).getReg();
1941   Register RHS = MI.getOperand(2).getReg();
1942 
1943   uint16_t Flags = MI.getFlags();
1944 
1945   LLT ResTy = MRI.getType(Res);
1946   LLT S32 = LLT::scalar(32);
1947   LLT S64 = LLT::scalar(64);
1948 
1949   const MachineFunction &MF = B.getMF();
1950   bool Unsafe =
1951     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1952 
1953   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1954     return false;
1955 
1956   if (!Unsafe && ResTy == S32 &&
1957       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1958     return false;
1959 
1960   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1961     // 1 / x -> RCP(x)
1962     if (CLHS->isExactlyValue(1.0)) {
1963       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1964         .addUse(RHS)
1965         .setMIFlags(Flags);
1966 
1967       MI.eraseFromParent();
1968       return true;
1969     }
1970 
1971     // -1 / x -> RCP( FNEG(x) )
1972     if (CLHS->isExactlyValue(-1.0)) {
1973       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1974       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1975         .addUse(FNeg.getReg(0))
1976         .setMIFlags(Flags);
1977 
1978       MI.eraseFromParent();
1979       return true;
1980     }
1981   }
1982 
1983   // x / y -> x * (1.0 / y)
1984   if (Unsafe) {
1985     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1986       .addUse(RHS)
1987       .setMIFlags(Flags);
1988     B.buildFMul(Res, LHS, RCP, Flags);
1989 
1990     MI.eraseFromParent();
1991     return true;
1992   }
1993 
1994   return false;
1995 }
1996 
1997 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1998                                          MachineRegisterInfo &MRI,
1999                                          MachineIRBuilder &B) const {
2000   B.setInstr(MI);
2001   Register Res = MI.getOperand(0).getReg();
2002   Register LHS = MI.getOperand(1).getReg();
2003   Register RHS = MI.getOperand(2).getReg();
2004 
2005   uint16_t Flags = MI.getFlags();
2006 
2007   LLT S16 = LLT::scalar(16);
2008   LLT S32 = LLT::scalar(32);
2009 
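  // Do the division in f32 for extra precision: extend both operands,
  // multiply the numerator by the f32 reciprocal of the denominator, truncate
  // back to f16, and let amdgcn_div_fixup produce the final result while
  // handling the special-case inputs (zeros, infinities, NaNs).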
2010   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2011   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2012 
2013   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2014     .addUse(RHSExt.getReg(0))
2015     .setMIFlags(Flags);
2016 
2017   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2018   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2019 
2020   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2021     .addUse(RDst.getReg(0))
2022     .addUse(RHS)
2023     .addUse(LHS)
2024     .setMIFlags(Flags);
2025 
2026   MI.eraseFromParent();
2027   return true;
2028 }
2029 
2030 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2031 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2032 static void toggleSPDenormMode(bool Enable,
2033                                MachineIRBuilder &B,
2034                                const GCNSubtarget &ST,
2035                                AMDGPU::SIModeRegisterDefaults Mode) {
2036   // Set SP denorm mode to this value.
2037   unsigned SPDenormMode =
2038     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2039 
2040   if (ST.hasDenormModeInst()) {
2041     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2042     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2043                                    ? FP_DENORM_FLUSH_NONE
2044                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2045 
2046     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2047     B.buildInstr(AMDGPU::S_DENORM_MODE)
2048       .addImm(NewDenormModeValue);
2049 
2050   } else {
2051     // Select FP32 bit field in mode register.
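    // The immediate encodes hwreg(HW_REG_MODE, offset = 4, width = 2), i.e.
    // the two FP32 denorm-mode bits of the MODE register.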
2052     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2053                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2054                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2055 
2056     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2057       .addImm(SPDenormMode)
2058       .addImm(SPDenormModeBitField);
2059   }
2060 }
2061 
2062 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2063                                          MachineRegisterInfo &MRI,
2064                                          MachineIRBuilder &B) const {
2065   B.setInstr(MI);
2066   Register Res = MI.getOperand(0).getReg();
2067   Register LHS = MI.getOperand(1).getReg();
2068   Register RHS = MI.getOperand(2).getReg();
2069   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2070   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2071 
2072   uint16_t Flags = MI.getFlags();
2073 
2074   LLT S32 = LLT::scalar(32);
2075   LLT S1 = LLT::scalar(1);
2076 
2077   auto One = B.buildFConstant(S32, 1.0f);
2078 
2079   auto DenominatorScaled =
2080     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2081       .addUse(RHS)
2082       .addUse(LHS)
2083       .addImm(1)
2084       .setMIFlags(Flags);
2085   auto NumeratorScaled =
2086     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2087       .addUse(LHS)
2088       .addUse(RHS)
2089       .addImm(0)
2090       .setMIFlags(Flags);
2091 
2092   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2093     .addUse(DenominatorScaled.getReg(0))
2094     .setMIFlags(Flags);
2095   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2096 
2097   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2098   // aren't modeled as reading it.
2099   if (!Mode.FP32Denormals)
2100     toggleSPDenormMode(true, B, ST, Mode);
2101 
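  // Newton-Raphson refinement of 1/denominator followed by a quotient
  // correction step, all on the div_scale'd operands:
  //   Fma0 = 1 - den * rcp        (error in the initial rcp estimate)
  //   Fma1 = rcp + rcp * Fma0     (refined reciprocal)
  //   Mul  = num * Fma1           (initial quotient)
  //   Fma2 = num - den * Mul      (quotient residual)
  //   Fma3 = Mul + Fma1 * Fma2    (refined quotient)
  //   Fma4 = num - den * Fma3     (residual passed to div_fmas)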
2102   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2103   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2104   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2105   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2106   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2107   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2108 
2109   if (!Mode.FP32Denormals)
2110     toggleSPDenormMode(false, B, ST, Mode);
2111 
2112   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2113     .addUse(Fma4.getReg(0))
2114     .addUse(Fma1.getReg(0))
2115     .addUse(Fma3.getReg(0))
2116     .addUse(NumeratorScaled.getReg(1))
2117     .setMIFlags(Flags);
2118 
2119   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2120     .addUse(Fmas.getReg(0))
2121     .addUse(RHS)
2122     .addUse(LHS)
2123     .setMIFlags(Flags);
2124 
2125   MI.eraseFromParent();
2126   return true;
2127 }
2128 
2129 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2130                                          MachineRegisterInfo &MRI,
2131                                          MachineIRBuilder &B) const {
2132   B.setInstr(MI);
2133   Register Res = MI.getOperand(0).getReg();
2134   Register LHS = MI.getOperand(1).getReg();
2135   Register RHS = MI.getOperand(2).getReg();
2136 
2137   uint16_t Flags = MI.getFlags();
2138 
2139   LLT S64 = LLT::scalar(64);
2140   LLT S1 = LLT::scalar(1);
2141 
2142   auto One = B.buildFConstant(S64, 1.0);
2143 
2144   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2145     .addUse(LHS)
2146     .addUse(RHS)
2147     .addImm(1)
2148     .setMIFlags(Flags);
2149 
2150   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2151 
2152   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2153     .addUse(DivScale0.getReg(0))
2154     .setMIFlags(Flags);
2155 
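  // Same Newton-Raphson scheme as the f32 case, but with two reciprocal
  // refinement steps (Fma0 through Fma3) before forming the quotient (Mul)
  // and its correction term (Fma4).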
2156   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2157   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2158   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2159 
2160   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2161     .addUse(LHS)
2162     .addUse(RHS)
2163     .addImm(0)
2164     .setMIFlags(Flags);
2165 
2166   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2168   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2169 
2170   Register Scale;
2171   if (!ST.hasUsableDivScaleConditionOutput()) {
2172     // Workaround a hardware bug on SI where the condition output from div_scale
2173     // is not usable.
2174 
2175     Scale = MRI.createGenericVirtualRegister(S1);
2176 
2177     LLT S32 = LLT::scalar(32);
2178 
2179     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2180     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2181     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2182     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2183 
2184     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2185                               Scale1Unmerge.getReg(1));
2186     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2187                               Scale0Unmerge.getReg(1));
2188     B.buildXor(Scale, CmpNum, CmpDen);
2189   } else {
2190     Scale = DivScale1.getReg(1);
2191   }
2192 
2193   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2194     .addUse(Fma4.getReg(0))
2195     .addUse(Fma3.getReg(0))
2196     .addUse(Mul.getReg(0))
2197     .addUse(Scale)
2198     .setMIFlags(Flags);
2199 
2200   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2201     .addUse(Fmas.getReg(0))
2202     .addUse(RHS)
2203     .addUse(LHS)
2204     .setMIFlags(Flags);
2205 
2206   MI.eraseFromParent();
2207   return true;
2208 }
2209 
2210 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2211                                                  MachineRegisterInfo &MRI,
2212                                                  MachineIRBuilder &B) const {
2213   B.setInstr(MI);
2214   Register Res = MI.getOperand(0).getReg();
2215   Register LHS = MI.getOperand(2).getReg();
2216   Register RHS = MI.getOperand(3).getReg();
2217   uint16_t Flags = MI.getFlags();
2218 
2219   LLT S32 = LLT::scalar(32);
2220   LLT S1 = LLT::scalar(1);
2221 
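  // If |denominator| is huge (> 2^96, i.e. 0x6f800000), prescale it by 2^-32
  // (0x2f800000) so its reciprocal doesn't underflow to a denormal that may
  // be flushed, and multiply the final quotient by the same scale factor to
  // compensate; otherwise the scale is 1.0 and this is just x * rcp(y).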
2222   auto Abs = B.buildFAbs(S32, RHS, Flags);
2223   const APFloat C0Val(1.0f);
2224 
2225   auto C0 = B.buildConstant(S32, 0x6f800000);
2226   auto C1 = B.buildConstant(S32, 0x2f800000);
2227   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2228 
2229   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2230   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2231 
2232   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2233 
2234   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2235     .addUse(Mul0.getReg(0))
2236     .setMIFlags(Flags);
2237 
2238   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2239 
2240   B.buildFMul(Res, Sel, Mul1, Flags);
2241 
2242   MI.eraseFromParent();
2243   return true;
2244 }
2245 
2246 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2247                                                  MachineRegisterInfo &MRI,
2248                                                  MachineIRBuilder &B) const {
2249   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2250   if (!MFI->isEntryFunction()) {
2251     return legalizePreloadedArgIntrin(MI, MRI, B,
2252                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2253   }
2254 
2255   B.setInstr(MI);
2256 
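  // In a kernel the implicit arguments are laid out directly after the
  // explicit kernel arguments, so the implicit argument pointer is the
  // kernarg segment pointer plus a fixed offset.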
2257   uint64_t Offset =
2258     ST.getTargetLowering()->getImplicitParameterOffset(
2259       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2260   Register DstReg = MI.getOperand(0).getReg();
2261   LLT DstTy = MRI.getType(DstReg);
2262   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2263 
2264   const ArgDescriptor *Arg;
2265   const TargetRegisterClass *RC;
2266   std::tie(Arg, RC)
2267     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2268   if (!Arg)
2269     return false;
2270 
2271   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2272   if (!loadInputValue(KernargPtrReg, B, Arg))
2273     return false;
2274 
  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
2276   MI.eraseFromParent();
2277   return true;
2278 }
2279 
2280 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2281                                               MachineRegisterInfo &MRI,
2282                                               MachineIRBuilder &B,
2283                                               unsigned AddrSpace) const {
2284   B.setInstr(MI);
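  // A flat pointer points into the queried segment exactly when its high 32
  // bits equal that segment's aperture base, so compare the high half of the
  // pointer against the aperture register.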
2285   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2286   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2287   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2288   MI.eraseFromParent();
2289   return true;
2290 }
2291 
2292 /// Handle register layout difference for f16 images for some subtargets.
2293 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2294                                              MachineRegisterInfo &MRI,
2295                                              Register Reg) const {
2296   if (!ST.hasUnpackedD16VMem())
2297     return Reg;
2298 
2299   const LLT S16 = LLT::scalar(16);
2300   const LLT S32 = LLT::scalar(32);
2301   LLT StoreVT = MRI.getType(Reg);
2302   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2303 
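  // On subtargets with unpacked D16 memory instructions each 16-bit element
  // occupies the low half of its own 32-bit register, so widen every element
  // individually.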
2304   auto Unmerge = B.buildUnmerge(S16, Reg);
2305 
2306   SmallVector<Register, 4> WideRegs;
2307   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2308     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2309 
2310   int NumElts = StoreVT.getNumElements();
2311 
2312   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2313 }
2314 
2315 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2316                                                  MachineRegisterInfo &MRI,
2317                                                  MachineIRBuilder &B,
2318                                                  bool IsFormat) const {
2319   // TODO: Reject f16 format on targets where unsupported.
2320   Register VData = MI.getOperand(1).getReg();
2321   LLT Ty = MRI.getType(VData);
2322 
2323   B.setInstr(MI);
2324 
2325   const LLT S32 = LLT::scalar(32);
2326   const LLT S16 = LLT::scalar(16);
2327 
2328   // Fixup illegal register types for i8 stores.
2329   if (Ty == LLT::scalar(8) || Ty == S16) {
2330     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2331     MI.getOperand(1).setReg(AnyExt);
2332     return true;
2333   }
2334 
2335   if (Ty.isVector()) {
2336     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2337       if (IsFormat)
2338         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2339       return true;
2340     }
2341 
2342     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2343   }
2344 
2345   return Ty == S32;
2346 }
2347 
2348 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2349                                             MachineRegisterInfo &MRI,
2350                                             MachineIRBuilder &B) const {
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
2352   auto IntrID = MI.getIntrinsicID();
2353   switch (IntrID) {
2354   case Intrinsic::amdgcn_if:
2355   case Intrinsic::amdgcn_else: {
2356     MachineInstr *Br = nullptr;
2357     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2358       const SIRegisterInfo *TRI
2359         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2360 
2361       B.setInstr(*BrCond);
2362       Register Def = MI.getOperand(1).getReg();
2363       Register Use = MI.getOperand(3).getReg();
2364 
2365       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2366       if (Br)
2367         BrTarget = Br->getOperand(0).getMBB();
2368 
2369       if (IntrID == Intrinsic::amdgcn_if) {
2370         B.buildInstr(AMDGPU::SI_IF)
2371           .addDef(Def)
2372           .addUse(Use)
2373           .addMBB(BrTarget);
2374       } else {
2375         B.buildInstr(AMDGPU::SI_ELSE)
2376           .addDef(Def)
2377           .addUse(Use)
2378           .addMBB(BrTarget)
2379           .addImm(0);
2380       }
2381 
2382       if (Br)
2383         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
2384 
2385       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2386       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2387       MI.eraseFromParent();
2388       BrCond->eraseFromParent();
2389       return true;
2390     }
2391 
2392     return false;
2393   }
2394   case Intrinsic::amdgcn_loop: {
2395     MachineInstr *Br = nullptr;
2396     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2397       const SIRegisterInfo *TRI
2398         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2399 
2400       B.setInstr(*BrCond);
2401 
2402       // FIXME: Need to adjust branch targets based on unconditional branch.
2403       Register Reg = MI.getOperand(2).getReg();
2404       B.buildInstr(AMDGPU::SI_LOOP)
2405         .addUse(Reg)
2406         .addMBB(BrCond->getOperand(1).getMBB());
2407       MI.eraseFromParent();
2408       BrCond->eraseFromParent();
2409       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2410       return true;
2411     }
2412 
2413     return false;
2414   }
2415   case Intrinsic::amdgcn_kernarg_segment_ptr:
2416     return legalizePreloadedArgIntrin(
2417       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2418   case Intrinsic::amdgcn_implicitarg_ptr:
2419     return legalizeImplicitArgPtr(MI, MRI, B);
2420   case Intrinsic::amdgcn_workitem_id_x:
2421     return legalizePreloadedArgIntrin(MI, MRI, B,
2422                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2423   case Intrinsic::amdgcn_workitem_id_y:
2424     return legalizePreloadedArgIntrin(MI, MRI, B,
2425                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2426   case Intrinsic::amdgcn_workitem_id_z:
2427     return legalizePreloadedArgIntrin(MI, MRI, B,
2428                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2429   case Intrinsic::amdgcn_workgroup_id_x:
2430     return legalizePreloadedArgIntrin(MI, MRI, B,
2431                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2432   case Intrinsic::amdgcn_workgroup_id_y:
2433     return legalizePreloadedArgIntrin(MI, MRI, B,
2434                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2435   case Intrinsic::amdgcn_workgroup_id_z:
2436     return legalizePreloadedArgIntrin(MI, MRI, B,
2437                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2438   case Intrinsic::amdgcn_dispatch_ptr:
2439     return legalizePreloadedArgIntrin(MI, MRI, B,
2440                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2441   case Intrinsic::amdgcn_queue_ptr:
2442     return legalizePreloadedArgIntrin(MI, MRI, B,
2443                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2444   case Intrinsic::amdgcn_implicit_buffer_ptr:
2445     return legalizePreloadedArgIntrin(
2446       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2447   case Intrinsic::amdgcn_dispatch_id:
2448     return legalizePreloadedArgIntrin(MI, MRI, B,
2449                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2450   case Intrinsic::amdgcn_fdiv_fast:
2451     return legalizeFDIVFastIntrin(MI, MRI, B);
2452   case Intrinsic::amdgcn_is_shared:
2453     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2454   case Intrinsic::amdgcn_is_private:
2455     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2456   case Intrinsic::amdgcn_wavefrontsize: {
2457     B.setInstr(MI);
2458     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2459     MI.eraseFromParent();
2460     return true;
2461   }
2462   case Intrinsic::amdgcn_raw_buffer_store:
2463     return legalizeRawBufferStore(MI, MRI, B, false);
2464   case Intrinsic::amdgcn_raw_buffer_store_format:
2465     return legalizeRawBufferStore(MI, MRI, B, true);
2466   default:
2467     return true;
2468   }
2469 
2470   return true;
2471 }
2472