1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements to the next power of two.
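// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.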
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55 
// Round the number of bits to the next power of two.
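// e.g. s17 -> s32, s65 -> s128.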
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::pair(TypeIdx,
100                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101   };
102 }
103 
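// Break a wide vector into pieces of at most 64 bits each,
// e.g. <4 x s32> (128 bits) is split into <2 x s32> halves.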
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105   return [=](const LegalityQuery &Query) {
106     const LLT Ty = Query.Types[TypeIdx];
107     const LLT EltTy = Ty.getElementType();
108     unsigned Size = Ty.getSizeInBits();
109     unsigned Pieces = (Size + 63) / 64;
110     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111     return std::pair(TypeIdx, LLT::scalarOrVector(
112                                   ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
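// e.g. <3 x s16> (48 bits) becomes <4 x s16> (64 bits) and <5 x s8> (40 bits)
// becomes <8 x s8> (64 bits).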
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
137   if (Size <= 32) {
138     // <2 x s8> -> s16
139     // <4 x s8> -> s32
140     return LLT::scalar(Size);
141   }
142 
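  // <6 x s16> -> <3 x s32>
  // <4 x s64> -> <8 x s32>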
143   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
144 }
145 
146 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147   return [=](const LegalityQuery &Query) {
148     const LLT Ty = Query.Types[TypeIdx];
149     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
150   };
151 }
152 
153 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     unsigned Size = Ty.getSizeInBits();
157     assert(Size % 32 == 0);
158     return std::pair(
159         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
160   };
161 }
162 
163 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
164   return [=](const LegalityQuery &Query) {
165     const LLT QueryTy = Query.Types[TypeIdx];
166     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
167   };
168 }
169 
170 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
171   return [=](const LegalityQuery &Query) {
172     const LLT QueryTy = Query.Types[TypeIdx];
173     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
174   };
175 }
176 
177 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
178   return [=](const LegalityQuery &Query) {
179     const LLT QueryTy = Query.Types[TypeIdx];
180     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
181   };
182 }
183 
184 static bool isRegisterSize(unsigned Size) {
185   return Size % 32 == 0 && Size <= MaxRegisterSize;
186 }
187 
188 static bool isRegisterVectorElementType(LLT EltTy) {
189   const int EltSize = EltTy.getSizeInBits();
190   return EltSize == 16 || EltSize % 32 == 0;
191 }
192 
193 static bool isRegisterVectorType(LLT Ty) {
194   const int EltSize = Ty.getElementType().getSizeInBits();
195   return EltSize == 32 || EltSize == 64 ||
196          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
197          EltSize == 128 || EltSize == 256;
198 }
199 
200 static bool isRegisterType(LLT Ty) {
201   if (!isRegisterSize(Ty.getSizeInBits()))
202     return false;
203 
204   if (Ty.isVector())
205     return isRegisterVectorType(Ty);
206 
207   return true;
208 }
209 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
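// e.g. s32, s64, v2s16, v4s16 and v3s32 are register types, while v3s16
// (48 bits) and v2s8 (16 bits) are not.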
212 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
213   return [=](const LegalityQuery &Query) {
214     return isRegisterType(Query.Types[TypeIdx]);
215   };
216 }
217 
218 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
219   return [=](const LegalityQuery &Query) {
220     const LLT QueryTy = Query.Types[TypeIdx];
221     if (!QueryTy.isVector())
222       return false;
223     const LLT EltTy = QueryTy.getElementType();
224     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
225   };
226 }
227 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
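// e.g. an extending load from s16 memory into an s64 result, or a truncating
// store of an s64 value to s32 memory.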
230 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
231   return [=](const LegalityQuery &Query) {
232     const LLT Ty = Query.Types[TypeIdx];
233     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
234            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
235   };
236 }
237 
238 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
239 // handle some operations by just promoting the register during
240 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
241 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
242                                     bool IsLoad) {
243   switch (AS) {
244   case AMDGPUAS::PRIVATE_ADDRESS:
245     // FIXME: Private element size.
246     return ST.enableFlatScratch() ? 128 : 32;
247   case AMDGPUAS::LOCAL_ADDRESS:
248     return ST.useDS128() ? 128 : 64;
249   case AMDGPUAS::GLOBAL_ADDRESS:
250   case AMDGPUAS::CONSTANT_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
258     return IsLoad ? 512 : 128;
259   default:
260     // Flat addresses may contextually need to be split to 32-bit parts if they
261     // may alias scratch depending on the subtarget.
262     return 128;
263   }
264 }
265 
266 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
267                                  const LegalityQuery &Query) {
268   const LLT Ty = Query.Types[0];
269 
270   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
271   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
272 
273   unsigned RegSize = Ty.getSizeInBits();
274   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
275   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
276   unsigned AS = Query.Types[1].getAddressSpace();
277 
278   // All of these need to be custom lowered to cast the pointer operand.
279   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
280     return false;
281 
282   // Do not handle extending vector loads.
283   if (Ty.isVector() && MemSize != RegSize)
284     return false;
285 
286   // TODO: We should be able to widen loads if the alignment is high enough, but
287   // we also need to modify the memory access size.
288 #if 0
289   // Accept widening loads based on alignment.
290   if (IsLoad && MemSize < Size)
291     MemSize = std::max(MemSize, Align);
292 #endif
293 
294   // Only 1-byte and 2-byte to 32-bit extloads are valid.
295   if (MemSize != RegSize && RegSize != 32)
296     return false;
297 
298   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
299     return false;
300 
301   switch (MemSize) {
302   case 8:
303   case 16:
304   case 32:
305   case 64:
306   case 128:
307     break;
308   case 96:
309     if (!ST.hasDwordx3LoadStores())
310       return false;
311     break;
312   case 256:
313   case 512:
314     // These may contextually need to be broken down.
315     break;
316   default:
317     return false;
318   }
319 
320   assert(RegSize >= MemSize);
321 
322   if (AlignBits < MemSize) {
323     const SITargetLowering *TLI = ST.getTargetLowering();
324     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
325                                                  Align(AlignBits / 8)))
326       return false;
327   }
328 
329   return true;
330 }
331 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
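// For example, an <8 x s16> load is currently handled by bitcasting it to
// <4 x s32> rather than selecting it directly.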
336 static bool loadStoreBitcastWorkaround(const LLT Ty) {
337   if (EnableNewLegality)
338     return false;
339 
340   const unsigned Size = Ty.getSizeInBits();
341   if (Size <= 64)
342     return false;
343   if (!Ty.isVector())
344     return true;
345 
346   LLT EltTy = Ty.getElementType();
347   if (EltTy.isPointer())
348     return true;
349 
350   unsigned EltSize = EltTy.getSizeInBits();
351   return EltSize != 32 && EltSize != 64;
352 }
353 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
355   const LLT Ty = Query.Types[0];
356   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
357          !loadStoreBitcastWorkaround(Ty);
358 }
359 
360 /// Return true if a load or store of the type should be lowered with a bitcast
361 /// to a different type.
362 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
363                                        const LLT MemTy) {
364   const unsigned MemSizeInBits = MemTy.getSizeInBits();
365   const unsigned Size = Ty.getSizeInBits();
366   if (Size != MemSizeInBits)
367     return Size <= 32 && Ty.isVector();
368 
369   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
370     return true;
371 
372   // Don't try to handle bitcasting vector ext loads for now.
373   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
374          (Size <= 32 || isRegisterSize(Size)) &&
375          !isRegisterVectorElementType(Ty.getElementType());
376 }
377 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
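/// For example, a 96-bit access with 128-bit alignment may be widened to a
/// 128-bit access on subtargets without 96-bit (dwordx3) memory instructions.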
381 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
382                             uint64_t AlignInBits, unsigned AddrSpace,
383                             unsigned Opcode) {
384   unsigned SizeInBits = MemoryTy.getSizeInBits();
385   // We don't want to widen cases that are naturally legal.
386   if (isPowerOf2_32(SizeInBits))
387     return false;
388 
389   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
390   // end up widening these for a scalar load during RegBankSelect, since there
391   // aren't 96-bit scalar loads.
392   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
393     return false;
394 
395   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
396     return false;
397 
398   // A load is known dereferenceable up to the alignment, so it's legal to widen
399   // to it.
400   //
401   // TODO: Could check dereferenceable for less aligned cases.
402   unsigned RoundedSize = NextPowerOf2(SizeInBits);
403   if (AlignInBits < RoundedSize)
404     return false;
405 
406   // Do not widen if it would introduce a slow unaligned load.
407   const SITargetLowering *TLI = ST.getTargetLowering();
408   unsigned Fast = 0;
409   return TLI->allowsMisalignedMemoryAccessesImpl(
410              RoundedSize, AddrSpace, Align(AlignInBits / 8),
411              MachineMemOperand::MOLoad, &Fast) &&
412          Fast;
413 }
414 
415 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
416                             unsigned Opcode) {
417   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
418     return false;
419 
420   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
421                          Query.MMODescrs[0].AlignInBits,
422                          Query.Types[1].getAddressSpace(), Opcode);
423 }
424 
425 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
426                                          const GCNTargetMachine &TM)
  : ST(ST_) {
428   using namespace TargetOpcode;
429 
430   auto GetAddrSpacePtr = [&TM](unsigned AS) {
431     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
432   };
433 
434   const LLT S1 = LLT::scalar(1);
435   const LLT S8 = LLT::scalar(8);
436   const LLT S16 = LLT::scalar(16);
437   const LLT S32 = LLT::scalar(32);
438   const LLT S64 = LLT::scalar(64);
439   const LLT S128 = LLT::scalar(128);
440   const LLT S256 = LLT::scalar(256);
441   const LLT S512 = LLT::scalar(512);
442   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
443 
444   const LLT V2S8 = LLT::fixed_vector(2, 8);
445   const LLT V2S16 = LLT::fixed_vector(2, 16);
446   const LLT V4S16 = LLT::fixed_vector(4, 16);
447 
448   const LLT V2S32 = LLT::fixed_vector(2, 32);
449   const LLT V3S32 = LLT::fixed_vector(3, 32);
450   const LLT V4S32 = LLT::fixed_vector(4, 32);
451   const LLT V5S32 = LLT::fixed_vector(5, 32);
452   const LLT V6S32 = LLT::fixed_vector(6, 32);
453   const LLT V7S32 = LLT::fixed_vector(7, 32);
454   const LLT V8S32 = LLT::fixed_vector(8, 32);
455   const LLT V9S32 = LLT::fixed_vector(9, 32);
456   const LLT V10S32 = LLT::fixed_vector(10, 32);
457   const LLT V11S32 = LLT::fixed_vector(11, 32);
458   const LLT V12S32 = LLT::fixed_vector(12, 32);
459   const LLT V13S32 = LLT::fixed_vector(13, 32);
460   const LLT V14S32 = LLT::fixed_vector(14, 32);
461   const LLT V15S32 = LLT::fixed_vector(15, 32);
462   const LLT V16S32 = LLT::fixed_vector(16, 32);
463   const LLT V32S32 = LLT::fixed_vector(32, 32);
464 
465   const LLT V2S64 = LLT::fixed_vector(2, 64);
466   const LLT V3S64 = LLT::fixed_vector(3, 64);
467   const LLT V4S64 = LLT::fixed_vector(4, 64);
468   const LLT V5S64 = LLT::fixed_vector(5, 64);
469   const LLT V6S64 = LLT::fixed_vector(6, 64);
470   const LLT V7S64 = LLT::fixed_vector(7, 64);
471   const LLT V8S64 = LLT::fixed_vector(8, 64);
472   const LLT V16S64 = LLT::fixed_vector(16, 64);
473 
474   std::initializer_list<LLT> AllS32Vectors =
475     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
476      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
477   std::initializer_list<LLT> AllS64Vectors =
478     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
479 
480   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
481   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
482   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
483   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
484   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
485   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
486   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
487 
488   const LLT CodePtr = FlatPtr;
489 
490   const std::initializer_list<LLT> AddrSpaces64 = {
491     GlobalPtr, ConstantPtr, FlatPtr
492   };
493 
494   const std::initializer_list<LLT> AddrSpaces32 = {
495     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
496   };
497 
498   const std::initializer_list<LLT> FPTypesBase = {
499     S32, S64
500   };
501 
502   const std::initializer_list<LLT> FPTypes16 = {
503     S32, S64, S16
504   };
505 
506   const std::initializer_list<LLT> FPTypesPK16 = {
507     S32, S64, S16, V2S16
508   };
509 
510   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
511 
512   // s1 for VCC branches, s32 for SCC branches.
513   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
514 
515   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
516   // elements for v3s16
517   getActionDefinitionsBuilder(G_PHI)
518     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
519     .legalFor(AllS32Vectors)
520     .legalFor(AllS64Vectors)
521     .legalFor(AddrSpaces64)
522     .legalFor(AddrSpaces32)
523     .legalIf(isPointer(0))
524     .clampScalar(0, S16, S256)
525     .widenScalarToNextPow2(0, 32)
526     .clampMaxNumElements(0, S32, 16)
527     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
528     .scalarize(0);
529 
530   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
531     // Full set of gfx9 features.
532     getActionDefinitionsBuilder({G_ADD, G_SUB})
533       .legalFor({S32, S16, V2S16})
534       .clampMaxNumElementsStrict(0, S16, 2)
535       .scalarize(0)
536       .minScalar(0, S16)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32);
539 
540     getActionDefinitionsBuilder(G_MUL)
541       .legalFor({S32, S16, V2S16})
542       .clampMaxNumElementsStrict(0, S16, 2)
543       .scalarize(0)
544       .minScalar(0, S16)
545       .widenScalarToNextMultipleOf(0, 32)
546       .custom();
547     assert(ST.hasMad64_32());
548 
549     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
550       .legalFor({S32, S16, V2S16}) // Clamp modifier
551       .minScalarOrElt(0, S16)
552       .clampMaxNumElementsStrict(0, S16, 2)
553       .scalarize(0)
554       .widenScalarToNextPow2(0, 32)
555       .lower();
556   } else if (ST.has16BitInsts()) {
557     getActionDefinitionsBuilder({G_ADD, G_SUB})
558       .legalFor({S32, S16})
559       .minScalar(0, S16)
560       .widenScalarToNextMultipleOf(0, 32)
561       .maxScalar(0, S32)
562       .scalarize(0);
563 
564     getActionDefinitionsBuilder(G_MUL)
565       .legalFor({S32, S16})
566       .scalarize(0)
567       .minScalar(0, S16)
568       .widenScalarToNextMultipleOf(0, 32)
569       .custom();
570     assert(ST.hasMad64_32());
571 
572     // Technically the saturating operations require clamp bit support, but this
573     // was introduced at the same time as 16-bit operations.
574     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
575       .legalFor({S32, S16}) // Clamp modifier
576       .minScalar(0, S16)
577       .scalarize(0)
578       .widenScalarToNextPow2(0, 16)
579       .lower();
580 
581     // We're just lowering this, but it helps get a better result to try to
582     // coerce to the desired type first.
583     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
584       .minScalar(0, S16)
585       .scalarize(0)
586       .lower();
587   } else {
588     getActionDefinitionsBuilder({G_ADD, G_SUB})
589       .legalFor({S32})
590       .widenScalarToNextMultipleOf(0, 32)
591       .clampScalar(0, S32, S32)
592       .scalarize(0);
593 
594     auto &Mul = getActionDefinitionsBuilder(G_MUL)
595       .legalFor({S32})
596       .scalarize(0)
597       .minScalar(0, S32)
598       .widenScalarToNextMultipleOf(0, 32);
599 
600     if (ST.hasMad64_32())
601       Mul.custom();
602     else
603       Mul.maxScalar(0, S32);
604 
605     if (ST.hasIntClamp()) {
606       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
607         .legalFor({S32}) // Clamp modifier.
608         .scalarize(0)
609         .minScalarOrElt(0, S32)
610         .lower();
611     } else {
612       // Clamp bit support was added in VI, along with 16-bit operations.
613       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
614         .minScalar(0, S32)
615         .scalarize(0)
616         .lower();
617     }
618 
619     // FIXME: DAG expansion gets better results. The widening uses the smaller
620     // range values and goes for the min/max lowering directly.
621     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
622       .minScalar(0, S32)
623       .scalarize(0)
624       .lower();
625   }
626 
627   getActionDefinitionsBuilder(
628       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
629       .customFor({S32, S64})
630       .clampScalar(0, S32, S64)
631       .widenScalarToNextPow2(0, 32)
632       .scalarize(0);
633 
634   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
635                    .legalFor({S32})
636                    .maxScalar(0, S32);
637 
638   if (ST.hasVOP3PInsts()) {
639     Mulh
640       .clampMaxNumElements(0, S8, 2)
641       .lowerFor({V2S8});
642   }
643 
644   Mulh
645     .scalarize(0)
646     .lower();
647 
648   // Report legal for any types we can handle anywhere. For the cases only legal
649   // on the SALU, RegBankSelect will be able to re-legalize.
650   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
651     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
652     .clampScalar(0, S32, S64)
653     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
654     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
655     .widenScalarToNextPow2(0)
656     .scalarize(0);
657 
658   getActionDefinitionsBuilder(
659       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
660       .legalFor({{S32, S1}, {S32, S32}})
661       .clampScalar(0, S32, S32)
662       .scalarize(0);
663 
664   getActionDefinitionsBuilder(G_BITCAST)
665     // Don't worry about the size constraint.
666     .legalIf(all(isRegisterType(0), isRegisterType(1)))
667     .lower();
668 
669 
670   getActionDefinitionsBuilder(G_CONSTANT)
671     .legalFor({S1, S32, S64, S16, GlobalPtr,
672                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
673     .legalIf(isPointer(0))
674     .clampScalar(0, S32, S64)
675     .widenScalarToNextPow2(0);
676 
677   getActionDefinitionsBuilder(G_FCONSTANT)
678     .legalFor({S32, S64, S16})
679     .clampScalar(0, S16, S64);
680 
681   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
682       .legalIf(isRegisterType(0))
683       // s1 and s16 are special cases because they have legal operations on
684       // them, but don't really occupy registers in the normal way.
685       .legalFor({S1, S16})
686       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
687       .clampScalarOrElt(0, S32, MaxScalar)
688       .widenScalarToNextPow2(0, 32)
689       .clampMaxNumElements(0, S32, 16);
690 
691   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
692 
693   // If the amount is divergent, we have to do a wave reduction to get the
694   // maximum value, so this is expanded during RegBankSelect.
695   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
696     .legalFor({{PrivatePtr, S32}});
697 
698   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
699     .customIf(typeIsNot(0, PrivatePtr));
700 
701   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
702 
703   auto &FPOpActions = getActionDefinitionsBuilder(
704     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
705       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
706     .legalFor({S32, S64});
707   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
708     .customFor({S32, S64});
709   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
710     .customFor({S32, S64});
711 
712   if (ST.has16BitInsts()) {
713     if (ST.hasVOP3PInsts())
714       FPOpActions.legalFor({S16, V2S16});
715     else
716       FPOpActions.legalFor({S16});
717 
718     TrigActions.customFor({S16});
719     FDIVActions.customFor({S16});
720   }
721 
722   auto &MinNumMaxNum = getActionDefinitionsBuilder({
723       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
724 
725   if (ST.hasVOP3PInsts()) {
726     MinNumMaxNum.customFor(FPTypesPK16)
727       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
728       .clampMaxNumElements(0, S16, 2)
729       .clampScalar(0, S16, S64)
730       .scalarize(0);
731   } else if (ST.has16BitInsts()) {
732     MinNumMaxNum.customFor(FPTypes16)
733       .clampScalar(0, S16, S64)
734       .scalarize(0);
735   } else {
736     MinNumMaxNum.customFor(FPTypesBase)
737       .clampScalar(0, S32, S64)
738       .scalarize(0);
739   }
740 
741   if (ST.hasVOP3PInsts())
742     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
743 
744   FPOpActions
745     .scalarize(0)
746     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
747 
748   TrigActions
749     .scalarize(0)
750     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
751 
752   FDIVActions
753     .scalarize(0)
754     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
755 
756   getActionDefinitionsBuilder({G_FNEG, G_FABS})
757     .legalFor(FPTypesPK16)
758     .clampMaxNumElementsStrict(0, S16, 2)
759     .scalarize(0)
760     .clampScalar(0, S16, S64);
761 
762   if (ST.has16BitInsts()) {
763     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
764       .legalFor({S32, S64, S16})
765       .scalarize(0)
766       .clampScalar(0, S16, S64);
767   } else {
768     getActionDefinitionsBuilder(G_FSQRT)
769       .legalFor({S32, S64})
770       .scalarize(0)
771       .clampScalar(0, S32, S64);
772 
773     if (ST.hasFractBug()) {
774       getActionDefinitionsBuilder(G_FFLOOR)
775         .customFor({S64})
776         .legalFor({S32, S64})
777         .scalarize(0)
778         .clampScalar(0, S32, S64);
779     } else {
780       getActionDefinitionsBuilder(G_FFLOOR)
781         .legalFor({S32, S64})
782         .scalarize(0)
783         .clampScalar(0, S32, S64);
784     }
785   }
786 
787   getActionDefinitionsBuilder(G_FPTRUNC)
788     .legalFor({{S32, S64}, {S16, S32}})
789     .scalarize(0)
790     .lower();
791 
792   getActionDefinitionsBuilder(G_FPEXT)
793     .legalFor({{S64, S32}, {S32, S16}})
794     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
795     .scalarize(0);
796 
797   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
798   if (ST.has16BitInsts()) {
799     FSubActions
800       // Use actual fsub instruction
801       .legalFor({S32, S16})
802       // Must use fadd + fneg
803       .lowerFor({S64, V2S16});
804   } else {
805     FSubActions
806       // Use actual fsub instruction
807       .legalFor({S32})
808       // Must use fadd + fneg
809       .lowerFor({S64, S16, V2S16});
810   }
811 
812   FSubActions
813     .scalarize(0)
814     .clampScalar(0, S32, S64);
815 
816   // Whether this is legal depends on the floating point mode for the function.
817   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
818   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
819     FMad.customFor({S32, S16});
820   else if (ST.hasMadMacF32Insts())
821     FMad.customFor({S32});
822   else if (ST.hasMadF16())
823     FMad.customFor({S16});
824   FMad.scalarize(0)
825       .lower();
826 
827   auto &FRem = getActionDefinitionsBuilder(G_FREM);
828   if (ST.has16BitInsts()) {
829     FRem.customFor({S16, S32, S64});
830   } else {
831     FRem.minScalar(0, S32)
832         .customFor({S32, S64});
833   }
834   FRem.scalarize(0);
835 
836   // TODO: Do we need to clamp maximum bitwidth?
837   getActionDefinitionsBuilder(G_TRUNC)
838     .legalIf(isScalar(0))
839     .legalFor({{V2S16, V2S32}})
840     .clampMaxNumElements(0, S16, 2)
841     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
842     // situations (like an invalid implicit use), we don't want to infinite loop
843     // in the legalizer.
844     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
845     .alwaysLegal();
846 
847   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
848     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
849                {S32, S1}, {S64, S1}, {S16, S1}})
850     .scalarize(0)
851     .clampScalar(0, S32, S64)
852     .widenScalarToNextPow2(1, 32);
853 
854   // TODO: Split s1->s64 during regbankselect for VALU.
855   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
856                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
857                     .lowerIf(typeIs(1, S1))
858                     .customFor({{S32, S64}, {S64, S64}});
859   if (ST.has16BitInsts())
860     IToFP.legalFor({{S16, S16}});
861   IToFP.clampScalar(1, S32, S64)
862        .minScalar(0, S32)
863        .scalarize(0)
864        .widenScalarToNextPow2(1);
865 
866   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
867     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
868     .customFor({{S64, S32}, {S64, S64}})
869     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
870   if (ST.has16BitInsts())
871     FPToI.legalFor({{S16, S16}});
872   else
873     FPToI.minScalar(1, S32);
874 
875   FPToI.minScalar(0, S32)
876        .widenScalarToNextPow2(0, 32)
877        .scalarize(0)
878        .lower();
879 
880   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
881       .customFor({S16, S32})
882       .scalarize(0)
883       .lower();
884 
885   // Lower roundeven into G_FRINT
886   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
887     .scalarize(0)
888     .lower();
889 
890   if (ST.has16BitInsts()) {
891     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
892       .legalFor({S16, S32, S64})
893       .clampScalar(0, S16, S64)
894       .scalarize(0);
895   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
896     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
897       .legalFor({S32, S64})
898       .clampScalar(0, S32, S64)
899       .scalarize(0);
900   } else {
901     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
902       .legalFor({S32})
903       .customFor({S64})
904       .clampScalar(0, S32, S64)
905       .scalarize(0);
906   }
907 
908   getActionDefinitionsBuilder(G_PTR_ADD)
909     .legalIf(all(isPointer(0), sameSize(0, 1)))
910     .scalarize(0)
911     .scalarSameSizeAs(1, 0);
912 
913   getActionDefinitionsBuilder(G_PTRMASK)
914     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
915     .scalarSameSizeAs(1, 0)
916     .scalarize(0);
917 
918   auto &CmpBuilder =
919     getActionDefinitionsBuilder(G_ICMP)
920     // The compare output type differs based on the register bank of the output,
921     // so make both s1 and s32 legal.
922     //
923     // Scalar compares producing output in scc will be promoted to s32, as that
924     // is the allocatable register type that will be needed for the copy from
925     // scc. This will be promoted during RegBankSelect, and we assume something
926     // before that won't try to use s32 result types.
927     //
928     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
929     // bank.
930     .legalForCartesianProduct(
931       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
932     .legalForCartesianProduct(
933       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
934   if (ST.has16BitInsts()) {
935     CmpBuilder.legalFor({{S1, S16}});
936   }
937 
938   CmpBuilder
939     .widenScalarToNextPow2(1)
940     .clampScalar(1, S32, S64)
941     .scalarize(0)
942     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
943 
944   getActionDefinitionsBuilder(G_FCMP)
945     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
946     .widenScalarToNextPow2(1)
947     .clampScalar(1, S32, S64)
948     .scalarize(0);
949 
950   // FIXME: fpow has a selection pattern that should move to custom lowering.
951   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
952   if (ST.has16BitInsts())
953     Exp2Ops.legalFor({S32, S16});
954   else
955     Exp2Ops.legalFor({S32});
956   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
957   Exp2Ops.scalarize(0);
958 
959   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
960   if (ST.has16BitInsts())
961     ExpOps.customFor({{S32}, {S16}});
962   else
963     ExpOps.customFor({S32});
964   ExpOps.clampScalar(0, MinScalarFPTy, S32)
965         .scalarize(0);
966 
967   getActionDefinitionsBuilder(G_FPOWI)
968     .clampScalar(0, MinScalarFPTy, S32)
969     .lower();
970 
971   // The 64-bit versions produce 32-bit results, but only on the SALU.
972   getActionDefinitionsBuilder(G_CTPOP)
973     .legalFor({{S32, S32}, {S32, S64}})
974     .clampScalar(0, S32, S32)
975     .widenScalarToNextPow2(1, 32)
976     .clampScalar(1, S32, S64)
977     .scalarize(0)
978     .widenScalarToNextPow2(0, 32);
979 
  // If no 16-bit instruction is available, lower into different instructions.
981   if (ST.has16BitInsts())
982     getActionDefinitionsBuilder(G_IS_FPCLASS)
983         .legalForCartesianProduct({S1}, FPTypes16)
984         .widenScalarToNextPow2(1)
985         .scalarize(0)
986         .lower();
987   else
988     getActionDefinitionsBuilder(G_IS_FPCLASS)
989         .legalForCartesianProduct({S1}, FPTypesBase)
990         .lowerFor({S1, S16})
991         .widenScalarToNextPow2(1)
992         .scalarize(0)
993         .lower();
994 
995   // The hardware instructions return a different result on 0 than the generic
996   // instructions expect. The hardware produces -1, but these produce the
997   // bitwidth.
998   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
999     .scalarize(0)
1000     .clampScalar(0, S32, S32)
1001     .clampScalar(1, S32, S64)
1002     .widenScalarToNextPow2(0, 32)
1003     .widenScalarToNextPow2(1, 32)
1004     .custom();
1005 
1006   // The 64-bit versions produce 32-bit results, but only on the SALU.
1007   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1008     .legalFor({{S32, S32}, {S32, S64}})
1009     .clampScalar(0, S32, S32)
1010     .clampScalar(1, S32, S64)
1011     .scalarize(0)
1012     .widenScalarToNextPow2(0, 32)
1013     .widenScalarToNextPow2(1, 32);
1014 
1015   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1016   // RegBankSelect.
1017   getActionDefinitionsBuilder(G_BITREVERSE)
1018     .legalFor({S32, S64})
1019     .clampScalar(0, S32, S64)
1020     .scalarize(0)
1021     .widenScalarToNextPow2(0);
1022 
1023   if (ST.has16BitInsts()) {
1024     getActionDefinitionsBuilder(G_BSWAP)
1025       .legalFor({S16, S32, V2S16})
1026       .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1029       .widenScalarToNextPow2(0)
1030       .clampScalar(0, S16, S32)
1031       .scalarize(0);
1032 
1033     if (ST.hasVOP3PInsts()) {
1034       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1035         .legalFor({S32, S16, V2S16})
1036         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1037         .clampMaxNumElements(0, S16, 2)
1038         .minScalar(0, S16)
1039         .widenScalarToNextPow2(0)
1040         .scalarize(0)
1041         .lower();
1042     } else {
1043       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1044         .legalFor({S32, S16})
1045         .widenScalarToNextPow2(0)
1046         .minScalar(0, S16)
1047         .scalarize(0)
1048         .lower();
1049     }
1050   } else {
1051     // TODO: Should have same legality without v_perm_b32
1052     getActionDefinitionsBuilder(G_BSWAP)
1053       .legalFor({S32})
1054       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1057       .widenScalarToNextPow2(0)
1058       .maxScalar(0, S32)
1059       .scalarize(0)
1060       .lower();
1061 
1062     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1063       .legalFor({S32})
1064       .minScalar(0, S32)
1065       .widenScalarToNextPow2(0)
1066       .scalarize(0)
1067       .lower();
1068   }
1069 
1070   getActionDefinitionsBuilder(G_INTTOPTR)
1071       // List the common cases
1072       .legalForCartesianProduct(AddrSpaces64, {S64})
1073       .legalForCartesianProduct(AddrSpaces32, {S32})
1074       .scalarize(0)
1075       // Accept any address space as long as the size matches
1076       .legalIf(sameSize(0, 1))
1077       .widenScalarIf(smallerThan(1, 0),
1078                      [](const LegalityQuery &Query) {
1079                        return std::pair(
1080                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
1081                      })
1082       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1083         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1084       });
1085 
1086   getActionDefinitionsBuilder(G_PTRTOINT)
1087       // List the common cases
1088       .legalForCartesianProduct(AddrSpaces64, {S64})
1089       .legalForCartesianProduct(AddrSpaces32, {S32})
1090       .scalarize(0)
1091       // Accept any address space as long as the size matches
1092       .legalIf(sameSize(0, 1))
1093       .widenScalarIf(smallerThan(0, 1),
1094                      [](const LegalityQuery &Query) {
1095                        return std::pair(
1096                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
1097                      })
1098       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1099         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1100       });
1101 
1102   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1103     .scalarize(0)
1104     .custom();
1105 
1106   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1107                                     bool IsLoad) -> bool {
1108     const LLT DstTy = Query.Types[0];
1109 
1110     // Split vector extloads.
1111     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1112 
1113     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1114       return true;
1115 
1116     const LLT PtrTy = Query.Types[1];
1117     unsigned AS = PtrTy.getAddressSpace();
1118     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1119       return true;
1120 
1121     // Catch weird sized loads that don't evenly divide into the access sizes
1122     // TODO: May be able to widen depending on alignment etc.
1123     unsigned NumRegs = (MemSize + 31) / 32;
1124     if (NumRegs == 3) {
1125       if (!ST.hasDwordx3LoadStores())
1126         return true;
1127     } else {
1128       // If the alignment allows, these should have been widened.
1129       if (!isPowerOf2_32(NumRegs))
1130         return true;
1131     }
1132 
1133     return false;
1134   };
1135 
1136   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1137   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1138   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
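  // A required alignment of 0 (used when unaligned buffer access is enabled)
  // effectively places no extra alignment restriction on the rules below.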
1139 
1140   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1141   // LDS
1142   // TODO: Unsupported flat for SI.
1143 
1144   for (unsigned Op : {G_LOAD, G_STORE}) {
1145     const bool IsStore = Op == G_STORE;
1146 
1147     auto &Actions = getActionDefinitionsBuilder(Op);
1148     // Explicitly list some common cases.
1149     // TODO: Does this help compile time at all?
1150     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1151                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1152                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1153                                       {S64, GlobalPtr, S64, GlobalAlign32},
1154                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1155                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1156                                       {S32, GlobalPtr, S8, GlobalAlign8},
1157                                       {S32, GlobalPtr, S16, GlobalAlign16},
1158 
1159                                       {S32, LocalPtr, S32, 32},
1160                                       {S64, LocalPtr, S64, 32},
1161                                       {V2S32, LocalPtr, V2S32, 32},
1162                                       {S32, LocalPtr, S8, 8},
1163                                       {S32, LocalPtr, S16, 16},
1164                                       {V2S16, LocalPtr, S32, 32},
1165 
1166                                       {S32, PrivatePtr, S32, 32},
1167                                       {S32, PrivatePtr, S8, 8},
1168                                       {S32, PrivatePtr, S16, 16},
1169                                       {V2S16, PrivatePtr, S32, 32},
1170 
1171                                       {S32, ConstantPtr, S32, GlobalAlign32},
1172                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1173                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1174                                       {S64, ConstantPtr, S64, GlobalAlign32},
1175                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1176     Actions.legalIf(
1177       [=](const LegalityQuery &Query) -> bool {
1178         return isLoadStoreLegal(ST, Query);
1179       });
1180 
1181     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1182     // 64-bits.
1183     //
1184     // TODO: Should generalize bitcast action into coerce, which will also cover
1185     // inserting addrspacecasts.
1186     Actions.customIf(typeIs(1, Constant32Ptr));
1187 
1188     // Turn any illegal element vectors into something easier to deal
1189     // with. These will ultimately produce 32-bit scalar shifts to extract the
1190     // parts anyway.
1191     //
1192     // For odd 16-bit element vectors, prefer to split those into pieces with
1193     // 16-bit vector parts.
1194     Actions.bitcastIf(
1195       [=](const LegalityQuery &Query) -> bool {
1196         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1197                                           Query.MMODescrs[0].MemoryTy);
1198       }, bitcastToRegisterType(0));
1199 
1200     if (!IsStore) {
1201       // Widen suitably aligned loads by loading extra bytes. The standard
1202       // legalization actions can't properly express widening memory operands.
1203       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1204         return shouldWidenLoad(ST, Query, G_LOAD);
1205       });
1206     }
1207 
1208     // FIXME: load/store narrowing should be moved to lower action
1209     Actions
1210         .narrowScalarIf(
1211             [=](const LegalityQuery &Query) -> bool {
1212               return !Query.Types[0].isVector() &&
1213                      needToSplitMemOp(Query, Op == G_LOAD);
1214             },
1215             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1216               const LLT DstTy = Query.Types[0];
1217               const LLT PtrTy = Query.Types[1];
1218 
1219               const unsigned DstSize = DstTy.getSizeInBits();
1220               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1221 
1222               // Split extloads.
1223               if (DstSize > MemSize)
1224                 return std::pair(0, LLT::scalar(MemSize));
1225 
1226               unsigned MaxSize = maxSizeForAddrSpace(ST,
1227                                                      PtrTy.getAddressSpace(),
1228                                                      Op == G_LOAD);
1229               if (MemSize > MaxSize)
1230                 return std::pair(0, LLT::scalar(MaxSize));
1231 
1232               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1233               return std::pair(0, LLT::scalar(Align));
1234             })
1235         .fewerElementsIf(
1236             [=](const LegalityQuery &Query) -> bool {
1237               return Query.Types[0].isVector() &&
1238                      needToSplitMemOp(Query, Op == G_LOAD);
1239             },
1240             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1241               const LLT DstTy = Query.Types[0];
1242               const LLT PtrTy = Query.Types[1];
1243 
1244               LLT EltTy = DstTy.getElementType();
1245               unsigned MaxSize = maxSizeForAddrSpace(ST,
1246                                                      PtrTy.getAddressSpace(),
1247                                                      Op == G_LOAD);
1248 
1249               // FIXME: Handle widened to power of 2 results better. This ends
1250               // up scalarizing.
1251               // FIXME: 3 element stores scalarized on SI
1252 
1253               // Split if it's too large for the address space.
1254               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1255               if (MemSize > MaxSize) {
1256                 unsigned NumElts = DstTy.getNumElements();
1257                 unsigned EltSize = EltTy.getSizeInBits();
1258 
1259                 if (MaxSize % EltSize == 0) {
1260                   return std::pair(
1261                       0, LLT::scalarOrVector(
1262                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1263                 }
1264 
1265                 unsigned NumPieces = MemSize / MaxSize;
1266 
1267                 // FIXME: Refine when odd breakdowns handled
1268                 // The scalars will need to be re-legalized.
1269                 if (NumPieces == 1 || NumPieces >= NumElts ||
1270                     NumElts % NumPieces != 0)
1271                   return std::pair(0, EltTy);
1272 
1273                 return std::pair(0,
1274                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
1275               }
1276 
1277               // FIXME: We could probably handle weird extending loads better.
1278               if (DstTy.getSizeInBits() > MemSize)
1279                 return std::pair(0, EltTy);
1280 
1281               unsigned EltSize = EltTy.getSizeInBits();
1282               unsigned DstSize = DstTy.getSizeInBits();
1283               if (!isPowerOf2_32(DstSize)) {
1284                 // We're probably decomposing an odd sized store. Try to split
1285                 // to the widest type. TODO: Account for alignment. As-is it
1286                 // should be OK, since the new parts will be further legalized.
1287                 unsigned FloorSize = PowerOf2Floor(DstSize);
1288                 return std::pair(
1289                     0, LLT::scalarOrVector(
1290                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1291               }
1292 
1293               // May need relegalization for the scalars.
1294               return std::pair(0, EltTy);
1295             })
1296     .minScalar(0, S32)
1297     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1298     .widenScalarToNextPow2(0)
1299     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1300     .lower();
1301   }
1302 
1303   // FIXME: Unaligned accesses not lowered.
1304   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1305                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1306                                                   {S32, GlobalPtr, S16, 2 * 8},
1307                                                   {S32, LocalPtr, S8, 8},
1308                                                   {S32, LocalPtr, S16, 16},
1309                                                   {S32, PrivatePtr, S8, 8},
1310                                                   {S32, PrivatePtr, S16, 16},
1311                                                   {S32, ConstantPtr, S8, 8},
1312                                                   {S32, ConstantPtr, S16, 2 * 8}})
1313                        .legalIf(
1314                          [=](const LegalityQuery &Query) -> bool {
1315                            return isLoadStoreLegal(ST, Query);
1316                          });
1317 
1318   if (ST.hasFlatAddressSpace()) {
1319     ExtLoads.legalForTypesWithMemDesc(
1320         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1321   }
1322 
1323   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1324   // 64-bits.
1325   //
1326   // TODO: Should generalize bitcast action into coerce, which will also cover
1327   // inserting addrspacecasts.
1328   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1329 
1330   ExtLoads.clampScalar(0, S32, S32)
1331           .widenScalarToNextPow2(0)
1332           .lower();
1333 
1334   auto &Atomics = getActionDefinitionsBuilder(
1335     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1336      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1337      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1338      G_ATOMICRMW_UMIN})
1339     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1340                {S64, GlobalPtr}, {S64, LocalPtr},
1341                {S32, RegionPtr}, {S64, RegionPtr}});
1342   if (ST.hasFlatAddressSpace()) {
1343     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1344   }
1345 
1346   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1347   if (ST.hasLDSFPAtomicAdd()) {
1348     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1349     if (ST.hasGFX90AInsts())
1350       Atomic.legalFor({{S64, LocalPtr}});
1351     if (ST.hasGFX940Insts())
1352       Atomic.legalFor({{V2S16, LocalPtr}});
1353   }
1354   if (ST.hasAtomicFaddInsts())
1355     Atomic.legalFor({{S32, GlobalPtr}});
1356   if (ST.hasFlatAtomicFaddF32Inst())
1357     Atomic.legalFor({{S32, FlatPtr}});
1358 
1359   if (ST.hasGFX90AInsts()) {
1360     // These are legal with some caveats, and should have undergone expansion in
1361     // the IR in most situations
1362     // TODO: Move atomic expansion into legalizer
1363     Atomic.legalFor({
1364         {S32, GlobalPtr},
1365         {S64, GlobalPtr},
1366         {S64, FlatPtr}
1367       });
1368   }
1369 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1372   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1373     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1374                 {S32, FlatPtr}, {S64, FlatPtr}})
1375     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1376                {S32, RegionPtr}, {S64, RegionPtr}});
1377   // TODO: Pointer types, any 32-bit or 64-bit vector
1378 
1379   // Condition should be s32 for scalar, s1 for vector.
1380   getActionDefinitionsBuilder(G_SELECT)
1381       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1382                                  LocalPtr, FlatPtr, PrivatePtr,
1383                                  LLT::fixed_vector(2, LocalPtr),
1384                                  LLT::fixed_vector(2, PrivatePtr)},
1385                                 {S1, S32})
1386       .clampScalar(0, S16, S64)
1387       .scalarize(1)
1388       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1389       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1390       .clampMaxNumElements(0, S32, 2)
1391       .clampMaxNumElements(0, LocalPtr, 2)
1392       .clampMaxNumElements(0, PrivatePtr, 2)
1393       .scalarize(0)
1394       .widenScalarToNextPow2(0)
1395       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1396 
1397   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1398   // be more flexible with the shift amount type.
1399   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1400     .legalFor({{S32, S32}, {S64, S32}});
1401   if (ST.has16BitInsts()) {
1402     if (ST.hasVOP3PInsts()) {
1403       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1404             .clampMaxNumElements(0, S16, 2);
1405     } else
1406       Shifts.legalFor({{S16, S16}});
1407 
1408     // TODO: Support 16-bit shift amounts for all types
1409     Shifts.widenScalarIf(
1410       [=](const LegalityQuery &Query) {
1411         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1412         // 32-bit amount.
1413         const LLT ValTy = Query.Types[0];
1414         const LLT AmountTy = Query.Types[1];
1415         return ValTy.getSizeInBits() <= 16 &&
1416                AmountTy.getSizeInBits() < 16;
1417       }, changeTo(1, S16));
1418     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1419     Shifts.clampScalar(1, S32, S32);
1420     Shifts.widenScalarToNextPow2(0, 16);
1421     Shifts.clampScalar(0, S16, S64);
1422 
1423     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1424       .minScalar(0, S16)
1425       .scalarize(0)
1426       .lower();
1427   } else {
1428     // Make sure we legalize the shift amount type first, as the general
1429     // expansion for the shifted type will produce much worse code if it hasn't
1430     // been truncated already.
1431     Shifts.clampScalar(1, S32, S32);
1432     Shifts.widenScalarToNextPow2(0, 32);
1433     Shifts.clampScalar(0, S32, S64);
1434 
1435     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1436       .minScalar(0, S32)
1437       .scalarize(0)
1438       .lower();
1439   }
1440   Shifts.scalarize(0);
1441 
1442   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1443     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1444     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1445     unsigned IdxTypeIdx = 2;
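    // For G_EXTRACT_VECTOR_ELT the result is the element (type 0) and the
    // source is the vector (type 1); for G_INSERT_VECTOR_ELT the roles are
    // swapped. The index operand is always type index 2.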
1446 
1447     getActionDefinitionsBuilder(Op)
1448       .customIf([=](const LegalityQuery &Query) {
1449           const LLT EltTy = Query.Types[EltTypeIdx];
1450           const LLT VecTy = Query.Types[VecTypeIdx];
1451           const LLT IdxTy = Query.Types[IdxTypeIdx];
1452           const unsigned EltSize = EltTy.getSizeInBits();
1453           return (EltSize == 32 || EltSize == 64) &&
1454                   VecTy.getSizeInBits() % 32 == 0 &&
1455                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1456                   IdxTy.getSizeInBits() == 32;
1457         })
1458       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1459                  bitcastToVectorElement32(VecTypeIdx))
1460       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1461       .bitcastIf(
1462         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1463         [=](const LegalityQuery &Query) {
1464           // For > 64-bit element types, try to turn this into a 64-bit
1465           // element vector since we may be able to do better indexing
1466           // if this is scalar. If not, fall back to 32.
1467           const LLT EltTy = Query.Types[EltTypeIdx];
1468           const LLT VecTy = Query.Types[VecTypeIdx];
1469           const unsigned DstEltSize = EltTy.getSizeInBits();
1470           const unsigned VecSize = VecTy.getSizeInBits();
1471 
1472           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1473           return std::pair(
1474               VecTypeIdx,
1475               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1476         })
1477       .clampScalar(EltTypeIdx, S32, S64)
1478       .clampScalar(VecTypeIdx, S32, S64)
1479       .clampScalar(IdxTypeIdx, S32, S32)
1480       .clampMaxNumElements(VecTypeIdx, S32, 32)
1481       // TODO: Clamp elements for 64-bit vectors?
1482       // It should only be necessary with variable indexes.
1483       // As a last resort, lower to the stack
1484       .lower();
1485   }
1486 
1487   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1488     .unsupportedIf([=](const LegalityQuery &Query) {
1489         const LLT &EltTy = Query.Types[1].getElementType();
1490         return Query.Types[0] != EltTy;
1491       });
1492 
1493   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1494     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1495     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1496 
1497     // FIXME: Doesn't handle extract of illegal sizes.
1498     getActionDefinitionsBuilder(Op)
1499       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1500       .lowerIf([=](const LegalityQuery &Query) {
1501           // Sub-vector (or single-element) insert and extract.
1502           // TODO: verify immediate offset here since lower only works with
1503           // whole elements.
1504           const LLT BigTy = Query.Types[BigTyIdx];
1505           return BigTy.isVector();
1506         })
1507       // FIXME: Multiples of 16 should not be legal.
1508       .legalIf([=](const LegalityQuery &Query) {
1509           const LLT BigTy = Query.Types[BigTyIdx];
1510           const LLT LitTy = Query.Types[LitTyIdx];
1511           return (BigTy.getSizeInBits() % 32 == 0) &&
1512                  (LitTy.getSizeInBits() % 16 == 0);
1513         })
1514       .widenScalarIf(
1515         [=](const LegalityQuery &Query) {
1516           const LLT BigTy = Query.Types[BigTyIdx];
1517           return (BigTy.getScalarSizeInBits() < 16);
1518         },
1519         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1520       .widenScalarIf(
1521         [=](const LegalityQuery &Query) {
1522           const LLT LitTy = Query.Types[LitTyIdx];
1523           return (LitTy.getScalarSizeInBits() < 16);
1524         },
1525         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1526       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1527       .widenScalarToNextPow2(BigTyIdx, 32);
1528 
1529   }
1530 
1531   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1532     .legalForCartesianProduct(AllS32Vectors, {S32})
1533     .legalForCartesianProduct(AllS64Vectors, {S64})
1534     .clampNumElements(0, V16S32, V32S32)
1535     .clampNumElements(0, V2S64, V16S64)
1536     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1537 
1538   if (ST.hasScalarPackInsts()) {
1539     BuildVector
1540       // FIXME: Should probably widen s1 vectors straight to s32
1541       .minScalarOrElt(0, S16)
1542       .minScalar(1, S16);
1543 
1544     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1545       .legalFor({V2S16, S32})
1546       .lower();
1547   } else {
1548     BuildVector.customFor({V2S16, S16});
1549     BuildVector.minScalarOrElt(0, S32);
1550 
1551     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1552       .customFor({V2S16, S32})
1553       .lower();
1554   }
1555 
1556   BuildVector.legalIf(isRegisterType(0));
1557 
1558   // FIXME: Clamp maximum size
1559   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1560     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1561     .clampMaxNumElements(0, S32, 32)
1562     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1563     .clampMaxNumElements(0, S16, 64);
1564 
1565   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1566 
1567   // Merge/Unmerge
1568   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1569     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1570     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1571 
1572     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1573       const LLT Ty = Query.Types[TypeIdx];
1574       if (Ty.isVector()) {
1575         const LLT &EltTy = Ty.getElementType();
1576         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1577           return true;
1578         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1579           return true;
1580       }
1581       return false;
1582     };
1583 
1584     auto &Builder = getActionDefinitionsBuilder(Op)
1585       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1586       .lowerFor({{S16, V2S16}})
1587       .lowerIf([=](const LegalityQuery &Query) {
1588           const LLT BigTy = Query.Types[BigTyIdx];
1589           return BigTy.getSizeInBits() == 32;
1590         })
1591       // Try to widen to s16 first for small types.
1592       // TODO: Only do this on targets with legal s16 shifts
1593       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1594       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1595       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1596       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1597                            elementTypeIs(1, S16)),
1598                        changeTo(1, V2S16))
1599       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1600       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1601       // valid.
1602       .clampScalar(LitTyIdx, S32, S512)
1603       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1604       // Break up vectors with weird elements into scalars
1605       .fewerElementsIf(
1606         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1607         scalarize(0))
1608       .fewerElementsIf(
1609         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1610         scalarize(1))
1611       .clampScalar(BigTyIdx, S32, MaxScalar);
1612 
1613     if (Op == G_MERGE_VALUES) {
1614       Builder.widenScalarIf(
1615         // TODO: Use 16-bit shifts if legal for 8-bit values?
1616         [=](const LegalityQuery &Query) {
1617           const LLT Ty = Query.Types[LitTyIdx];
1618           return Ty.getSizeInBits() < 32;
1619         },
1620         changeTo(LitTyIdx, S32));
1621     }
1622 
1623     Builder.widenScalarIf(
1624       [=](const LegalityQuery &Query) {
1625         const LLT Ty = Query.Types[BigTyIdx];
1626         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1627           Ty.getSizeInBits() % 16 != 0;
1628       },
1629       [=](const LegalityQuery &Query) {
1630         // Pick the next power of 2, or a multiple of 64 over 128, whichever
1631         // is smaller.
1632         const LLT &Ty = Query.Types[BigTyIdx];
1633         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1634         if (NewSizeInBits >= 256) {
1635           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1636           if (RoundedTo < NewSizeInBits)
1637             NewSizeInBits = RoundedTo;
1638         }
1639         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1640       })
1641       // Any vectors left are the wrong size. Scalarize them.
1642       .scalarize(0)
1643       .scalarize(1);
1644   }
1645 
1646   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1647   // RegBankSelect.
1648   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1649     .legalFor({{S32}, {S64}});
1650 
1651   if (ST.hasVOP3PInsts()) {
1652     SextInReg.lowerFor({{V2S16}})
1653       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1654       // get more vector shift opportunities, since we'll get those when
1655       // expanded.
1656       .clampMaxNumElementsStrict(0, S16, 2);
1657   } else if (ST.has16BitInsts()) {
1658     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1659   } else {
1660     // Prefer to promote to s32 before lowering if we don't have 16-bit
1661     // shifts. This avoids a lot of intermediate truncate and extend operations.
1662     SextInReg.lowerFor({{S32}, {S64}});
1663   }
1664 
1665   SextInReg
1666     .scalarize(0)
1667     .clampScalar(0, S32, S64)
1668     .lower();
1669 
1670   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1671     .scalarize(0)
1672     .lower();
1673 
1674   // TODO: Only try to form v2s16 with legal packed instructions.
1675   getActionDefinitionsBuilder(G_FSHR)
1676     .legalFor({{S32, S32}})
1677     .lowerFor({{V2S16, V2S16}})
1678     .clampMaxNumElementsStrict(0, S16, 2)
1679     .scalarize(0)
1680     .lower();
1681 
1682   if (ST.hasVOP3PInsts()) {
1683     getActionDefinitionsBuilder(G_FSHL)
1684       .lowerFor({{V2S16, V2S16}})
1685       .clampMaxNumElementsStrict(0, S16, 2)
1686       .scalarize(0)
1687       .lower();
1688   } else {
1689     getActionDefinitionsBuilder(G_FSHL)
1690       .scalarize(0)
1691       .lower();
1692   }
1693 
1694   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1695     .legalFor({S64});
1696 
1697   getActionDefinitionsBuilder(G_FENCE)
1698     .alwaysLegal();
1699 
1700   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1701       .scalarize(0)
1702       .minScalar(0, S32)
1703       .lower();
1704 
1705   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1706       .legalFor({{S32, S32}, {S64, S32}})
1707       .clampScalar(1, S32, S32)
1708       .clampScalar(0, S32, S64)
1709       .widenScalarToNextPow2(0)
1710       .scalarize(0);
1711 
1712   getActionDefinitionsBuilder({
1713       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1714       G_FCOPYSIGN,
1715 
1716       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1717       G_ATOMICRMW_NAND,
1718       G_ATOMICRMW_FSUB,
1719       G_READ_REGISTER,
1720       G_WRITE_REGISTER,
1721 
1722       G_SADDO, G_SSUBO,
1723 
1724       // TODO: Implement
1725       G_FMINIMUM, G_FMAXIMUM}).lower();
1726 
1727   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1728       .lower();
1729 
1730   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1731         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1732         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1733     .unsupported();
1734 
1735   getLegacyLegalizerInfo().computeTables();
1736   verify(*ST.getInstrInfo());
1737 }
1738 
1739 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1740                                          MachineInstr &MI) const {
1741   MachineIRBuilder &B = Helper.MIRBuilder;
1742   MachineRegisterInfo &MRI = *B.getMRI();
1743 
1744   switch (MI.getOpcode()) {
1745   case TargetOpcode::G_ADDRSPACE_CAST:
1746     return legalizeAddrSpaceCast(MI, MRI, B);
1747   case TargetOpcode::G_FRINT:
1748     return legalizeFrint(MI, MRI, B);
1749   case TargetOpcode::G_FCEIL:
1750     return legalizeFceil(MI, MRI, B);
1751   case TargetOpcode::G_FREM:
1752     return legalizeFrem(MI, MRI, B);
1753   case TargetOpcode::G_INTRINSIC_TRUNC:
1754     return legalizeIntrinsicTrunc(MI, MRI, B);
1755   case TargetOpcode::G_SITOFP:
1756     return legalizeITOFP(MI, MRI, B, true);
1757   case TargetOpcode::G_UITOFP:
1758     return legalizeITOFP(MI, MRI, B, false);
1759   case TargetOpcode::G_FPTOSI:
1760     return legalizeFPTOI(MI, MRI, B, true);
1761   case TargetOpcode::G_FPTOUI:
1762     return legalizeFPTOI(MI, MRI, B, false);
1763   case TargetOpcode::G_FMINNUM:
1764   case TargetOpcode::G_FMAXNUM:
1765   case TargetOpcode::G_FMINNUM_IEEE:
1766   case TargetOpcode::G_FMAXNUM_IEEE:
1767     return legalizeMinNumMaxNum(Helper, MI);
1768   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1769     return legalizeExtractVectorElt(MI, MRI, B);
1770   case TargetOpcode::G_INSERT_VECTOR_ELT:
1771     return legalizeInsertVectorElt(MI, MRI, B);
1772   case TargetOpcode::G_FSIN:
1773   case TargetOpcode::G_FCOS:
1774     return legalizeSinCos(MI, MRI, B);
1775   case TargetOpcode::G_GLOBAL_VALUE:
1776     return legalizeGlobalValue(MI, MRI, B);
1777   case TargetOpcode::G_LOAD:
1778   case TargetOpcode::G_SEXTLOAD:
1779   case TargetOpcode::G_ZEXTLOAD:
1780     return legalizeLoad(Helper, MI);
1781   case TargetOpcode::G_FMAD:
1782     return legalizeFMad(MI, MRI, B);
1783   case TargetOpcode::G_FDIV:
1784     return legalizeFDIV(MI, MRI, B);
1785   case TargetOpcode::G_UDIV:
1786   case TargetOpcode::G_UREM:
1787   case TargetOpcode::G_UDIVREM:
1788     return legalizeUnsignedDIV_REM(MI, MRI, B);
1789   case TargetOpcode::G_SDIV:
1790   case TargetOpcode::G_SREM:
1791   case TargetOpcode::G_SDIVREM:
1792     return legalizeSignedDIV_REM(MI, MRI, B);
1793   case TargetOpcode::G_ATOMIC_CMPXCHG:
1794     return legalizeAtomicCmpXChg(MI, MRI, B);
1795   case TargetOpcode::G_FLOG:
1796     return legalizeFlog(MI, B, numbers::ln2f);
1797   case TargetOpcode::G_FLOG10:
1798     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1799   case TargetOpcode::G_FEXP:
1800     return legalizeFExp(MI, B);
1801   case TargetOpcode::G_FPOW:
1802     return legalizeFPow(MI, B);
1803   case TargetOpcode::G_FFLOOR:
1804     return legalizeFFloor(MI, MRI, B);
1805   case TargetOpcode::G_BUILD_VECTOR:
1806   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
1807     return legalizeBuildVector(MI, MRI, B);
1808   case TargetOpcode::G_MUL:
1809     return legalizeMul(Helper, MI);
1810   case TargetOpcode::G_CTLZ:
1811   case TargetOpcode::G_CTTZ:
1812     return legalizeCTLZ_CTTZ(MI, MRI, B);
1813   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1814     return legalizeFPTruncRound(MI, B);
1815   default:
1816     return false;
1817   }
1818 
1819   llvm_unreachable("expected switch to return");
1820 }
1821 
1822 Register AMDGPULegalizerInfo::getSegmentAperture(
1823   unsigned AS,
1824   MachineRegisterInfo &MRI,
1825   MachineIRBuilder &B) const {
1826   MachineFunction &MF = B.getMF();
1827   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1828   const LLT S32 = LLT::scalar(32);
1829   const LLT S64 = LLT::scalar(64);
1830 
1831   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1832 
1833   if (ST.hasApertureRegs()) {
1834     // Note: this register is somewhat broken. When used as a 32-bit operand,
1835     // it only returns zeroes. The real value is in the upper 32 bits.
1836     // Thus, we must extract the high 32 bits.
1837     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
1838                                        ? AMDGPU::SRC_SHARED_BASE
1839                                        : AMDGPU::SRC_PRIVATE_BASE;
1840     // FIXME: It would be more natural to emit a COPY here, but then copy
1841     // coalescing would kick in and it would think it's okay to use the "HI"
1842     // subregister (instead of extracting the HI 32 bits) which is an artificial
1843     // (unusable) register.
1844     //  Register TableGen definitions would need an overhaul to get rid of the
1845     //  artificial "HI" aperture registers and prevent this kind of issue from
1846     //  happening.
1847     Register Dst = MRI.createGenericVirtualRegister(S64);
1848     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
1849     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
1850     return B.buildUnmerge(S32, Dst).getReg(1);
1851   }
1852 
1853   // TODO: can we be smarter about machine pointer info?
1854   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1855   Register LoadAddr = MRI.createGenericVirtualRegister(
1856     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1857   // For code object version 5, private_base and shared_base are passed through
1858   // implicit kernargs.
1859   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1860     AMDGPUTargetLowering::ImplicitParameter Param =
1861         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1862                                       : AMDGPUTargetLowering::PRIVATE_BASE;
1863     uint64_t Offset =
1864         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1865 
1866     Register KernargPtrReg = MRI.createGenericVirtualRegister(
1867         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1868 
1869     if (!loadInputValue(KernargPtrReg, B,
1870                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1871       return Register();
1872 
1873     MachineMemOperand *MMO = MF.getMachineMemOperand(
1874         PtrInfo,
1875         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1876             MachineMemOperand::MOInvariant,
1877         LLT::scalar(32), commonAlignment(Align(64), Offset));
1878 
1879     // Pointer address
1880     B.buildPtrAdd(LoadAddr, KernargPtrReg,
1881                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1882     // Load address
1883     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1884   }
1885 
1886   Register QueuePtr = MRI.createGenericVirtualRegister(
1887     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1888 
1889   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1890     return Register();
1891 
1892   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1893   // private_segment_aperture_base_hi.
1894   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1895 
1896   MachineMemOperand *MMO = MF.getMachineMemOperand(
1897       PtrInfo,
1898       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1899           MachineMemOperand::MOInvariant,
1900       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1901 
1902   B.buildPtrAdd(LoadAddr, QueuePtr,
1903                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1904   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1905 }
1906 
1907 /// Return true if the value is a known valid address, such that a null check is
1908 /// not necessary.
1909 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1910                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1911   MachineInstr *Def = MRI.getVRegDef(Val);
1912   switch (Def->getOpcode()) {
1913   case AMDGPU::G_FRAME_INDEX:
1914   case AMDGPU::G_GLOBAL_VALUE:
1915   case AMDGPU::G_BLOCK_ADDR:
1916     return true;
1917   case AMDGPU::G_CONSTANT: {
1918     const ConstantInt *CI = Def->getOperand(1).getCImm();
1919     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1920   }
1921   default:
1922     return false;
1923   }
1924 
1925   return false;
1926 }
1927 
1928 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1929   MachineInstr &MI, MachineRegisterInfo &MRI,
1930   MachineIRBuilder &B) const {
1931   MachineFunction &MF = B.getMF();
1932 
1933   const LLT S32 = LLT::scalar(32);
1934   Register Dst = MI.getOperand(0).getReg();
1935   Register Src = MI.getOperand(1).getReg();
1936 
1937   LLT DstTy = MRI.getType(Dst);
1938   LLT SrcTy = MRI.getType(Src);
1939   unsigned DestAS = DstTy.getAddressSpace();
1940   unsigned SrcAS = SrcTy.getAddressSpace();
1941 
1942   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1943   // vector element.
1944   assert(!DstTy.isVector());
1945 
1946   const AMDGPUTargetMachine &TM
1947     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1948 
1949   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1950     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1951     return true;
1952   }
1953 
1954   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1955       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1956        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1957     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1958       // Extract low 32-bits of the pointer.
1959       B.buildExtract(Dst, Src, 0);
1960       MI.eraseFromParent();
1961       return true;
1962     }
1963 
1964     unsigned NullVal = TM.getNullPointerValue(DestAS);
1965 
1966     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1967     auto FlatNull = B.buildConstant(SrcTy, 0);
1968 
1969     // Extract low 32-bits of the pointer.
1970     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1971 
1972     auto CmpRes =
1973         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1974     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1975 
1976     MI.eraseFromParent();
1977     return true;
1978   }
1979 
1980   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1981       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1982        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1983     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1984     if (!ApertureReg.isValid())
1985       return false;
1986 
1987     // Coerce the type of the low half of the result so we can use merge_values.
1988     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1989 
1990     // TODO: Should we allow mismatched types but matching sizes in merges to
1991     // avoid the ptrtoint?
1992     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
1993 
1994     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1995       B.buildCopy(Dst, BuildPtr);
1996       MI.eraseFromParent();
1997       return true;
1998     }
1999 
2000     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2001     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2002 
2003     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2004                               SegmentNull.getReg(0));
2005 
2006     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2007 
2008     MI.eraseFromParent();
2009     return true;
2010   }
2011 
2012   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2013       SrcTy.getSizeInBits() == 64) {
2014     // Truncate.
2015     B.buildExtract(Dst, Src, 0);
2016     MI.eraseFromParent();
2017     return true;
2018   }
2019 
2020   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2021       DstTy.getSizeInBits() == 64) {
2022     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2023     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2024     auto PtrLo = B.buildPtrToInt(S32, Src);
2025     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2026     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2027     MI.eraseFromParent();
2028     return true;
2029   }
2030 
2031   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2032       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2033 
2034   LLVMContext &Ctx = MF.getFunction().getContext();
2035   Ctx.diagnose(InvalidAddrSpaceCast);
2036   B.buildUndef(Dst);
2037   MI.eraseFromParent();
2038   return true;
2039 }
2040 
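// Lower f64 G_FRINT using the add-and-subtract trick with 2^52, which pushes
// the fractional bits out of the mantissa in the current rounding mode:
//   tmp = (src + copysign(2^52, src)) - copysign(2^52, src)
//   result = |src| > 0x1.fffffffffffffp+51 ? src : tmp
// Magnitudes above the threshold are already integers.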
2041 bool AMDGPULegalizerInfo::legalizeFrint(
2042   MachineInstr &MI, MachineRegisterInfo &MRI,
2043   MachineIRBuilder &B) const {
2044   Register Src = MI.getOperand(1).getReg();
2045   LLT Ty = MRI.getType(Src);
2046   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2047 
2048   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2049   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2050 
2051   auto C1 = B.buildFConstant(Ty, C1Val);
2052   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2053 
2054   // TODO: Should this propagate fast-math-flags?
2055   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2056   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2057 
2058   auto C2 = B.buildFConstant(Ty, C2Val);
2059   auto Fabs = B.buildFAbs(Ty, Src);
2060 
2061   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2062   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2063   MI.eraseFromParent();
2064   return true;
2065 }
2066 
2067 bool AMDGPULegalizerInfo::legalizeFceil(
2068   MachineInstr &MI, MachineRegisterInfo &MRI,
2069   MachineIRBuilder &B) const {
2070 
2071   const LLT S1 = LLT::scalar(1);
2072   const LLT S64 = LLT::scalar(64);
2073 
2074   Register Src = MI.getOperand(1).getReg();
2075   assert(MRI.getType(Src) == S64);
2076 
2077   // result = trunc(src)
2078   // if (src > 0.0 && src != result)
2079   //   result += 1.0
2080 
2081   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2082 
2083   const auto Zero = B.buildFConstant(S64, 0.0);
2084   const auto One = B.buildFConstant(S64, 1.0);
2085   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2086   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2087   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2088   auto Add = B.buildSelect(S64, And, One, Zero);
2089 
2090   // TODO: Should this propagate fast-math-flags?
2091   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2092   MI.eraseFromParent();
2093   return true;
2094 }
2095 
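// Lower G_FREM as:
//   frem(x, y) = x - trunc(x / y) * y
// using an fma for the final step: fma(-trunc(x / y), y, x).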
2096 bool AMDGPULegalizerInfo::legalizeFrem(
2097   MachineInstr &MI, MachineRegisterInfo &MRI,
2098   MachineIRBuilder &B) const {
2099   Register DstReg = MI.getOperand(0).getReg();
2100   Register Src0Reg = MI.getOperand(1).getReg();
2101   Register Src1Reg = MI.getOperand(2).getReg();
2102   auto Flags = MI.getFlags();
2103   LLT Ty = MRI.getType(DstReg);
2104 
2105   auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2106   auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2107   auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2108   B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2109   MI.eraseFromParent();
2110   return true;
2111 }
2112 
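// Extract the unbiased exponent from the high 32 bits of an f64: take bits
// [20, 30] of the high word (bits [52, 62] of the f64) with ubfe, then
// subtract the exponent bias of 1023.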
2113 static MachineInstrBuilder extractF64Exponent(Register Hi,
2114                                               MachineIRBuilder &B) {
2115   const unsigned FractBits = 52;
2116   const unsigned ExpBits = 11;
2117   LLT S32 = LLT::scalar(32);
2118 
2119   auto Const0 = B.buildConstant(S32, FractBits - 32);
2120   auto Const1 = B.buildConstant(S32, ExpBits);
2121 
2122   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2123     .addUse(Hi)
2124     .addUse(Const0.getReg(0))
2125     .addUse(Const1.getReg(0));
2126 
2127   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2128 }
2129 
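// Lower f64 G_INTRINSIC_TRUNC by clearing fractional mantissa bits based on
// the unbiased exponent:
//   exp < 0   -> result is +/-0 (only the sign bit survives)
//   exp > 51  -> src is already an integer, return it unchanged
//   otherwise -> clear the low (52 - exp) mantissa bits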
2130 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2131   MachineInstr &MI, MachineRegisterInfo &MRI,
2132   MachineIRBuilder &B) const {
2133   const LLT S1 = LLT::scalar(1);
2134   const LLT S32 = LLT::scalar(32);
2135   const LLT S64 = LLT::scalar(64);
2136 
2137   Register Src = MI.getOperand(1).getReg();
2138   assert(MRI.getType(Src) == S64);
2139 
2140   // TODO: Should this use extract since the low half is unused?
2141   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2142   Register Hi = Unmerge.getReg(1);
2143 
2144   // Extract the upper half, since this is where we will find the sign and
2145   // exponent.
2146   auto Exp = extractF64Exponent(Hi, B);
2147 
2148   const unsigned FractBits = 52;
2149 
2150   // Extract the sign bit.
2151   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2152   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2153 
2154   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2155 
2156   const auto Zero32 = B.buildConstant(S32, 0);
2157 
2158   // Extend back to 64-bits.
2159   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2160 
2161   auto Shr = B.buildAShr(S64, FractMask, Exp);
2162   auto Not = B.buildNot(S64, Shr);
2163   auto Tmp0 = B.buildAnd(S64, Src, Not);
2164   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2165 
2166   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2167   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2168 
2169   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2170   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2171   MI.eraseFromParent();
2172   return true;
2173 }
2174 
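// Lower 64-bit integer to FP conversions. For an f64 result:
//   itofp(x) = ldexp(itofp(hi32(x)), 32) + uitofp(lo32(x))
// For an f32 result, the source is first normalized with a left shift, the
// low half that does not fit into 32 bits is folded in as a sticky bit, and
// the converted value is rescaled with ldexp(fval, 32 - shamt).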
2175 bool AMDGPULegalizerInfo::legalizeITOFP(
2176   MachineInstr &MI, MachineRegisterInfo &MRI,
2177   MachineIRBuilder &B, bool Signed) const {
2178 
2179   Register Dst = MI.getOperand(0).getReg();
2180   Register Src = MI.getOperand(1).getReg();
2181 
2182   const LLT S64 = LLT::scalar(64);
2183   const LLT S32 = LLT::scalar(32);
2184 
2185   assert(MRI.getType(Src) == S64);
2186 
2187   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2188   auto ThirtyTwo = B.buildConstant(S32, 32);
2189 
2190   if (MRI.getType(Dst) == S64) {
2191     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2192                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2193 
2194     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2195     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2196                      .addUse(CvtHi.getReg(0))
2197                      .addUse(ThirtyTwo.getReg(0));
2198 
2199     // TODO: Should this propagate fast-math-flags?
2200     B.buildFAdd(Dst, LdExp, CvtLo);
2201     MI.eraseFromParent();
2202     return true;
2203   }
2204 
2205   assert(MRI.getType(Dst) == S32);
2206 
2207   auto One = B.buildConstant(S32, 1);
2208 
2209   MachineInstrBuilder ShAmt;
2210   if (Signed) {
2211     auto ThirtyOne = B.buildConstant(S32, 31);
2212     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2213     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2214     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2215     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2216                                /*HasSideEffects=*/false)
2217                   .addUse(Unmerge.getReg(1));
2218     auto LS2 = B.buildSub(S32, LS, One);
2219     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2220   } else
2221     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2222   auto Norm = B.buildShl(S64, Src, ShAmt);
2223   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2224   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2225   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2226   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2227   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2228   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2229                    /*HasSideEffects=*/false)
2230       .addUse(FVal.getReg(0))
2231       .addUse(Scale.getReg(0));
2232   MI.eraseFromParent();
2233   return true;
2234 }
2235 
2236 // TODO: Copied from DAG implementation. Verify logic and document how this
2237 // actually works.
2238 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2239                                         MachineRegisterInfo &MRI,
2240                                         MachineIRBuilder &B,
2241                                         bool Signed) const {
2242 
2243   Register Dst = MI.getOperand(0).getReg();
2244   Register Src = MI.getOperand(1).getReg();
2245 
2246   const LLT S64 = LLT::scalar(64);
2247   const LLT S32 = LLT::scalar(32);
2248 
2249   const LLT SrcLT = MRI.getType(Src);
2250   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2251 
2252   unsigned Flags = MI.getFlags();
2253 
2254   // The basic idea of converting a floating point number into a pair of 32-bit
2255   // integers is illustrated as follows:
2256   //
2257   //     tf := trunc(val);
2258   //    hif := floor(tf * 2^-32);
2259   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2260   //     hi := fptoi(hif);
2261   //     lo := fptoi(lof);
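  //
  // For example, for val = 2^32 + 5:
  //     tf = 4294967301.0, hif = 1.0, lof = 5.0, hi = 1, lo = 5.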
2262   //
2263   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2264   MachineInstrBuilder Sign;
2265   if (Signed && SrcLT == S32) {
2266     // However, a 32-bit floating point number has only a 23-bit mantissa,
2267     // which is not enough to hold all the significant bits of `lof` if val is
2268     // negative. To avoid the loss of precision, we need to take the absolute
2269     // value after truncating and flip the result back based on the original
2270     // signedness.
2271     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2272     Trunc = B.buildFAbs(S32, Trunc, Flags);
2273   }
2274   MachineInstrBuilder K0, K1;
2275   if (SrcLT == S64) {
2276     K0 = B.buildFConstant(S64,
2277                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2278     K1 = B.buildFConstant(S64,
2279                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2280   } else {
2281     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2282     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2283   }
2284 
2285   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2286   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2287   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2288 
2289   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2290                                      : B.buildFPTOUI(S32, FloorMul);
2291   auto Lo = B.buildFPTOUI(S32, Fma);
2292 
2293   if (Signed && SrcLT == S32) {
2294     // Flip the result based on the signedness, which is either all 0s or 1s.
2295     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2296     // r := xor({lo, hi}, sign) - sign;
2297     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2298                Sign);
2299   } else
2300     B.buildMergeLikeInstr(Dst, {Lo, Hi});
2301   MI.eraseFromParent();
2302 
2303   return true;
2304 }
2305 
2306 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2307                                                MachineInstr &MI) const {
2308   MachineFunction &MF = Helper.MIRBuilder.getMF();
2309   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2310 
2311   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2312                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2313 
2314   // With ieee_mode disabled, the instructions already have the correct
2315   // behavior for G_FMINNUM/G_FMAXNUM.
2316   if (!MFI->getMode().IEEE)
2317     return !IsIEEEOp;
2318 
2319   if (IsIEEEOp)
2320     return true;
2321 
2322   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2323 }
2324 
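// Lower G_EXTRACT_VECTOR_ELT with a known constant index by unmerging the
// vector and copying out the selected element; out-of-bounds indices become
// undef. Dynamic indices are left for register indexing during selection.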
2325 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2326   MachineInstr &MI, MachineRegisterInfo &MRI,
2327   MachineIRBuilder &B) const {
2328   // TODO: Should move some of this into LegalizerHelper.
2329 
2330   // TODO: Promote dynamic indexing of s16 to s32
2331 
2332   // FIXME: Artifact combiner probably should have replaced the truncated
2333   // constant before this, so we shouldn't need
2334   // getIConstantVRegValWithLookThrough.
2335   std::optional<ValueAndVReg> MaybeIdxVal =
2336       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2337   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2338     return true;
2339   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2340 
2341   Register Dst = MI.getOperand(0).getReg();
2342   Register Vec = MI.getOperand(1).getReg();
2343 
2344   LLT VecTy = MRI.getType(Vec);
2345   LLT EltTy = VecTy.getElementType();
2346   assert(EltTy == MRI.getType(Dst));
2347 
2348   if (IdxVal < VecTy.getNumElements()) {
2349     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2350     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2351   } else {
2352     B.buildUndef(Dst);
2353   }
2354 
2355   MI.eraseFromParent();
2356   return true;
2357 }
2358 
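// Lower G_INSERT_VECTOR_ELT with a known constant index by unmerging the
// vector, replacing the selected element, and re-merging; out-of-bounds
// indices become undef. Dynamic indices are handled during selection.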
2359 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2360   MachineInstr &MI, MachineRegisterInfo &MRI,
2361   MachineIRBuilder &B) const {
2362   // TODO: Should move some of this into LegalizerHelper.
2363 
2364   // TODO: Promote dynamic indexing of s16 to s32
2365 
2366   // FIXME: Artifact combiner probably should have replaced the truncated
2367   // constant before this, so we shouldn't need
2368   // getIConstantVRegValWithLookThrough.
2369   std::optional<ValueAndVReg> MaybeIdxVal =
2370       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2371   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2372     return true;
2373 
2374   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2375   Register Dst = MI.getOperand(0).getReg();
2376   Register Vec = MI.getOperand(1).getReg();
2377   Register Ins = MI.getOperand(2).getReg();
2378 
2379   LLT VecTy = MRI.getType(Vec);
2380   LLT EltTy = VecTy.getElementType();
2381   assert(EltTy == MRI.getType(Ins));
2382   (void)Ins;
2383 
2384   unsigned NumElts = VecTy.getNumElements();
2385   if (IdxVal < NumElts) {
2386     SmallVector<Register, 8> SrcRegs;
2387     for (unsigned i = 0; i < NumElts; ++i)
2388       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2389     B.buildUnmerge(SrcRegs, Vec);
2390 
2391     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2392     B.buildMergeLikeInstr(Dst, SrcRegs);
2393   } else {
2394     B.buildUndef(Dst);
2395   }
2396 
2397   MI.eraseFromParent();
2398   return true;
2399 }
2400 
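// Lower G_FSIN/G_FCOS to the amdgcn.sin/amdgcn.cos intrinsics, which expect
// an input pre-scaled by 1/(2*pi). On subtargets with a reduced trig input
// range, the scaled value is additionally wrapped into [0, 1) with
// amdgcn.fract.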
2401 bool AMDGPULegalizerInfo::legalizeSinCos(
2402   MachineInstr &MI, MachineRegisterInfo &MRI,
2403   MachineIRBuilder &B) const {
2404 
2405   Register DstReg = MI.getOperand(0).getReg();
2406   Register SrcReg = MI.getOperand(1).getReg();
2407   LLT Ty = MRI.getType(DstReg);
2408   unsigned Flags = MI.getFlags();
2409 
2410   Register TrigVal;
2411   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2412   if (ST.hasTrigReducedRange()) {
2413     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2414     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2415       .addUse(MulVal.getReg(0))
2416       .setMIFlags(Flags).getReg(0);
2417   } else
2418     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2419 
2420   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2421     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2422   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
2423       .addUse(TrigVal)
2424       .setMIFlags(Flags);
2425   MI.eraseFromParent();
2426   return true;
2427 }
2428 
2429 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2430                                                   MachineIRBuilder &B,
2431                                                   const GlobalValue *GV,
2432                                                   int64_t Offset,
2433                                                   unsigned GAFlags) const {
2434   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2435   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2436   // to the following code sequence:
2437   //
2438   // For constant address space:
2439   //   s_getpc_b64 s[0:1]
2440   //   s_add_u32 s0, s0, $symbol
2441   //   s_addc_u32 s1, s1, 0
2442   //
2443   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2444   //   a fixup or relocation is emitted to replace $symbol with a literal
2445   //   constant, which is a pc-relative offset from the encoding of the $symbol
2446   //   operand to the global variable.
2447   //
2448   // For global address space:
2449   //   s_getpc_b64 s[0:1]
2450   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2451   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2452   //
2453   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2454   //   fixups or relocations are emitted to replace $symbol@*@lo and
2455   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2456   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2457   //   operand to the global variable.
2458   //
2459   // What we want here is an offset from the value returned by s_getpc
2460   // (which is the address of the s_add_u32 instruction) to the global
2461   // variable, but since the encoding of $symbol starts 4 bytes after the start
2462   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2463   // small. This requires us to add 4 to the global variable offset in order to
2464   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2465   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2466   // instruction.
2467 
2468   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2469 
2470   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2471     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2472 
2473   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2474     .addDef(PCReg);
2475 
2476   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2477   if (GAFlags == SIInstrInfo::MO_NONE)
2478     MIB.addImm(0);
2479   else
2480     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2481 
2482   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2483 
2484   if (PtrTy.getSizeInBits() == 32)
2485     B.buildExtract(DstReg, PCReg, 0);
2486   return true;
2487 }
2488 
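// Legalize G_GLOBAL_VALUE. LDS globals are resolved to their allocated offset
// within the kernel's LDS; other globals use pc-relative addressing, either
// directly or through a GOT load.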
2489 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2490   MachineInstr &MI, MachineRegisterInfo &MRI,
2491   MachineIRBuilder &B) const {
2492   Register DstReg = MI.getOperand(0).getReg();
2493   LLT Ty = MRI.getType(DstReg);
2494   unsigned AS = Ty.getAddressSpace();
2495 
2496   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2497   MachineFunction &MF = B.getMF();
2498   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2499 
2500   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2501     if (!MFI->isModuleEntryFunction() &&
2502         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2503       const Function &Fn = MF.getFunction();
2504       DiagnosticInfoUnsupported BadLDSDecl(
2505         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2506         DS_Warning);
2507       Fn.getContext().diagnose(BadLDSDecl);
2508 
2509       // We currently don't have a way to correctly allocate LDS objects that
2510       // aren't directly associated with a kernel. We do force inlining of
2511       // functions that use local objects. However, if these dead functions are
2512       // not eliminated, we don't want a compile time error. Just emit a warning
2513       // and a trap, since there should be no callable path here.
2514       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2515       B.buildUndef(DstReg);
2516       MI.eraseFromParent();
2517       return true;
2518     }
2519 
2520     // TODO: We could emit code to handle the initialization somewhere.
2521     // We ignore the initializer for now and legalize it to allow selection.
2522     // The initializer will be diagnosed during assembly emission anyway.
2523     const SITargetLowering *TLI = ST.getTargetLowering();
2524     if (!TLI->shouldUseLDSConstAddress(GV)) {
2525       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2526       return true; // Leave in place;
2527     }
2528 
2529     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2530       Type *Ty = GV->getValueType();
2531       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2532       // zero-sized type in other languages to declare dynamic shared memory
2533       // whose size is not known at compile time. These arrays are allocated
2534       // by the runtime and placed directly after the statically allocated
2535       // ones. They all share the same offset.
2536       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2537         // Adjust alignment for that dynamic shared memory array.
2538         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2539         LLT S32 = LLT::scalar(32);
2540         auto Sz =
2541             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2542         B.buildIntToPtr(DstReg, Sz);
2543         MI.eraseFromParent();
2544         return true;
2545       }
2546     }
2547 
2548     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2549                                                    *cast<GlobalVariable>(GV)));
2550     MI.eraseFromParent();
2551     return true;
2552   }
2553 
2554   const SITargetLowering *TLI = ST.getTargetLowering();
2555 
2556   if (TLI->shouldEmitFixup(GV)) {
2557     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2558     MI.eraseFromParent();
2559     return true;
2560   }
2561 
2562   if (TLI->shouldEmitPCReloc(GV)) {
2563     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2564     MI.eraseFromParent();
2565     return true;
2566   }
2567 
2568   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2569   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2570 
2571   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2572   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2573       MachinePointerInfo::getGOT(MF),
2574       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2575           MachineMemOperand::MOInvariant,
2576       LoadTy, Align(8));
2577 
2578   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2579 
2580   if (Ty.getSizeInBits() == 32) {
2581     // Truncate if this is a 32-bit constant address.
2582     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2583     B.buildExtract(DstReg, Load, 0);
2584   } else
2585     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2586 
2587   MI.eraseFromParent();
2588   return true;
2589 }
2590 
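// Round a scalar up to the next power-of-2 bit width, or a vector up to the
// next power-of-2 element count.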
2591 static LLT widenToNextPowerOf2(LLT Ty) {
2592   if (Ty.isVector())
2593     return Ty.changeElementCount(
2594         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2595   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2596 }
2597 
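// Custom load legalization: cast 32-bit constant-address-space pointers to
// the 64-bit constant address space, and widen loads whose memory size is not
// a power of 2 when shouldWidenLoad says the wider access is safe/profitable.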
2598 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2599                                        MachineInstr &MI) const {
2600   MachineIRBuilder &B = Helper.MIRBuilder;
2601   MachineRegisterInfo &MRI = *B.getMRI();
2602   GISelChangeObserver &Observer = Helper.Observer;
2603 
2604   Register PtrReg = MI.getOperand(1).getReg();
2605   LLT PtrTy = MRI.getType(PtrReg);
2606   unsigned AddrSpace = PtrTy.getAddressSpace();
2607 
2608   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2609     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2610     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2611     Observer.changingInstr(MI);
2612     MI.getOperand(1).setReg(Cast.getReg(0));
2613     Observer.changedInstr(MI);
2614     return true;
2615   }
2616 
2617   if (MI.getOpcode() != AMDGPU::G_LOAD)
2618     return false;
2619 
2620   Register ValReg = MI.getOperand(0).getReg();
2621   LLT ValTy = MRI.getType(ValReg);
2622 
2623   MachineMemOperand *MMO = *MI.memoperands_begin();
2624   const unsigned ValSize = ValTy.getSizeInBits();
2625   const LLT MemTy = MMO->getMemoryType();
2626   const Align MemAlign = MMO->getAlign();
2627   const unsigned MemSize = MemTy.getSizeInBits();
2628   const uint64_t AlignInBits = 8 * MemAlign.value();
2629 
2630   // Widen non-power-of-2 loads to the alignment if needed
2631   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2632     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2633 
2634     // This was already the correct extending load result type, so just adjust
2635     // the memory type.
2636     if (WideMemSize == ValSize) {
2637       MachineFunction &MF = B.getMF();
2638 
2639       MachineMemOperand *WideMMO =
2640           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2641       Observer.changingInstr(MI);
2642       MI.setMemRefs(MF, {WideMMO});
2643       Observer.changedInstr(MI);
2644       return true;
2645     }
2646 
2647     // Don't bother with an edge case that should probably never be produced.
2648     if (ValSize > WideMemSize)
2649       return false;
2650 
2651     LLT WideTy = widenToNextPowerOf2(ValTy);
2652 
2653     Register WideLoad;
2654     if (!WideTy.isVector()) {
2655       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2656       B.buildTrunc(ValReg, WideLoad).getReg(0);
2657     } else {
2658       // Extract the subvector.
2659 
2660       if (isRegisterType(ValTy)) {
2661         // If this is a case where G_EXTRACT is legal, use it.
2662         // (e.g. <3 x s32> -> <4 x s32>)
2663         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2664         B.buildExtract(ValReg, WideLoad, 0);
2665       } else {
2666         // For cases where the widened type isn't a nice register value, drop the
2667         // trailing elements of the widened load (e.g. <3 x s16> loaded as <4 x s16>)
2668         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2669         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2670       }
2671     }
2672 
2673     MI.eraseFromParent();
2674     return true;
2675   }
2676 
2677   return false;
2678 }
2679 
2680 bool AMDGPULegalizerInfo::legalizeFMad(
2681   MachineInstr &MI, MachineRegisterInfo &MRI,
2682   MachineIRBuilder &B) const {
2683   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2684   assert(Ty.isScalar());
2685 
2686   MachineFunction &MF = B.getMF();
2687   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2688 
2689   // TODO: Always legal with future ftz flag.
2690   // FIXME: Do we need just output?
2691   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2692     return true;
2693   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2694     return true;
2695 
2696   MachineIRBuilder HelperBuilder(MI);
2697   GISelObserverWrapper DummyObserver;
2698   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2699   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2700 }
2701 
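// Lower G_ATOMIC_CMPXCHG on flat/global pointers by packing the new and
// compare values into a <2 x T> operand of G_AMDGPU_ATOMIC_CMPXCHG, matching
// the input marshalling expected by the buffer/flat cmpswap instructions.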
2702 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2703   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2704   Register DstReg = MI.getOperand(0).getReg();
2705   Register PtrReg = MI.getOperand(1).getReg();
2706   Register CmpVal = MI.getOperand(2).getReg();
2707   Register NewVal = MI.getOperand(3).getReg();
2708 
2709   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2710          "this should not have been custom lowered");
2711 
2712   LLT ValTy = MRI.getType(CmpVal);
2713   LLT VecTy = LLT::fixed_vector(2, ValTy);
2714 
2715   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2716 
2717   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2718     .addDef(DstReg)
2719     .addUse(PtrReg)
2720     .addUse(PackedVal)
2721     .setMemRefs(MI.memoperands());
2722 
2723   MI.eraseFromParent();
2724   return true;
2725 }
2726 
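// Lower G_FLOG/G_FLOG10 via log2:
//   log_b(x) = log2(x) * (1 / log2(b))
// The caller passes the precomputed constant: ln(2) for G_FLOG and
// ln(2)/ln(10) for G_FLOG10.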
2727 bool AMDGPULegalizerInfo::legalizeFlog(
2728   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2729   Register Dst = MI.getOperand(0).getReg();
2730   Register Src = MI.getOperand(1).getReg();
2731   LLT Ty = B.getMRI()->getType(Dst);
2732   unsigned Flags = MI.getFlags();
2733 
2734   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2735   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2736 
2737   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2738   MI.eraseFromParent();
2739   return true;
2740 }
2741 
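// Lower G_FEXP via exp2: exp(x) = exp2(x * log2(e)).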
2742 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2743                                        MachineIRBuilder &B) const {
2744   Register Dst = MI.getOperand(0).getReg();
2745   Register Src = MI.getOperand(1).getReg();
2746   unsigned Flags = MI.getFlags();
2747   LLT Ty = B.getMRI()->getType(Dst);
2748 
2749   auto K = B.buildFConstant(Ty, numbers::log2e);
2750   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2751   B.buildFExp2(Dst, Mul, Flags);
2752   MI.eraseFromParent();
2753   return true;
2754 }
2755 
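// Lower G_FPOW via exp2/log2: pow(x, y) = exp2(y * log2(x)). The multiply is
// done with amdgcn.fmul.legacy, where 0 * anything is 0.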
2756 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2757                                        MachineIRBuilder &B) const {
2758   Register Dst = MI.getOperand(0).getReg();
2759   Register Src0 = MI.getOperand(1).getReg();
2760   Register Src1 = MI.getOperand(2).getReg();
2761   unsigned Flags = MI.getFlags();
2762   LLT Ty = B.getMRI()->getType(Dst);
2763   const LLT S16 = LLT::scalar(16);
2764   const LLT S32 = LLT::scalar(32);
2765 
2766   if (Ty == S32) {
2767     auto Log = B.buildFLog2(S32, Src0, Flags);
2768     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2769       .addUse(Log.getReg(0))
2770       .addUse(Src1)
2771       .setMIFlags(Flags);
2772     B.buildFExp2(Dst, Mul, Flags);
2773   } else if (Ty == S16) {
2774     // There's no f16 fmul_legacy, so we need to convert for it.
2775     auto Log = B.buildFLog2(S16, Src0, Flags);
2776     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2777     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2778     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2779       .addUse(Ext0.getReg(0))
2780       .addUse(Ext1.getReg(0))
2781       .setMIFlags(Flags);
2782 
2783     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2784   } else
2785     return false;
2786 
2787   MI.eraseFromParent();
2788   return true;
2789 }
2790 
2791 // Find a source register, ignoring any possible source modifiers.
2792 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2793   Register ModSrc = OrigSrc;
2794   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2795     ModSrc = SrcFNeg->getOperand(1).getReg();
2796     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2797       ModSrc = SrcFAbs->getOperand(1).getReg();
2798   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2799     ModSrc = SrcFAbs->getOperand(1).getReg();
2800   return ModSrc;
2801 }
2802 
2803 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2804                                          MachineRegisterInfo &MRI,
2805                                          MachineIRBuilder &B) const {
2806 
2807   const LLT S1 = LLT::scalar(1);
2808   const LLT S64 = LLT::scalar(64);
2809   Register Dst = MI.getOperand(0).getReg();
2810   Register OrigSrc = MI.getOperand(1).getReg();
2811   unsigned Flags = MI.getFlags();
2812   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2813          "this should not have been custom lowered");
2814 
2815   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2816   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2817   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2818   // V_FRACT bug is:
2819   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2820   //
2821   // Convert floor(x) to (x - fract(x))
2822 
2823   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2824     .addUse(OrigSrc)
2825     .setMIFlags(Flags);
2826 
2827   // Give source modifier matching some assistance before obscuring a foldable
2828   // pattern.
2829 
2830   // TODO: We can avoid the neg on the fract? The input sign to fract
2831   // shouldn't matter?
2832   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2833 
2834   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2835 
2836   Register Min = MRI.createGenericVirtualRegister(S64);
2837 
2838   // We don't need to concern ourselves with the snan handling difference, so
2839   // use the one which will directly select.
2840   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2841   if (MFI->getMode().IEEE)
2842     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2843   else
2844     B.buildFMinNum(Min, Fract, Const, Flags);
2845 
2846   Register CorrectedFract = Min;
2847   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2848     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2849     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2850   }
2851 
2852   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2853   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2854 
2855   MI.eraseFromParent();
2856   return true;
2857 }
2858 
2859 // Turn an illegal packed v2s16 build vector into bit operations.
2860 // TODO: This should probably be a bitcast action in LegalizerHelper.
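// As a rough sketch, G_BUILD_VECTOR_TRUNC %x:s32, %y:s32 becomes
//   %lo:s16 = G_TRUNC %x
//   %hi:s16 = G_TRUNC %y
//   %m:s32  = G_MERGE_VALUES %lo, %hi
//   %dst:<2 x s16> = G_BITCAST %m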
2861 bool AMDGPULegalizerInfo::legalizeBuildVector(
2862   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2863   Register Dst = MI.getOperand(0).getReg();
2864   const LLT S32 = LLT::scalar(32);
2865   const LLT S16 = LLT::scalar(16);
2866   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2867 
2868   Register Src0 = MI.getOperand(1).getReg();
2869   Register Src1 = MI.getOperand(2).getReg();
2870 
2871   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
2872     assert(MRI.getType(Src0) == S32);
2873     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
2874     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
2875   }
2876 
2877   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
2878   B.buildBitcast(Dst, Merge);
2879 
2880   MI.eraseFromParent();
2881   return true;
2882 }
2883 
2884 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2885 //
2886 // Source and accumulation registers must all be 32-bits.
2887 //
2888 // TODO: When the multiply is uniform, we should produce a code sequence
2889 // that is better suited to instruction selection on the SALU. Instead of
2890 // the outer loop going over parts of the result, the outer loop should go
2891 // over parts of one of the factors. This should result in instruction
2892 // selection that makes full use of S_ADDC_U32 instructions.
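//
// As an illustrative sketch (assuming two 32-bit parts per operand and a
// two-part result), the result parts are the column sums of the schoolbook
// scheme:
//   Accum[0] = lo(Src0[0]*Src1[0])
//   Accum[1] = hi(Src0[0]*Src1[0]) + lo(Src0[0]*Src1[1]) + lo(Src0[1]*Src1[0])
// MAD_64_32 supplies the 64-bit lo/hi pairs, while plain 32-bit multiplies are
// used where the high half would be discarded anyway.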
2893 void AMDGPULegalizerInfo::buildMultiply(
2894     LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
2895     ArrayRef<Register> Src0, ArrayRef<Register> Src1,
2896     bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2897   // Use (possibly empty) vectors of S1 registers to represent the set of
2898   // carries from one pair of positions to the next.
2899   using Carry = SmallVector<Register, 2>;
2900 
2901   MachineIRBuilder &B = Helper.MIRBuilder;
2902 
2903   const LLT S1 = LLT::scalar(1);
2904   const LLT S32 = LLT::scalar(32);
2905   const LLT S64 = LLT::scalar(64);
2906 
2907   Register Zero32;
2908   Register Zero64;
2909 
2910   auto getZero32 = [&]() -> Register {
2911     if (!Zero32)
2912       Zero32 = B.buildConstant(S32, 0).getReg(0);
2913     return Zero32;
2914   };
2915   auto getZero64 = [&]() -> Register {
2916     if (!Zero64)
2917       Zero64 = B.buildConstant(S64, 0).getReg(0);
2918     return Zero64;
2919   };
2920 
2921   // Merge the given carries into the 32-bit LocalAccum, which is modified
2922   // in-place.
2923   //
2924   // Returns the carry-out, which is a single S1 register or null.
2925   auto mergeCarry =
2926       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2927         if (CarryIn.empty())
2928           return Register();
2929 
2930         bool HaveCarryOut = true;
2931         Register CarryAccum;
2932         if (CarryIn.size() == 1) {
2933           if (!LocalAccum) {
2934             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2935             return Register();
2936           }
2937 
2938           CarryAccum = getZero32();
2939         } else {
2940           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2941           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2942             CarryAccum =
2943                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2944                     .getReg(0);
2945           }
2946 
2947           if (!LocalAccum) {
2948             LocalAccum = getZero32();
2949             HaveCarryOut = false;
2950           }
2951         }
2952 
2953         auto Add =
2954             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2955         LocalAccum = Add.getReg(0);
2956         return HaveCarryOut ? Add.getReg(1) : Register();
2957       };
2958 
2959   // Build a multiply-add chain to compute
2960   //
2961   //   LocalAccum + (partial products at DstIndex)
2962   //       + (opportunistic subset of CarryIn)
2963   //
2964   // LocalAccum is an array of one or two 32-bit registers that are updated
2965   // in-place. The incoming registers may be null.
2966   //
2967   // In some edge cases, carry-ins can be consumed "for free". In that case,
2968   // the consumed carry bits are removed from CarryIn in-place.
2969   auto buildMadChain =
2970       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2971           -> Carry {
2972         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2973                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2974 
2975         Carry CarryOut;
2976         unsigned j0 = 0;
2977 
2978         // Use plain 32-bit multiplication for the most significant part of the
2979         // result by default.
2980         if (LocalAccum.size() == 1 &&
2981             (!UsePartialMad64_32 || !CarryIn.empty())) {
2982           do {
2983             unsigned j1 = DstIndex - j0;
2984             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
2985             if (!LocalAccum[0]) {
2986               LocalAccum[0] = Mul.getReg(0);
2987             } else {
2988               if (CarryIn.empty()) {
2989                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
2990               } else {
2991                 LocalAccum[0] =
2992                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
2993                         .getReg(0);
2994                 CarryIn.pop_back();
2995               }
2996             }
2997             ++j0;
2998           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
2999         }
3000 
3001         // Build full 64-bit multiplies.
3002         if (j0 <= DstIndex) {
3003           bool HaveSmallAccum = false;
3004           Register Tmp;
3005 
3006           if (LocalAccum[0]) {
3007             if (LocalAccum.size() == 1) {
3008               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3009               HaveSmallAccum = true;
3010             } else if (LocalAccum[1]) {
3011               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3012               HaveSmallAccum = false;
3013             } else {
3014               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3015               HaveSmallAccum = true;
3016             }
3017           } else {
3018             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3019             Tmp = getZero64();
3020             HaveSmallAccum = true;
3021           }
3022 
3023           do {
3024             unsigned j1 = DstIndex - j0;
3025             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3026                                     {Src0[j0], Src1[j1], Tmp});
3027             Tmp = Mad.getReg(0);
3028             if (!HaveSmallAccum)
3029               CarryOut.push_back(Mad.getReg(1));
3030             HaveSmallAccum = false;
3031             ++j0;
3032           } while (j0 <= DstIndex);
3033 
3034           auto Unmerge = B.buildUnmerge(S32, Tmp);
3035           LocalAccum[0] = Unmerge.getReg(0);
3036           if (LocalAccum.size() > 1)
3037             LocalAccum[1] = Unmerge.getReg(1);
3038         }
3039 
3040         return CarryOut;
3041       };
3042 
3043   // Outer multiply loop, iterating over destination parts from least
3044   // significant to most significant parts.
3045   //
3046   // The columns of the following diagram correspond to the destination parts
3047   // affected by one iteration of the outer loop (ignoring boundary
3048   // conditions).
3049   //
3050   //   Dest index relative to 2 * i:      1 0 -1
3051   //                                      ------
3052   //   Carries from previous iteration:     e o
3053   //   Even-aligned partial product sum:  E E .
3054   //   Odd-aligned partial product sum:     O O
3055   //
3056   // 'o' is OddCarry, 'e' is EvenCarry.
3057   // EE and OO are computed from partial products via buildMadChain and use
3058   // accumulation where possible and appropriate.
3059   //
3060   Register SeparateOddCarry;
3061   Carry EvenCarry;
3062   Carry OddCarry;
3063 
3064   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3065     Carry OddCarryIn = std::move(OddCarry);
3066     Carry EvenCarryIn = std::move(EvenCarry);
3067     OddCarry.clear();
3068     EvenCarry.clear();
3069 
3070     // Partial products at offset 2 * i.
3071     if (2 * i < Accum.size()) {
3072       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3073       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3074     }
3075 
3076     // Partial products at offset 2 * i - 1.
3077     if (i > 0) {
3078       if (!SeparateOddAlignedProducts) {
3079         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3080         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3081       } else {
3082         bool IsHighest = 2 * i >= Accum.size();
3083         Register SeparateOddOut[2];
3084         auto LocalAccum = MutableArrayRef(SeparateOddOut)
3085                               .take_front(IsHighest ? 1 : 2);
3086         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3087 
3088         MachineInstr *Lo;
3089 
3090         if (i == 1) {
3091           if (!IsHighest)
3092             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3093           else
3094             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3095         } else {
3096           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3097                             SeparateOddCarry);
3098         }
3099         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3100 
3101         if (!IsHighest) {
3102           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3103                                 Lo->getOperand(1).getReg());
3104           Accum[2 * i] = Hi.getReg(0);
3105           SeparateOddCarry = Hi.getReg(1);
3106         }
3107       }
3108     }
3109 
3110     // Add in the carries from the previous iteration
3111     if (i > 0) {
3112       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3113         EvenCarryIn.push_back(CarryOut);
3114 
3115       if (2 * i < Accum.size()) {
3116         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3117           OddCarry.push_back(CarryOut);
3118       }
3119     }
3120   }
3121 }
3122 
3123 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3124 //
3125 // TODO: If the multiply is followed by an addition, we should attempt to
3126 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3127 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3128                                       MachineInstr &MI) const {
3129   assert(ST.hasMad64_32());
3130   assert(MI.getOpcode() == TargetOpcode::G_MUL);
3131 
3132   MachineIRBuilder &B = Helper.MIRBuilder;
3133   MachineRegisterInfo &MRI = *B.getMRI();
3134 
3135   Register DstReg = MI.getOperand(0).getReg();
3136   Register Src0 = MI.getOperand(1).getReg();
3137   Register Src1 = MI.getOperand(2).getReg();
3138 
3139   LLT Ty = MRI.getType(DstReg);
3140   assert(Ty.isScalar());
3141 
3142   unsigned Size = Ty.getSizeInBits();
3143   unsigned NumParts = Size / 32;
3144   assert((Size % 32) == 0);
3145   assert(NumParts >= 2);
3146 
3147   // Whether to use MAD_64_32 for partial products whose high half is
3148   // discarded. This avoids some ADD instructions but risks false dependency
3149   // stalls on some subtargets in some cases.
3150   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3151 
3152   // Whether to compute odd-aligned partial products separately. This is
3153   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3154   // in an even-aligned VGPR.
3155   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3156 
3157   LLT S32 = LLT::scalar(32);
3158   SmallVector<Register, 2> Src0Parts, Src1Parts;
3159   for (unsigned i = 0; i < NumParts; ++i) {
3160     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3161     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3162   }
3163   B.buildUnmerge(Src0Parts, Src0);
3164   B.buildUnmerge(Src1Parts, Src1);
3165 
3166   SmallVector<Register, 2> AccumRegs(NumParts);
3167   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3168                 SeparateOddAlignedProducts);
3169 
3170   B.buildMergeLikeInstr(DstReg, AccumRegs);
3171   MI.eraseFromParent();
3172   return true;
3173 }
3175 
3176 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3177 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3178 // case with a single min instruction instead of a compare+select.
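// For example, for a 32-bit source:
//   ctlz(x) -> umin(ffbh(x), 32)
//   cttz(x) -> umin(ffbl(x), 32)
// since ffbh/ffbl return -1 (all ones) for a zero input, which the unsigned
// min clamps to the bit width.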
3179 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3180                                             MachineRegisterInfo &MRI,
3181                                             MachineIRBuilder &B) const {
3182   Register Dst = MI.getOperand(0).getReg();
3183   Register Src = MI.getOperand(1).getReg();
3184   LLT DstTy = MRI.getType(Dst);
3185   LLT SrcTy = MRI.getType(Src);
3186 
3187   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3188                         ? AMDGPU::G_AMDGPU_FFBH_U32
3189                         : AMDGPU::G_AMDGPU_FFBL_B32;
3190   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3191   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3192 
3193   MI.eraseFromParent();
3194   return true;
3195 }
3196 
3197 // Check that this is a G_XOR x, -1
3198 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3199   if (MI.getOpcode() != TargetOpcode::G_XOR)
3200     return false;
3201   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3202   return ConstVal && *ConstVal == -1;
3203 }
3204 
3205 // Return the use branch instruction, or null if the usage is invalid.
3206 static MachineInstr *
3207 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3208                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3209   Register CondDef = MI.getOperand(0).getReg();
3210   if (!MRI.hasOneNonDBGUse(CondDef))
3211     return nullptr;
3212 
3213   MachineBasicBlock *Parent = MI.getParent();
3214   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3215 
3216   if (isNot(MRI, *UseMI)) {
3217     Register NegatedCond = UseMI->getOperand(0).getReg();
3218     if (!MRI.hasOneNonDBGUse(NegatedCond))
3219       return nullptr;
3220 
3221     // We're deleting the def of this value, so we need to remove it.
3222     eraseInstr(*UseMI, MRI);
3223 
3224     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3225     Negated = true;
3226   }
3227 
3228   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3229     return nullptr;
3230 
3231   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3232   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3233   if (Next == Parent->end()) {
3234     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3235     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3236       return nullptr;
3237     UncondBrTarget = &*NextMBB;
3238   } else {
3239     if (Next->getOpcode() != AMDGPU::G_BR)
3240       return nullptr;
3241     Br = &*Next;
3242     UncondBrTarget = Br->getOperand(0).getMBB();
3243   }
3244 
3245   return UseMI;
3246 }
3247 
3248 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3249                                          const ArgDescriptor *Arg,
3250                                          const TargetRegisterClass *ArgRC,
3251                                          LLT ArgTy) const {
3252   MCRegister SrcReg = Arg->getRegister();
3253   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3254   assert(DstReg.isVirtual() && "Virtual register expected");
3255 
3256   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3257                                              *ArgRC, B.getDebugLoc(), ArgTy);
3258   if (Arg->isMasked()) {
3259     // TODO: Should we try to emit this once in the entry block?
3260     const LLT S32 = LLT::scalar(32);
3261     const unsigned Mask = Arg->getMask();
3262     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
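    // For packed workitem IDs, Mask selects one bit field of the packed
    // register (e.g. a 10-bit ID), so shift the field down and then mask off
    // whatever is packed above it.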
3263 
3264     Register AndMaskSrc = LiveIn;
3265 
3266     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3267     // 0.
3268     if (Shift != 0) {
3269       auto ShiftAmt = B.buildConstant(S32, Shift);
3270       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3271     }
3272 
3273     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3274   } else {
3275     B.buildCopy(DstReg, LiveIn);
3276   }
3277 
3278   return true;
3279 }
3280 
3281 bool AMDGPULegalizerInfo::loadInputValue(
3282     Register DstReg, MachineIRBuilder &B,
3283     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3284   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3285   const ArgDescriptor *Arg;
3286   const TargetRegisterClass *ArgRC;
3287   LLT ArgTy;
3288   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3289 
3290   if (!Arg) {
3291     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3292       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3293       // case the pointer argument may be missing and we use null.
3294       B.buildConstant(DstReg, 0);
3295       return true;
3296     }
3297 
3298     // It's undefined behavior if a function marked with the amdgpu-no-*
3299     // attributes uses the corresponding intrinsic.
3300     B.buildUndef(DstReg);
3301     return true;
3302   }
3303 
3304   if (!Arg->isRegister() || !Arg->getRegister().isValid())
3305     return false; // TODO: Handle these
3306   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3307 }
3308 
3309 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3310     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3311     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3312   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3313     return false;
3314 
3315   MI.eraseFromParent();
3316   return true;
3317 }
3318 
3319 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3320                                 int64_t C) {
3321   B.buildConstant(MI.getOperand(0).getReg(), C);
3322   MI.eraseFromParent();
3323   return true;
3324 }
3325 
3326 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3327     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3328     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3329   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3330   if (MaxID == 0)
3331     return replaceWithConstant(B, MI, 0);
3332 
3333   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3334   const ArgDescriptor *Arg;
3335   const TargetRegisterClass *ArgRC;
3336   LLT ArgTy;
3337   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3338 
3339   Register DstReg = MI.getOperand(0).getReg();
3340   if (!Arg) {
3341     // It's undefined behavior if a function marked with the amdgpu-no-*
3342     // attributes uses the corresponding intrinsic.
3343     B.buildUndef(DstReg);
3344     MI.eraseFromParent();
3345     return true;
3346   }
3347 
3348   if (Arg->isMasked()) {
3349     // Don't bother inserting AssertZext for packed IDs since we're emitting the
3350     // masking operations anyway.
3351     //
3352     // TODO: We could assert the top bit is 0 for the source copy.
3353     if (!loadInputValue(DstReg, B, ArgType))
3354       return false;
3355   } else {
3356     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3357     if (!loadInputValue(TmpReg, B, ArgType))
3358       return false;
3359     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
3360   }
3361 
3362   MI.eraseFromParent();
3363   return true;
3364 }
3365 
3366 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3367                                                      int64_t Offset) const {
3368   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3369   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3370 
3371   // TODO: If we passed in the base kernel offset we could have a better
3372   // alignment than 4, but we don't really need it.
3373   if (!loadInputValue(KernArgReg, B,
3374                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3375     llvm_unreachable("failed to find kernarg segment ptr");
3376 
3377   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3378   // TODO: Should get nuw
3379   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3380 }
3381 
3382 /// Legalize a value that's loaded from kernel arguments. This is only used by
3383 /// legacy intrinsics.
3384 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3385                                                       MachineIRBuilder &B,
3386                                                       uint64_t Offset,
3387                                                       Align Alignment) const {
3388   Register DstReg = MI.getOperand(0).getReg();
3389 
3390   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3391          "unexpected kernarg parameter type");
3392 
3393   Register Ptr = getKernargParameterPtr(B, Offset);
3394   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3395   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3396               MachineMemOperand::MODereferenceable |
3397                   MachineMemOperand::MOInvariant);
3398   MI.eraseFromParent();
3399   return true;
3400 }
3401 
3402 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3403                                        MachineRegisterInfo &MRI,
3404                                        MachineIRBuilder &B) const {
3405   Register Dst = MI.getOperand(0).getReg();
3406   LLT DstTy = MRI.getType(Dst);
3407   LLT S16 = LLT::scalar(16);
3408   LLT S32 = LLT::scalar(32);
3409   LLT S64 = LLT::scalar(64);
3410 
3411   if (DstTy == S16)
3412     return legalizeFDIV16(MI, MRI, B);
3413   if (DstTy == S32)
3414     return legalizeFDIV32(MI, MRI, B);
3415   if (DstTy == S64)
3416     return legalizeFDIV64(MI, MRI, B);
3417 
3418   return false;
3419 }
3420 
3421 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3422                                                         Register DstDivReg,
3423                                                         Register DstRemReg,
3424                                                         Register X,
3425                                                         Register Y) const {
3426   const LLT S1 = LLT::scalar(1);
3427   const LLT S32 = LLT::scalar(32);
3428 
3429   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3430   // algorithm used here.
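  // Roughly: form a fixed-point estimate of 2^32/Y from the hardware f32
  // reciprocal, refine it with one Newton-Raphson step, compute a
  // quotient/remainder estimate, and correct it with at most two conditional
  // adjustments.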
3431 
3432   // Initial estimate of inv(y).
3433   auto FloatY = B.buildUITOFP(S32, Y);
3434   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
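  // 0x4f7ffffe is 2^32 - 512 as an f32, i.e. just below 2^32, so the scaled
  // reciprocal is a slightly-low fixed-point estimate of 2^32/Y.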
3435   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3436   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3437   auto Z = B.buildFPTOUI(S32, ScaledY);
3438 
3439   // One round of UNR.
3440   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3441   auto NegYZ = B.buildMul(S32, NegY, Z);
3442   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3443 
3444   // Quotient/remainder estimate.
3445   auto Q = B.buildUMulH(S32, X, Z);
3446   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3447 
3448   // First quotient/remainder refinement.
3449   auto One = B.buildConstant(S32, 1);
3450   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3451   if (DstDivReg)
3452     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3453   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3454 
3455   // Second quotient/remainder refinement.
3456   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3457   if (DstDivReg)
3458     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3459 
3460   if (DstRemReg)
3461     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3462 }
3463 
3464 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3465 //
3466 // Return lo, hi of result
3467 //
3468 // %cvt.lo = G_UITOFP Val.lo
3469 // %cvt.hi = G_UITOFP Val.hi
3470 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3471 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3472 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3473 // %mul2 = G_FMUL %mul1, 2**(-32)
3474 // %trunc = G_INTRINSIC_TRUNC %mul2
3475 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3476 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3477 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3478                                                        Register Val) {
3479   const LLT S32 = LLT::scalar(32);
3480   auto Unmerge = B.buildUnmerge(S32, Val);
3481 
3482   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3483   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3484 
3485   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3486                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3487 
3488   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3489   auto Mul1 =
3490       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3491 
3492   // 2**(-32)
3493   auto Mul2 =
3494       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3495   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3496 
3497   // -(2**32)
3498   auto Mad2 = B.buildFMAD(S32, Trunc,
3499                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3500 
3501   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3502   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3503 
3504   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3505 }
3506 
3507 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3508                                                         Register DstDivReg,
3509                                                         Register DstRemReg,
3510                                                         Register Numer,
3511                                                         Register Denom) const {
3512   const LLT S32 = LLT::scalar(32);
3513   const LLT S64 = LLT::scalar(64);
3514   const LLT S1 = LLT::scalar(1);
3515   Register RcpLo, RcpHi;
3516 
3517   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3518 
3519   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
3520 
3521   auto Zero64 = B.buildConstant(S64, 0);
3522   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3523 
3524   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3525   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3526 
3527   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3528   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3529   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3530 
3531   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3532   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3533   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
3534 
3535   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3536   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3537   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3538   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3539   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3540 
3541   auto Zero32 = B.buildConstant(S32, 0);
3542   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3543   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3544   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
3545 
3546   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3547   Register NumerLo = UnmergeNumer.getReg(0);
3548   Register NumerHi = UnmergeNumer.getReg(1);
3549 
3550   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3551   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3552   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3553   Register Mul3_Lo = UnmergeMul3.getReg(0);
3554   Register Mul3_Hi = UnmergeMul3.getReg(1);
3555   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3556   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3557   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3558   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
3559 
3560   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3561   Register DenomLo = UnmergeDenom.getReg(0);
3562   Register DenomHi = UnmergeDenom.getReg(1);
3563 
3564   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3565   auto C1 = B.buildSExt(S32, CmpHi);
3566 
3567   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3568   auto C2 = B.buildSExt(S32, CmpLo);
3569 
3570   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3571   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3572 
3573   // TODO: Here and below, portions of the code could be enclosed in if/endif
3574   // blocks. Currently the control flow is unconditional, and we emit 4 selects
3575   // after the potential endif to substitute for PHIs.
3576 
3577   // if C3 != 0 ...
3578   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3579   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3580   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3581   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
3582 
3583   auto One64 = B.buildConstant(S64, 1);
3584   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3585 
3586   auto C4 =
3587       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3588   auto C5 =
3589       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3590   auto C6 = B.buildSelect(
3591       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3592 
3593   // if (C6 != 0)
3594   auto Add4 = B.buildAdd(S64, Add3, One64);
3595   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3596 
3597   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3598   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3599   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
3600 
3601   // endif C6
3602   // endif C3
3603 
3604   if (DstDivReg) {
3605     auto Sel1 = B.buildSelect(
3606         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3607     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3608                   Sel1, MulHi3);
3609   }
3610 
3611   if (DstRemReg) {
3612     auto Sel2 = B.buildSelect(
3613         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3614     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3615                   Sel2, Sub1);
3616   }
3617 }
3618 
3619 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3620                                                   MachineRegisterInfo &MRI,
3621                                                   MachineIRBuilder &B) const {
3622   Register DstDivReg, DstRemReg;
3623   switch (MI.getOpcode()) {
3624   default:
3625     llvm_unreachable("Unexpected opcode!");
3626   case AMDGPU::G_UDIV: {
3627     DstDivReg = MI.getOperand(0).getReg();
3628     break;
3629   }
3630   case AMDGPU::G_UREM: {
3631     DstRemReg = MI.getOperand(0).getReg();
3632     break;
3633   }
3634   case AMDGPU::G_UDIVREM: {
3635     DstDivReg = MI.getOperand(0).getReg();
3636     DstRemReg = MI.getOperand(1).getReg();
3637     break;
3638   }
3639   }
3640 
3641   const LLT S64 = LLT::scalar(64);
3642   const LLT S32 = LLT::scalar(32);
3643   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3644   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3645   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3646   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3647 
3648   if (Ty == S32)
3649     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3650   else if (Ty == S64)
3651     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3652   else
3653     return false;
3654 
3655   MI.eraseFromParent();
3656   return true;
3657 }
3658 
3659 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3660                                                 MachineRegisterInfo &MRI,
3661                                                 MachineIRBuilder &B) const {
3662   const LLT S64 = LLT::scalar(64);
3663   const LLT S32 = LLT::scalar(32);
3664 
3665   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3666   if (Ty != S32 && Ty != S64)
3667     return false;
3668 
3669   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3670   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3671   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3672 
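  // Take absolute values: (x + sign) ^ sign == |x|, where sign is
  // x >> (bitwidth - 1), i.e. all ones for negative x and zero otherwise.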
3673   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3674   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3675   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3676 
3677   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3678   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3679 
3680   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3681   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3682 
3683   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3684   switch (MI.getOpcode()) {
3685   default:
3686     llvm_unreachable("Unexpected opcode!");
3687   case AMDGPU::G_SDIV: {
3688     DstDivReg = MI.getOperand(0).getReg();
3689     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3690     break;
3691   }
3692   case AMDGPU::G_SREM: {
3693     DstRemReg = MI.getOperand(0).getReg();
3694     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3695     break;
3696   }
3697   case AMDGPU::G_SDIVREM: {
3698     DstDivReg = MI.getOperand(0).getReg();
3699     DstRemReg = MI.getOperand(1).getReg();
3700     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3701     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3702     break;
3703   }
3704   }
3705 
3706   if (Ty == S32)
3707     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3708   else
3709     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3710 
3711   if (DstDivReg) {
3712     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3713     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3714     B.buildSub(DstDivReg, SignXor, Sign);
3715   }
3716 
3717   if (DstRemReg) {
3718     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3719     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3720     B.buildSub(DstRemReg, SignXor, Sign);
3721   }
3722 
3723   MI.eraseFromParent();
3724   return true;
3725 }
3726 
3727 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3728                                                  MachineRegisterInfo &MRI,
3729                                                  MachineIRBuilder &B) const {
3730   Register Res = MI.getOperand(0).getReg();
3731   Register LHS = MI.getOperand(1).getReg();
3732   Register RHS = MI.getOperand(2).getReg();
3733   uint16_t Flags = MI.getFlags();
3734   LLT ResTy = MRI.getType(Res);
3735 
3736   const MachineFunction &MF = B.getMF();
3737   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3738                             MI.getFlag(MachineInstr::FmAfn);
3739 
3740   if (!AllowInaccurateRcp)
3741     return false;
3742 
3743   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3744     // 1 / x -> RCP(x)
3745     if (CLHS->isExactlyValue(1.0)) {
3746       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3747         .addUse(RHS)
3748         .setMIFlags(Flags);
3749 
3750       MI.eraseFromParent();
3751       return true;
3752     }
3753 
3754     // -1 / x -> RCP( FNEG(x) )
3755     if (CLHS->isExactlyValue(-1.0)) {
3756       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3757       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3758         .addUse(FNeg.getReg(0))
3759         .setMIFlags(Flags);
3760 
3761       MI.eraseFromParent();
3762       return true;
3763     }
3764   }
3765 
3766   // x / y -> x * (1.0 / y)
3767   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3768     .addUse(RHS)
3769     .setMIFlags(Flags);
3770   B.buildFMul(Res, LHS, RCP, Flags);
3771 
3772   MI.eraseFromParent();
3773   return true;
3774 }
3775 
3776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3777                                                    MachineRegisterInfo &MRI,
3778                                                    MachineIRBuilder &B) const {
3779   Register Res = MI.getOperand(0).getReg();
3780   Register X = MI.getOperand(1).getReg();
3781   Register Y = MI.getOperand(2).getReg();
3782   uint16_t Flags = MI.getFlags();
3783   LLT ResTy = MRI.getType(Res);
3784 
3785   const MachineFunction &MF = B.getMF();
3786   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3787                             MI.getFlag(MachineInstr::FmAfn);
3788 
3789   if (!AllowInaccurateRcp)
3790     return false;
3791 
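  // x/y ~= x * rcp(y), with the rcp estimate refined by two Newton-Raphson
  // steps (r' = r + r * (1 - y * r)) and a final FMA-based residual fix-up.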
3792   auto NegY = B.buildFNeg(ResTy, Y);
3793   auto One = B.buildFConstant(ResTy, 1.0);
3794 
3795   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3796     .addUse(Y)
3797     .setMIFlags(Flags);
3798 
3799   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3800   R = B.buildFMA(ResTy, Tmp0, R, R);
3801 
3802   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3803   R = B.buildFMA(ResTy, Tmp1, R, R);
3804 
3805   auto Ret = B.buildFMul(ResTy, X, R);
3806   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3807 
3808   B.buildFMA(Res, Tmp2, R, Ret);
3809   MI.eraseFromParent();
3810   return true;
3811 }
3812 
3813 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3814                                          MachineRegisterInfo &MRI,
3815                                          MachineIRBuilder &B) const {
3816   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3817     return true;
3818 
3819   Register Res = MI.getOperand(0).getReg();
3820   Register LHS = MI.getOperand(1).getReg();
3821   Register RHS = MI.getOperand(2).getReg();
3822 
3823   uint16_t Flags = MI.getFlags();
3824 
3825   LLT S16 = LLT::scalar(16);
3826   LLT S32 = LLT::scalar(32);
3827 
3828   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3829   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3830 
3831   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3832     .addUse(RHSExt.getReg(0))
3833     .setMIFlags(Flags);
3834 
3835   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3836   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3837 
3838   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3839     .addUse(RDst.getReg(0))
3840     .addUse(RHS)
3841     .addUse(LHS)
3842     .setMIFlags(Flags);
3843 
3844   MI.eraseFromParent();
3845   return true;
3846 }
3847 
3848 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3849 // to enable denorm mode; when false, emit instructions to disable it.
3850 static void toggleSPDenormMode(bool Enable,
3851                                MachineIRBuilder &B,
3852                                const GCNSubtarget &ST,
3853                                AMDGPU::SIModeRegisterDefaults Mode) {
3854   // Set SP denorm mode to this value.
3855   unsigned SPDenormMode =
3856     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3857 
3858   if (ST.hasDenormModeInst()) {
3859     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3860     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3861 
3862     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3863     B.buildInstr(AMDGPU::S_DENORM_MODE)
3864       .addImm(NewDenormModeValue);
3865 
3866   } else {
3867     // Select FP32 bit field in mode register.
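    // The FP32 denorm controls occupy a 2-bit field at offset 4 of the MODE
    // register (WIDTH_M1 below encodes width - 1).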
3868     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3869                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3870                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3871 
3872     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3873       .addImm(SPDenormMode)
3874       .addImm(SPDenormModeBitField);
3875   }
3876 }
3877 
3878 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3879                                          MachineRegisterInfo &MRI,
3880                                          MachineIRBuilder &B) const {
3881   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3882     return true;
3883 
3884   Register Res = MI.getOperand(0).getReg();
3885   Register LHS = MI.getOperand(1).getReg();
3886   Register RHS = MI.getOperand(2).getReg();
3887   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3888   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3889 
3890   uint16_t Flags = MI.getFlags();
3891 
3892   LLT S32 = LLT::scalar(32);
3893   LLT S1 = LLT::scalar(1);
3894 
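  // The expansion below follows the usual AMDGPU pattern: div_scale both
  // operands, refine an approximate reciprocal with a chain of FMAs (with FP32
  // denormals temporarily enabled so intermediates aren't flushed), then
  // div_fmas / div_fixup to undo the scaling and handle the special cases.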
3895   auto One = B.buildFConstant(S32, 1.0f);
3896 
3897   auto DenominatorScaled =
3898     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3899       .addUse(LHS)
3900       .addUse(RHS)
3901       .addImm(0)
3902       .setMIFlags(Flags);
3903   auto NumeratorScaled =
3904     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3905       .addUse(LHS)
3906       .addUse(RHS)
3907       .addImm(1)
3908       .setMIFlags(Flags);
3909 
3910   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3911     .addUse(DenominatorScaled.getReg(0))
3912     .setMIFlags(Flags);
3913   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3914 
3915   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3916   // aren't modeled as reading it.
3917   if (!Mode.allFP32Denormals())
3918     toggleSPDenormMode(true, B, ST, Mode);
3919 
3920   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3921   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3922   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3923   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3924   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3925   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3926 
3927   if (!Mode.allFP32Denormals())
3928     toggleSPDenormMode(false, B, ST, Mode);
3929 
3930   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3931     .addUse(Fma4.getReg(0))
3932     .addUse(Fma1.getReg(0))
3933     .addUse(Fma3.getReg(0))
3934     .addUse(NumeratorScaled.getReg(1))
3935     .setMIFlags(Flags);
3936 
3937   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3938     .addUse(Fmas.getReg(0))
3939     .addUse(RHS)
3940     .addUse(LHS)
3941     .setMIFlags(Flags);
3942 
3943   MI.eraseFromParent();
3944   return true;
3945 }
3946 
3947 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3948                                          MachineRegisterInfo &MRI,
3949                                          MachineIRBuilder &B) const {
3950   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3951     return true;
3952 
3953   Register Res = MI.getOperand(0).getReg();
3954   Register LHS = MI.getOperand(1).getReg();
3955   Register RHS = MI.getOperand(2).getReg();
3956 
3957   uint16_t Flags = MI.getFlags();
3958 
3959   LLT S64 = LLT::scalar(64);
3960   LLT S1 = LLT::scalar(1);
3961 
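  // Same overall structure as the f32 path: div_scale the operands, refine the
  // reciprocal with FMA steps, then div_fmas / div_fixup (without the
  // denorm-mode toggle).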
3962   auto One = B.buildFConstant(S64, 1.0);
3963 
3964   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3965     .addUse(LHS)
3966     .addUse(RHS)
3967     .addImm(0)
3968     .setMIFlags(Flags);
3969 
3970   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3971 
3972   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3973     .addUse(DivScale0.getReg(0))
3974     .setMIFlags(Flags);
3975 
3976   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3977   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3978   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3979 
3980   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3981     .addUse(LHS)
3982     .addUse(RHS)
3983     .addImm(1)
3984     .setMIFlags(Flags);
3985 
3986   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3987   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3988   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3989 
3990   Register Scale;
3991   if (!ST.hasUsableDivScaleConditionOutput()) {
3992     // Workaround a hardware bug on SI where the condition output from div_scale
3993     // is not usable.
3994 
3995     LLT S32 = LLT::scalar(32);
3996 
3997     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3998     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3999     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4000     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4001 
4002     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4003                               Scale1Unmerge.getReg(1));
4004     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4005                               Scale0Unmerge.getReg(1));
4006     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4007   } else {
4008     Scale = DivScale1.getReg(1);
4009   }
4010 
4011   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4012     .addUse(Fma4.getReg(0))
4013     .addUse(Fma3.getReg(0))
4014     .addUse(Mul.getReg(0))
4015     .addUse(Scale)
4016     .setMIFlags(Flags);
4017 
4018   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
4019       .addUse(Fmas.getReg(0))
4020       .addUse(RHS)
4021       .addUse(LHS)
4022       .setMIFlags(Flags);
4023 
4024   MI.eraseFromParent();
4025   return true;
4026 }
4027 
4028 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4029                                                  MachineRegisterInfo &MRI,
4030                                                  MachineIRBuilder &B) const {
4031   Register Res = MI.getOperand(0).getReg();
4032   Register LHS = MI.getOperand(2).getReg();
4033   Register RHS = MI.getOperand(3).getReg();
4034   uint16_t Flags = MI.getFlags();
4035 
4036   LLT S32 = LLT::scalar(32);
4037   LLT S1 = LLT::scalar(1);
4038 
4039   auto Abs = B.buildFAbs(S32, RHS, Flags);
4040   const APFloat C0Val(1.0f);
4041 
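  // C0 is 2^96 and C1 is 2^-32. If |RHS| is larger than 2^96, pre-scale it by
  // 2^-32 before the reciprocal and fold the same factor back into the final
  // product, presumably to keep rcp(RHS) safely away from the range where it
  // would be flushed. Otherwise the scale is 1.0 (C2).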
4042   auto C0 = B.buildConstant(S32, 0x6f800000);
4043   auto C1 = B.buildConstant(S32, 0x2f800000);
4044   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4045 
4046   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4047   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4048 
4049   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4050 
4051   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4052     .addUse(Mul0.getReg(0))
4053     .setMIFlags(Flags);
4054 
4055   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4056 
4057   B.buildFMul(Res, Sel, Mul1, Flags);
4058 
4059   MI.eraseFromParent();
4060   return true;
4061 }
4062 
4063 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4064 // FIXME: Why do we handle this one but not other removed instructions?
4065 //
4066 // Reciprocal square root.  The clamp prevents infinite results, clamping
4067 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4068 // +-max_float.
4069 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4070                                                     MachineRegisterInfo &MRI,
4071                                                     MachineIRBuilder &B) const {
4072   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4073     return true;
4074 
4075   Register Dst = MI.getOperand(0).getReg();
4076   Register Src = MI.getOperand(2).getReg();
4077   auto Flags = MI.getFlags();
4078 
4079   LLT Ty = MRI.getType(Dst);
4080 
4081   const fltSemantics *FltSemantics;
4082   if (Ty == LLT::scalar(32))
4083     FltSemantics = &APFloat::IEEEsingle();
4084   else if (Ty == LLT::scalar(64))
4085     FltSemantics = &APFloat::IEEEdouble();
4086   else
4087     return false;
4088 
4089   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4090     .addUse(Src)
4091     .setMIFlags(Flags);
4092 
4093   // We don't need to concern ourselves with the snan handling difference;
4094   // the rsq already quieted it (or not), so use the one which directly selects.
4095   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4096   const bool UseIEEE = MFI->getMode().IEEE;
4097 
4098   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4099   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4100                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4101 
4102   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4103 
4104   if (UseIEEE)
4105     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4106   else
4107     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4108   MI.eraseFromParent();
4109   return true;
4110 }
4111 
4112 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4113   switch (IID) {
4114   case Intrinsic::amdgcn_ds_fadd:
4115     return AMDGPU::G_ATOMICRMW_FADD;
4116   case Intrinsic::amdgcn_ds_fmin:
4117     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4118   case Intrinsic::amdgcn_ds_fmax:
4119     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4120   default:
4121     llvm_unreachable("not a DS FP intrinsic");
4122   }
4123 }
4124 
4125 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4126                                                       MachineInstr &MI,
4127                                                       Intrinsic::ID IID) const {
4128   GISelChangeObserver &Observer = Helper.Observer;
4129   Observer.changingInstr(MI);
4130 
4131   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4132 
4133   // The remaining operands were used to set fields in the MemOperand on
4134   // construction.
4135   for (int I = 6; I > 3; --I)
4136     MI.removeOperand(I);
4137 
4138   MI.removeOperand(1); // Remove the intrinsic ID.
4139   Observer.changedInstr(MI);
4140   return true;
4141 }
4142 
4143 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4144                                             MachineRegisterInfo &MRI,
4145                                             MachineIRBuilder &B) const {
4146   uint64_t Offset =
4147     ST.getTargetLowering()->getImplicitParameterOffset(
4148       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4149   LLT DstTy = MRI.getType(DstReg);
4150   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4151 
4152   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4153   if (!loadInputValue(KernargPtrReg, B,
4154                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4155     return false;
4156 
4157   // FIXME: This should be nuw
4158   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4159   return true;
4160 }
4161 
4162 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4163                                                  MachineRegisterInfo &MRI,
4164                                                  MachineIRBuilder &B) const {
4165   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4166   if (!MFI->isEntryFunction()) {
4167     return legalizePreloadedArgIntrin(MI, MRI, B,
4168                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4169   }
4170 
4171   Register DstReg = MI.getOperand(0).getReg();
4172   if (!getImplicitArgPtr(DstReg, MRI, B))
4173     return false;
4174 
4175   MI.eraseFromParent();
4176   return true;
4177 }
4178 
4179 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4180                                          MachineRegisterInfo &MRI,
4181                                          MachineIRBuilder &B) const {
4182   Function &F = B.getMF().getFunction();
4183   std::optional<uint32_t> KnownSize =
4184       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4185   if (KnownSize.has_value())
4186     B.buildConstant(DstReg, *KnownSize);
4187   return false;
4188 }
4189 
4190 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4191                                               MachineRegisterInfo &MRI,
4192                                               MachineIRBuilder &B) const {
4193 
4194   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4195   if (!MFI->isEntryFunction()) {
4196     return legalizePreloadedArgIntrin(MI, MRI, B,
4197                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4198   }
4199 
4200   Register DstReg = MI.getOperand(0).getReg();
4201   if (!getLDSKernelId(DstReg, MRI, B))
4202     return false;
4203 
4204   MI.eraseFromParent();
4205   return true;
4206 }
4207 
4208 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4209                                               MachineRegisterInfo &MRI,
4210                                               MachineIRBuilder &B,
4211                                               unsigned AddrSpace) const {
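  // A flat pointer lies in the queried segment iff the high 32 bits of its
  // address equal that segment's aperture base.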
4212   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4213   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4214   Register Hi32 = Unmerge.getReg(1);
4215 
4216   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4217   MI.eraseFromParent();
4218   return true;
4219 }
4220 
4221 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4222 // offset (the offset that is included in bounds checking and swizzling, to be
4223 // split between the instruction's voffset and immoffset fields) and soffset
4224 // (the offset that is excluded from bounds checking and swizzling, to go in
4225 // the instruction's soffset field).  This function takes the first kind of
4226 // offset and figures out how to split it between voffset and immoffset.
4227 std::pair<Register, unsigned>
4228 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4229                                         Register OrigOffset) const {
4230   const unsigned MaxImm = 4095;
4231   Register BaseReg;
4232   unsigned ImmOffset;
4233   const LLT S32 = LLT::scalar(32);
4234   MachineRegisterInfo &MRI = *B.getMRI();
4235 
4236   std::tie(BaseReg, ImmOffset) =
4237       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4238 
4239   // If BaseReg is a pointer, convert it to int.
4240   if (MRI.getType(BaseReg).isPointer())
4241     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4242 
4243   // If the immediate value is too big for the immoffset field, put the value
4244   // and -4096 into the immoffset field so that the value that is copied/added
4245   // for the voffset field is a multiple of 4096, and it stands more chance
4246   // of being CSEd with the copy/add for another similar load/store.
4247   // However, do not do that rounding down to a multiple of 4096 if that is a
4248   // negative number, as it appears to be illegal to have a negative offset
4249   // in the vgpr, even if adding the immediate offset makes it positive.
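  // E.g. a constant offset of 5000 is split into Overflow = 4096 (added to the
  // voffset register below) and ImmOffset = 904.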
4250   unsigned Overflow = ImmOffset & ~MaxImm;
4251   ImmOffset -= Overflow;
4252   if ((int32_t)Overflow < 0) {
4253     Overflow += ImmOffset;
4254     ImmOffset = 0;
4255   }
4256 
4257   if (Overflow != 0) {
4258     if (!BaseReg) {
4259       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4260     } else {
4261       auto OverflowVal = B.buildConstant(S32, Overflow);
4262       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4263     }
4264   }
4265 
4266   if (!BaseReg)
4267     BaseReg = B.buildConstant(S32, 0).getReg(0);
4268 
4269   return std::pair(BaseReg, ImmOffset);
4270 }
4271 
4272 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4273 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4274                                           Register VOffset, Register SOffset,
4275                                           unsigned ImmOffset, Register VIndex,
4276                                           MachineRegisterInfo &MRI) const {
4277   std::optional<ValueAndVReg> MaybeVOffsetVal =
4278       getIConstantVRegValWithLookThrough(VOffset, MRI);
4279   std::optional<ValueAndVReg> MaybeSOffsetVal =
4280       getIConstantVRegValWithLookThrough(SOffset, MRI);
4281   std::optional<ValueAndVReg> MaybeVIndexVal =
4282       getIConstantVRegValWithLookThrough(VIndex, MRI);
4283   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4284   // update the MMO with that offset. The stride is unknown so we can only do
4285   // this if VIndex is constant 0.
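  // For example, with a constant voffset of 16, a constant soffset of 0, a zero
  // vindex and an immoffset of 4, the MMO offset becomes 20.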
4286   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4287       MaybeVIndexVal->Value == 0) {
4288     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4289                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4290     MMO->setOffset(TotalOffset);
4291   } else {
4292     // We don't have a constant combined offset to use in the MMO. Give up.
4293     MMO->setValue((Value *)nullptr);
4294   }
4295 }
4296 
4297 /// Handle register layout difference for f16 images for some subtargets.
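/// For example, on subtargets using the unpacked d16 layout a <4 x s16> store
/// value is rewritten to <4 x s32> with each half any-extended into its own
/// dword; packed subtargets keep the 16-bit elements, at most padding an odd
/// element count with undef (or repacking to work around the image store d16
/// bug).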
4298 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4299                                              MachineRegisterInfo &MRI,
4300                                              Register Reg,
4301                                              bool ImageStore) const {
4302   const LLT S16 = LLT::scalar(16);
4303   const LLT S32 = LLT::scalar(32);
4304   LLT StoreVT = MRI.getType(Reg);
4305   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4306 
4307   if (ST.hasUnpackedD16VMem()) {
4308     auto Unmerge = B.buildUnmerge(S16, Reg);
4309 
4310     SmallVector<Register, 4> WideRegs;
4311     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4312       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4313 
4314     int NumElts = StoreVT.getNumElements();
4315 
4316     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4317         .getReg(0);
4318   }
4319 
4320   if (ImageStore && ST.hasImageStoreD16Bug()) {
4321     if (StoreVT.getNumElements() == 2) {
4322       SmallVector<Register, 4> PackedRegs;
4323       Reg = B.buildBitcast(S32, Reg).getReg(0);
4324       PackedRegs.push_back(Reg);
4325       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4326       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4327           .getReg(0);
4328     }
4329 
4330     if (StoreVT.getNumElements() == 3) {
4331       SmallVector<Register, 4> PackedRegs;
4332       auto Unmerge = B.buildUnmerge(S16, Reg);
4333       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4334         PackedRegs.push_back(Unmerge.getReg(I));
4335       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4336       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4337       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4338     }
4339 
4340     if (StoreVT.getNumElements() == 4) {
4341       SmallVector<Register, 4> PackedRegs;
4342       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4343       auto Unmerge = B.buildUnmerge(S32, Reg);
4344       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4345         PackedRegs.push_back(Unmerge.getReg(I));
4346       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4347       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4348           .getReg(0);
4349     }
4350 
4351     llvm_unreachable("invalid data type");
4352   }
4353 
4354   if (StoreVT == LLT::fixed_vector(3, S16)) {
4355     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4356               .getReg(0);
4357   }
4358   return Reg;
4359 }
4360 
4361 Register AMDGPULegalizerInfo::fixStoreSourceType(
4362   MachineIRBuilder &B, Register VData, bool IsFormat) const {
4363   MachineRegisterInfo *MRI = B.getMRI();
4364   LLT Ty = MRI->getType(VData);
4365 
4366   const LLT S16 = LLT::scalar(16);
4367 
4368   // Fixup illegal register types for i8 and i16 stores.
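  // The buffer store byte/short pseudos take 32-bit register data, so any-extend
  // the value to s32 here; the actual memory size still comes from the MMO.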
4369   if (Ty == LLT::scalar(8) || Ty == S16) {
4370     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4371     return AnyExt;
4372   }
4373 
4374   if (Ty.isVector()) {
4375     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4376       if (IsFormat)
4377         return handleD16VData(B, *MRI, VData);
4378     }
4379   }
4380 
4381   return VData;
4382 }
4383 
4384 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4385                                               MachineRegisterInfo &MRI,
4386                                               MachineIRBuilder &B,
4387                                               bool IsTyped,
4388                                               bool IsFormat) const {
4389   Register VData = MI.getOperand(1).getReg();
4390   LLT Ty = MRI.getType(VData);
4391   LLT EltTy = Ty.getScalarType();
4392   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4393   const LLT S32 = LLT::scalar(32);
4394 
4395   VData = fixStoreSourceType(B, VData, IsFormat);
4396   Register RSrc = MI.getOperand(2).getReg();
4397 
4398   MachineMemOperand *MMO = *MI.memoperands_begin();
4399   const int MemSize = MMO->getSize();
4400 
4401   unsigned ImmOffset;
4402 
4403   // The typed intrinsics add an immediate after the registers.
4404   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
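  // Roughly, the operand layout at this point is:
  //   raw:    ID, vdata, rsrc, voffset, soffset, aux          (6 operands)
  //   struct: ID, vdata, rsrc, vindex, voffset, soffset, aux  (7 operands)
  // and the typed (tbuffer) variants carry an extra format immediate before aux.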
4405 
4406   // The struct intrinsic variants add one additional operand over raw.
4407   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4408   Register VIndex;
4409   int OpOffset = 0;
4410   if (HasVIndex) {
4411     VIndex = MI.getOperand(3).getReg();
4412     OpOffset = 1;
4413   } else {
4414     VIndex = B.buildConstant(S32, 0).getReg(0);
4415   }
4416 
4417   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4418   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4419 
4420   unsigned Format = 0;
4421   if (IsTyped) {
4422     Format = MI.getOperand(5 + OpOffset).getImm();
4423     ++OpOffset;
4424   }
4425 
4426   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4427 
4428   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4429   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4430 
4431   unsigned Opc;
4432   if (IsTyped) {
4433     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4434                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4435   } else if (IsFormat) {
4436     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4437                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4438   } else {
4439     switch (MemSize) {
4440     case 1:
4441       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4442       break;
4443     case 2:
4444       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4445       break;
4446     default:
4447       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4448       break;
4449     }
4450   }
4451 
4452   auto MIB = B.buildInstr(Opc)
4453     .addUse(VData)              // vdata
4454     .addUse(RSrc)               // rsrc
4455     .addUse(VIndex)             // vindex
4456     .addUse(VOffset)            // voffset
4457     .addUse(SOffset)            // soffset
4458     .addImm(ImmOffset);         // offset(imm)
4459 
4460   if (IsTyped)
4461     MIB.addImm(Format);
4462 
4463   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4464      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4465      .addMemOperand(MMO);
4466 
4467   MI.eraseFromParent();
4468   return true;
4469 }
4470 
4471 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
4472                             Register VIndex, Register VOffset, Register SOffset,
4473                             unsigned ImmOffset, unsigned Format,
4474                             unsigned AuxiliaryData, MachineMemOperand *MMO,
4475                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
4476   auto MIB = B.buildInstr(Opc)
4477                  .addDef(LoadDstReg) // vdata
4478                  .addUse(RSrc)       // rsrc
4479                  .addUse(VIndex)     // vindex
4480                  .addUse(VOffset)    // voffset
4481                  .addUse(SOffset)    // soffset
4482                  .addImm(ImmOffset); // offset(imm)
4483 
4484   if (IsTyped)
4485     MIB.addImm(Format);
4486 
4487   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
4488       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4489       .addMemOperand(MMO);
4490 }
4491 
4492 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4493                                              MachineRegisterInfo &MRI,
4494                                              MachineIRBuilder &B,
4495                                              bool IsFormat,
4496                                              bool IsTyped) const {
4497   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4498   MachineMemOperand *MMO = *MI.memoperands_begin();
4499   const LLT MemTy = MMO->getMemoryType();
4500   const LLT S32 = LLT::scalar(32);
4501 
4502   Register Dst = MI.getOperand(0).getReg();
4503 
4504   Register StatusDst;
4505   int OpOffset = 0;
4506   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
4507   bool IsTFE = MI.getNumExplicitDefs() == 2;
4508   if (IsTFE) {
4509     StatusDst = MI.getOperand(1).getReg();
4510     ++OpOffset;
4511   }
4512 
4513   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
4514 
4515   // The typed intrinsics add an immediate after the registers.
4516   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4517 
4518   // The struct intrinsic variants add one additional operand over raw.
4519   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
4520   Register VIndex;
4521   if (HasVIndex) {
4522     VIndex = MI.getOperand(3 + OpOffset).getReg();
4523     ++OpOffset;
4524   } else {
4525     VIndex = B.buildConstant(S32, 0).getReg(0);
4526   }
4527 
4528   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4529   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4530 
4531   unsigned Format = 0;
4532   if (IsTyped) {
4533     Format = MI.getOperand(5 + OpOffset).getImm();
4534     ++OpOffset;
4535   }
4536 
4537   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4538   unsigned ImmOffset;
4539 
4540   LLT Ty = MRI.getType(Dst);
4541   LLT EltTy = Ty.getScalarType();
4542   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4543   const bool Unpacked = ST.hasUnpackedD16VMem();
4544 
4545   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4546   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4547 
4548   unsigned Opc;
4549 
4550   // TODO: Support TFE for typed and narrow loads.
4551   if (IsTyped) {
4552     if (IsTFE)
4553       return false;
4554     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4555                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4556   } else if (IsFormat) {
4557     if (IsD16) {
4558       if (IsTFE)
4559         return false;
4560       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
4561     } else {
4562       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
4563                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4564     }
4565   } else {
4566     if (IsTFE)
4567       return false;
4568     switch (MemTy.getSizeInBits()) {
4569     case 8:
4570       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4571       break;
4572     case 16:
4573       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4574       break;
4575     default:
4576       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4577       break;
4578     }
4579   }
4580 
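  // TFE loads return an extra status dword appended after the data, so e.g. a
  // <2 x s32> result is loaded as <3 x s32> and then split back apart below.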
4581   if (IsTFE) {
4582     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
4583     unsigned NumLoadDWords = NumValueDWords + 1;
4584     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
4585     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
4586     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4587                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4588     if (NumValueDWords == 1) {
4589       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
4590     } else {
4591       SmallVector<Register, 5> LoadElts;
4592       for (unsigned I = 0; I != NumValueDWords; ++I)
4593         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
4594       LoadElts.push_back(StatusDst);
4595       B.buildUnmerge(LoadElts, LoadDstReg);
4596       LoadElts.truncate(NumValueDWords);
4597       B.buildMergeLikeInstr(Dst, LoadElts);
4598     }
4599   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
4600              (IsD16 && !Ty.isVector())) {
4601     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4602     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4603                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4604     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4605     B.buildTrunc(Dst, LoadDstReg);
4606   } else if (Unpacked && IsD16 && Ty.isVector()) {
4607     LLT UnpackedTy = Ty.changeElementSize(32);
4608     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4609     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4610                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4611     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4612     // FIXME: G_TRUNC should work, but legalization currently fails
4613     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4614     SmallVector<Register, 4> Repack;
4615     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4616       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4617     B.buildMergeLikeInstr(Dst, Repack);
4618   } else {
4619     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
4620                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4621   }
4622 
4623   MI.eraseFromParent();
4624   return true;
4625 }
4626 
4627 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4628                                                MachineIRBuilder &B,
4629                                                bool IsInc) const {
4630   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4631                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4632   B.buildInstr(Opc)
4633     .addDef(MI.getOperand(0).getReg())
4634     .addUse(MI.getOperand(2).getReg())
4635     .addUse(MI.getOperand(3).getReg())
4636     .cloneMemRefs(MI);
4637   MI.eraseFromParent();
4638   return true;
4639 }
4640 
4641 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4642   switch (IntrID) {
4643   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4644   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4645     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4646   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4647   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4648     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4649   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4650   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4651     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4652   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4653   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4654     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4655   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4656   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4657     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4658   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4659   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4660     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4661   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4662   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4663     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4664   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4665   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4666     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4667   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4668   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4669     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4670   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4671   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4672     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4673   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4674   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4675     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4676   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4677   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4678     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4679   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4680   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4681     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4682   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4683   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4684     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4685   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4686   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4687     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4688   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4689   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4690     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4691   default:
4692     llvm_unreachable("unhandled atomic opcode");
4693   }
4694 }
4695 
4696 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4697                                                MachineIRBuilder &B,
4698                                                Intrinsic::ID IID) const {
4699   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4700                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4701   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4702 
4703   Register Dst;
4704 
4705   int OpOffset = 0;
4706   if (HasReturn) {
4707     // A few FP atomics do not support return values.
4708     Dst = MI.getOperand(0).getReg();
4709   } else {
4710     OpOffset = -1;
4711   }
4712 
4713   Register VData = MI.getOperand(2 + OpOffset).getReg();
4714   Register CmpVal;
4715 
4716   if (IsCmpSwap) {
4717     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4718     ++OpOffset;
4719   }
4720 
4721   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4722   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4723 
4724   // The struct intrinsic variants add one additional operand over raw.
4725   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4726   Register VIndex;
4727   if (HasVIndex) {
4728     VIndex = MI.getOperand(4 + OpOffset).getReg();
4729     ++OpOffset;
4730   } else {
4731     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4732   }
4733 
4734   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4735   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4736   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4737 
4738   MachineMemOperand *MMO = *MI.memoperands_begin();
4739 
4740   unsigned ImmOffset;
4741   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4742   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4743 
4744   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4745 
4746   if (HasReturn)
4747     MIB.addDef(Dst);
4748 
4749   MIB.addUse(VData); // vdata
4750 
4751   if (IsCmpSwap)
4752     MIB.addReg(CmpVal);
4753 
4754   MIB.addUse(RSrc)               // rsrc
4755      .addUse(VIndex)             // vindex
4756      .addUse(VOffset)            // voffset
4757      .addUse(SOffset)            // soffset
4758      .addImm(ImmOffset)          // offset(imm)
4759      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4760      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4761      .addMemOperand(MMO);
4762 
4763   MI.eraseFromParent();
4764   return true;
4765 }
4766 
4767 /// Repack the vaddr operands of \p MI into dword sized V2S16 registers, pairing
4768 /// up 16-bit values and bitcasting 32-bit ones, and append them to \p PackedAddrs.
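/// For example, with A16 enabled a 2D sample's (u, v) s16 coordinates end up in
/// a single <2 x s16> register, and a trailing odd coordinate is padded with
/// undef.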
4769 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4770                                       SmallVectorImpl<Register> &PackedAddrs,
4771                                       unsigned ArgOffset,
4772                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4773                                       bool IsA16, bool IsG16) {
4774   const LLT S16 = LLT::scalar(16);
4775   const LLT V2S16 = LLT::fixed_vector(2, 16);
4776   auto EndIdx = Intr->VAddrEnd;
4777 
4778   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4779     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4780     if (!SrcOp.isReg())
4781       continue; // _L to _LZ may have eliminated this.
4782 
4783     Register AddrReg = SrcOp.getReg();
4784 
4785     if ((I < Intr->GradientStart) ||
4786         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4787         (I >= Intr->CoordStart && !IsA16)) {
4788       if ((I < Intr->GradientStart) && IsA16 &&
4789           (B.getMRI()->getType(AddrReg) == S16)) {
4790         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4791         // Special handling of bias when A16 is on. Bias is of type half but
4792         // occupies a full 32-bit slot.
4793         PackedAddrs.push_back(
4794             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4795                 .getReg(0));
4796       } else {
4797         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4798                "Bias needs to be converted to 16 bit in A16 mode");
4799         // Handle any gradient or coordinate operands that should not be packed
4800         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4801         PackedAddrs.push_back(AddrReg);
4802       }
4803     } else {
4804       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4805       // derivatives dx/dh and dx/dv are packed with undef.
4806       if (((I + 1) >= EndIdx) ||
4807           ((Intr->NumGradients / 2) % 2 == 1 &&
4808            (I == static_cast<unsigned>(Intr->GradientStart +
4809                                        (Intr->NumGradients / 2) - 1) ||
4810             I == static_cast<unsigned>(Intr->GradientStart +
4811                                        Intr->NumGradients - 1))) ||
4812           // Check for _L to _LZ optimization
4813           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4814         PackedAddrs.push_back(
4815             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4816                 .getReg(0));
4817       } else {
4818         PackedAddrs.push_back(
4819             B.buildBuildVector(
4820                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4821                 .getReg(0));
4822         ++I;
4823       }
4824     }
4825   }
4826 }
4827 
4828 /// Convert from separate vaddr components to a single vector address register,
4829 /// and replace the remaining operands with $noreg.
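/// For example, three s32 vaddr operands become one <3 x s32> build_vector in
/// the first vaddr slot, and the two remaining slots are cleared to $noreg.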
4830 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4831                                      int DimIdx, int NumVAddrs) {
4832   const LLT S32 = LLT::scalar(32);
4833   (void)S32;
4834   SmallVector<Register, 8> AddrRegs;
4835   for (int I = 0; I != NumVAddrs; ++I) {
4836     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4837     if (SrcOp.isReg()) {
4838       AddrRegs.push_back(SrcOp.getReg());
4839       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4840     }
4841   }
4842 
4843   int NumAddrRegs = AddrRegs.size();
4844   if (NumAddrRegs != 1) {
4845     auto VAddr =
4846         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4847     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4848   }
4849 
4850   for (int I = 1; I != NumVAddrs; ++I) {
4851     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4852     if (SrcOp.isReg())
4853       SrcOp.setReg(AMDGPU::NoRegister);
4854   }
4855 }
4856 
4857 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4858 ///
4859 /// Depending on the subtarget, load/store with 16-bit element data need to be
4860 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4861 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4862 /// registers.
4863 ///
4864 /// We don't want to directly select image instructions just yet, but also want
4865 /// to expose all register repacking to the legalizer/combiners. We also don't
4866 /// want a selected instruction entering RegBankSelect. In order to avoid
4867 /// defining a multitude of intermediate image instructions, directly hack on
4868 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4869 /// padding the now-unnecessary arguments with $noreg.
4870 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4871     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4872     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4873 
4874   const MachineFunction &MF = *MI.getMF();
4875   const unsigned NumDefs = MI.getNumExplicitDefs();
4876   const unsigned ArgOffset = NumDefs + 1;
4877   bool IsTFE = NumDefs == 2;
4878   // We are only processing the operands of d16 image operations on subtargets
4879   // that use the unpacked register layout, or need to repack the TFE result.
4880 
4881   // TODO: Do we need to guard against already legalized intrinsics?
4882   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4883       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4884 
4885   MachineRegisterInfo *MRI = B.getMRI();
4886   const LLT S32 = LLT::scalar(32);
4887   const LLT S16 = LLT::scalar(16);
4888   const LLT V2S16 = LLT::fixed_vector(2, 16);
4889 
4890   unsigned DMask = 0;
4891   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4892   LLT Ty = MRI->getType(VData);
4893 
4894   // Check for 16 bit addresses and pack if true.
4895   LLT GradTy =
4896       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4897   LLT AddrTy =
4898       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4899   const bool IsG16 = GradTy == S16;
4900   const bool IsA16 = AddrTy == S16;
4901   const bool IsD16 = Ty.getScalarType() == S16;
4902 
4903   int DMaskLanes = 0;
4904   if (!BaseOpcode->Atomic) {
4905     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4906     if (BaseOpcode->Gather4) {
4907       DMaskLanes = 4;
4908     } else if (DMask != 0) {
4909       DMaskLanes = llvm::popcount(DMask);
4910     } else if (!IsTFE && !BaseOpcode->Store) {
4911       // If dmask is 0, this is a no-op load. This can be eliminated.
4912       B.buildUndef(MI.getOperand(0));
4913       MI.eraseFromParent();
4914       return true;
4915     }
4916   }
4917 
4918   Observer.changingInstr(MI);
4919   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4920 
4921   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4922                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4923   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4924                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4925   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4926 
4927   // Track that we legalized this
4928   MI.setDesc(B.getTII().get(NewOpcode));
4929 
4930   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4931   // dmask to be at least 1, otherwise the instruction will fail.
4932   if (IsTFE && DMask == 0) {
4933     DMask = 0x1;
4934     DMaskLanes = 1;
4935     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4936   }
4937 
4938   if (BaseOpcode->Atomic) {
4939     Register VData0 = MI.getOperand(2).getReg();
4940     LLT Ty = MRI->getType(VData0);
4941 
4942     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4943     if (Ty.isVector())
4944       return false;
4945 
4946     if (BaseOpcode->AtomicX2) {
4947       Register VData1 = MI.getOperand(3).getReg();
4948       // The two values are packed in one register.
4949       LLT PackedTy = LLT::fixed_vector(2, Ty);
4950       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4951       MI.getOperand(2).setReg(Concat.getReg(0));
4952       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4953     }
4954   }
4955 
4956   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4957 
4958   // Rewrite the addressing register layout before doing anything else.
4959   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4960     // 16 bit gradients are supported, but are tied to the A16 control
4961     // so both gradients and addresses must be 16 bit
4962     return false;
4963   }
4964 
4965   if (IsA16 && !ST.hasA16()) {
4966     // A16 not supported
4967     return false;
4968   }
4969 
4970   if (IsA16 || IsG16) {
4971     if (Intr->NumVAddrs > 1) {
4972       SmallVector<Register, 4> PackedRegs;
4973 
4974       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4975                                 IsG16);
4976 
4977       // See also below in the non-a16 branch
4978       const bool UseNSA = ST.hasNSAEncoding() &&
4979                           PackedRegs.size() >= ST.getNSAThreshold(MF) &&
4980                           PackedRegs.size() <= ST.getNSAMaxSize();
4981 
4982       if (!UseNSA && PackedRegs.size() > 1) {
4983         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4984         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4985         PackedRegs[0] = Concat.getReg(0);
4986         PackedRegs.resize(1);
4987       }
4988 
4989       const unsigned NumPacked = PackedRegs.size();
4990       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4991         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4992         if (!SrcOp.isReg()) {
4993           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4994           continue;
4995         }
4996 
4997         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4998 
4999         if (I - Intr->VAddrStart < NumPacked)
5000           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
5001         else
5002           SrcOp.setReg(AMDGPU::NoRegister);
5003       }
5004     }
5005   } else {
5006     // If the register allocator cannot place the address registers contiguously
5007     // without introducing moves, then using the non-sequential address encoding
5008     // is always preferable, since it saves VALU instructions and is usually a
5009     // wash, or even a win, in terms of code size.
5010     //
5011     // However, we currently have no way of hinting to the register allocator
5012     // that MIMG addresses should be placed contiguously when it is possible to
5013     // do so, so force non-NSA for the common 2-address case as a heuristic.
5014     //
5015     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5016     // allocation when possible.
5017     //
5018     // TODO: we can actually allow partial NSA where the final register is a
5019     // contiguous set of the remaining addresses.
5020     // This could help where there are more addresses than supported.
5021     const bool UseNSA = ST.hasNSAEncoding() &&
5022                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
5023                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
5024 
5025     if (!UseNSA && Intr->NumVAddrs > 1)
5026       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5027                                Intr->NumVAddrs);
5028   }
5029 
5030   int Flags = 0;
5031   if (IsA16)
5032     Flags |= 1;
5033   if (IsG16)
5034     Flags |= 2;
5035   MI.addOperand(MachineOperand::CreateImm(Flags));
5036 
5037   if (BaseOpcode->Store) { // No TFE for stores?
5038     // TODO: Handle dmask trim
5039     if (!Ty.isVector() || !IsD16)
5040       return true;
5041 
5042     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5043     if (RepackedReg != VData) {
5044       MI.getOperand(1).setReg(RepackedReg);
5045     }
5046 
5047     return true;
5048   }
5049 
5050   Register DstReg = MI.getOperand(0).getReg();
5051   const LLT EltTy = Ty.getScalarType();
5052   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5053 
5054   // Confirm that the return type is large enough for the dmask specified
5055   if (NumElts < DMaskLanes)
5056     return false;
5057 
5058   if (NumElts > 4 || DMaskLanes > 4)
5059     return false;
5060 
5061   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5062   const LLT AdjustedTy =
5063       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5064 
5065   // The raw dword aligned data component of the load. The only legal cases
5066   // where this matters should be when using the packed D16 format, for
5067   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
5068   LLT RoundedTy;
5069 
5070   // S32 vector to cover all data, plus TFE result element.
5071   LLT TFETy;
5072 
5073   // Register type to use for each loaded component. Will be S32 or V2S16.
5074   LLT RegTy;
5075 
5076   if (IsD16 && ST.hasUnpackedD16VMem()) {
5077     RoundedTy =
5078         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5079     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5080     RegTy = S32;
5081   } else {
5082     unsigned EltSize = EltTy.getSizeInBits();
5083     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5084     unsigned RoundedSize = 32 * RoundedElts;
5085     RoundedTy = LLT::scalarOrVector(
5086         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5087     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5088     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5089   }
5090 
5091   // The return type does not need adjustment.
5092   // TODO: Should we change s16 case to s32 or <2 x s16>?
5093   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5094     return true;
5095 
5096   Register Dst1Reg;
5097 
5098   // Insert after the instruction.
5099   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5100 
5101   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5102   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5103   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5104   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5105 
5106   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5107 
5108   MI.getOperand(0).setReg(NewResultReg);
5109 
5110   // In the IR, TFE is supposed to be used with a 2 element struct return
5111   // type. The instruction really returns these two values in one contiguous
5112   // register, with one additional dword beyond the loaded data. Rewrite the
5113   // return type to use a single register result.
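  // For example, on packed-d16 subtargets a TFE d16 load of <4 x s16> gets a
  // single <3 x s32> result: two dwords of packed data plus the status dword.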
5114 
5115   if (IsTFE) {
5116     Dst1Reg = MI.getOperand(1).getReg();
5117     if (MRI->getType(Dst1Reg) != S32)
5118       return false;
5119 
5120     // TODO: Make sure the TFE operand bit is set.
5121     MI.removeOperand(1);
5122 
5123     // Handle the easy case that requires no repack instructions.
5124     if (Ty == S32) {
5125       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5126       return true;
5127     }
5128   }
5129 
5130   // Now figure out how to copy the new result register back into the old
5131   // result.
5132   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5133 
5134   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5135 
5136   if (ResultNumRegs == 1) {
5137     assert(!IsTFE);
5138     ResultRegs[0] = NewResultReg;
5139   } else {
5140     // We have to repack into a new vector of some kind.
5141     for (int I = 0; I != NumDataRegs; ++I)
5142       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5143     B.buildUnmerge(ResultRegs, NewResultReg);
5144 
5145     // Drop the final TFE element to get the data part. The TFE result is
5146     // directly written to the right place already.
5147     if (IsTFE)
5148       ResultRegs.resize(NumDataRegs);
5149   }
5150 
5151   // For an s16 scalar result, we form an s32 result with a truncate regardless
5152   // of packed vs. unpacked.
5153   if (IsD16 && !Ty.isVector()) {
5154     B.buildTrunc(DstReg, ResultRegs[0]);
5155     return true;
5156   }
5157 
5158   // Avoid a build/concat_vector of 1 entry.
5159   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5160     B.buildBitcast(DstReg, ResultRegs[0]);
5161     return true;
5162   }
5163 
5164   assert(Ty.isVector());
5165 
5166   if (IsD16) {
5167     // For packed D16 results with TFE enabled, all the data components are
5168     // S32. Cast back to the expected type.
5169     //
5170     // TODO: We don't really need to use s32 load elements. We would only need one
5171     // cast for the TFE result if a multiple of v2s16 was used.
5172     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5173       for (Register &Reg : ResultRegs)
5174         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5175     } else if (ST.hasUnpackedD16VMem()) {
5176       for (Register &Reg : ResultRegs)
5177         Reg = B.buildTrunc(S16, Reg).getReg(0);
5178     }
5179   }
5180 
5181   auto padWithUndef = [&](LLT Ty, int NumElts) {
5182     if (NumElts == 0)
5183       return;
5184     Register Undef = B.buildUndef(Ty).getReg(0);
5185     for (int I = 0; I != NumElts; ++I)
5186       ResultRegs.push_back(Undef);
5187   };
5188 
5189   // Pad out any elements eliminated due to the dmask.
5190   LLT ResTy = MRI->getType(ResultRegs[0]);
5191   if (!ResTy.isVector()) {
5192     padWithUndef(ResTy, NumElts - ResultRegs.size());
5193     B.buildBuildVector(DstReg, ResultRegs);
5194     return true;
5195   }
5196 
5197   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5198   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5199 
5200   // Deal with the one annoying legal case.
5201   const LLT V3S16 = LLT::fixed_vector(3, 16);
5202   if (Ty == V3S16) {
5203     if (IsTFE) {
5204       if (ResultRegs.size() == 1) {
5205         NewResultReg = ResultRegs[0];
5206       } else if (ResultRegs.size() == 2) {
5207         LLT V4S16 = LLT::fixed_vector(4, 16);
5208         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5209       } else {
5210         return false;
5211       }
5212     }
5213 
5214     if (MRI->getType(DstReg).getNumElements() <
5215         MRI->getType(NewResultReg).getNumElements()) {
5216       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5217     } else {
5218       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5219     }
5220     return true;
5221   }
5222 
5223   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5224   B.buildConcatVectors(DstReg, ResultRegs);
5225   return true;
5226 }
5227 
5228 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5229   LegalizerHelper &Helper, MachineInstr &MI) const {
5230   MachineIRBuilder &B = Helper.MIRBuilder;
5231   GISelChangeObserver &Observer = Helper.Observer;
5232 
5233   Register Dst = MI.getOperand(0).getReg();
5234   LLT Ty = B.getMRI()->getType(Dst);
5235   unsigned Size = Ty.getSizeInBits();
5236   MachineFunction &MF = B.getMF();
5237 
5238   Observer.changingInstr(MI);
5239 
5240   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5241     Ty = getBitcastRegisterType(Ty);
5242     Helper.bitcastDst(MI, Ty, 0);
5243     Dst = MI.getOperand(0).getReg();
5244     B.setInsertPt(B.getMBB(), MI);
5245   }
5246 
5247   // FIXME: We don't really need this intermediate instruction. The intrinsic
5248   // should be fixed to have a memory operand. Since it's readnone, we're not
5249   // allowed to add one.
5250   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5251   MI.removeOperand(1); // Remove intrinsic ID
5252 
5253   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5254   // TODO: Should this use datalayout alignment?
5255   const unsigned MemSize = (Size + 7) / 8;
5256   const Align MemAlign(4);
5257   MachineMemOperand *MMO = MF.getMachineMemOperand(
5258       MachinePointerInfo(),
5259       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5260           MachineMemOperand::MOInvariant,
5261       MemSize, MemAlign);
5262   MI.addMemOperand(MF, MMO);
5263 
5264   // There are no 96-bit result scalar loads, but widening to 128-bit should
5265   // always be legal. We may need to restore this to a 96-bit result if it turns
5266   // out this needs to be converted to a vector load during RegBankSelect.
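  // For example, an s96 result is widened here to s128; vector results get the
  // next power-of-two element count.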
5267   if (!isPowerOf2_32(Size)) {
5268     if (Ty.isVector())
5269       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5270     else
5271       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5272   }
5273 
5274   Observer.changedInstr(MI);
5275   return true;
5276 }
5277 
5278 // TODO: Move to selection
5279 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5280                                                 MachineRegisterInfo &MRI,
5281                                                 MachineIRBuilder &B) const {
5282   if (!ST.isTrapHandlerEnabled() ||
5283       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5284     return legalizeTrapEndpgm(MI, MRI, B);
5285 
5286   if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5287     switch (*HsaAbiVer) {
5288     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5289     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5290       return legalizeTrapHsaQueuePtr(MI, MRI, B);
5291     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5292     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5293       return ST.supportsGetDoorbellID() ?
5294           legalizeTrapHsa(MI, MRI, B) :
5295           legalizeTrapHsaQueuePtr(MI, MRI, B);
5296     }
5297   }
5298 
5299   llvm_unreachable("Unknown trap handler");
5300 }
5301 
5302 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5303     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5304   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5305   MI.eraseFromParent();
5306   return true;
5307 }
5308 
5309 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5310     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5311   MachineFunction &MF = B.getMF();
5312   const LLT S64 = LLT::scalar(64);
5313 
5314   Register SGPR01(AMDGPU::SGPR0_SGPR1);
5315   // For code object version 5, queue_ptr is passed through implicit kernarg.
5316   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5317     AMDGPUTargetLowering::ImplicitParameter Param =
5318         AMDGPUTargetLowering::QUEUE_PTR;
5319     uint64_t Offset =
5320         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5321 
5322     Register KernargPtrReg = MRI.createGenericVirtualRegister(
5323         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5324 
5325     if (!loadInputValue(KernargPtrReg, B,
5326                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5327       return false;
5328 
5329     // TODO: can we be smarter about machine pointer info?
5330     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5331     MachineMemOperand *MMO = MF.getMachineMemOperand(
5332         PtrInfo,
5333         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5334             MachineMemOperand::MOInvariant,
5335         LLT::scalar(64), commonAlignment(Align(64), Offset));
5336 
5337     // Pointer address
5338     Register LoadAddr = MRI.createGenericVirtualRegister(
5339         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5340     B.buildPtrAdd(LoadAddr, KernargPtrReg,
5341                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5342     // Load address
5343     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5344     B.buildCopy(SGPR01, Temp);
5345     B.buildInstr(AMDGPU::S_TRAP)
5346         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5347         .addReg(SGPR01, RegState::Implicit);
5348     MI.eraseFromParent();
5349     return true;
5350   }
5351 
5352   // Pass queue pointer to trap handler as input, and insert trap instruction
5353   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5354   Register LiveIn =
5355     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5356   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5357     return false;
5358 
5359   B.buildCopy(SGPR01, LiveIn);
5360   B.buildInstr(AMDGPU::S_TRAP)
5361       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5362       .addReg(SGPR01, RegState::Implicit);
5363 
5364   MI.eraseFromParent();
5365   return true;
5366 }
5367 
5368 bool AMDGPULegalizerInfo::legalizeTrapHsa(
5369     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5370   B.buildInstr(AMDGPU::S_TRAP)
5371       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5372   MI.eraseFromParent();
5373   return true;
5374 }
5375 
5376 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5377     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5378   // If this is a non-HSA path or the trap handler is disabled, report a
5379   // warning accordingly.
5380   if (!ST.isTrapHandlerEnabled() ||
5381       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5382     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5383                                      "debugtrap handler not supported",
5384                                      MI.getDebugLoc(), DS_Warning);
5385     LLVMContext &Ctx = B.getMF().getFunction().getContext();
5386     Ctx.diagnose(NoTrap);
5387   } else {
5388     // Insert debug-trap instruction
5389     B.buildInstr(AMDGPU::S_TRAP)
5390         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5391   }
5392 
5393   MI.eraseFromParent();
5394   return true;
5395 }
5396 
5397 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5398                                                MachineIRBuilder &B) const {
5399   MachineRegisterInfo &MRI = *B.getMRI();
5400   const LLT S16 = LLT::scalar(16);
5401   const LLT S32 = LLT::scalar(32);
5402   const LLT V2S16 = LLT::fixed_vector(2, 16);
5403   const LLT V3S32 = LLT::fixed_vector(3, 32);
5404 
5405   Register DstReg = MI.getOperand(0).getReg();
5406   Register NodePtr = MI.getOperand(2).getReg();
5407   Register RayExtent = MI.getOperand(3).getReg();
5408   Register RayOrigin = MI.getOperand(4).getReg();
5409   Register RayDir = MI.getOperand(5).getReg();
5410   Register RayInvDir = MI.getOperand(6).getReg();
5411   Register TDescr = MI.getOperand(7).getReg();
5412 
5413   if (!ST.hasGFX10_AEncoding()) {
5414     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5415                                         "intrinsic not supported on subtarget",
5416                                         MI.getDebugLoc());
5417     B.getMF().getFunction().getContext().diagnose(BadIntrin);
5418     return false;
5419   }
5420 
5421   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5422   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5423   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5424   const unsigned NumVDataDwords = 4;
5425   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
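  // For instance, with a 64-bit node pointer and 32-bit ray data this is
  // 2 (node_ptr) + 1 (ray_extent) + 3 (ray_origin) + 3 (ray_dir) +
  // 3 (ray_inv_dir) = 12 dwords; with A16 the dir/inv_dir halves pack into
  // 3 dwords, giving 9.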
5426   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
5427   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
5428   const unsigned BaseOpcodes[2][2] = {
5429       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5430       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5431        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5432   int Opcode;
5433   if (UseNSA) {
5434     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5435                                    IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
5436                                                : AMDGPU::MIMGEncGfx10NSA,
5437                                    NumVDataDwords, NumVAddrDwords);
5438   } else {
5439     Opcode = AMDGPU::getMIMGOpcode(
5440         BaseOpcodes[Is64][IsA16],
5441         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5442         NumVDataDwords, NumVAddrDwords);
5443   }
5444   assert(Opcode != -1);
5445 
5446   SmallVector<Register, 12> Ops;
5447   if (UseNSA && IsGFX11Plus) {
5448     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
5449       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5450       auto Merged = B.buildMergeLikeInstr(
5451           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
5452       Ops.push_back(Merged.getReg(0));
5453     };
5454 
5455     Ops.push_back(NodePtr);
5456     Ops.push_back(RayExtent);
5457     packLanes(RayOrigin);
5458 
5459     if (IsA16) {
5460       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5461       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5462       auto MergedDir = B.buildMergeLikeInstr(
5463           V3S32,
5464           {B.buildBitcast(
5465                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
5466                                                    UnmergeRayDir.getReg(0)}))
5467                .getReg(0),
5468            B.buildBitcast(
5469                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
5470                                                    UnmergeRayDir.getReg(1)}))
5471                .getReg(0),
5472            B.buildBitcast(
5473                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
5474                                                    UnmergeRayDir.getReg(2)}))
5475                .getReg(0)});
5476       Ops.push_back(MergedDir.getReg(0));
5477     } else {
5478       packLanes(RayDir);
5479       packLanes(RayInvDir);
5480     }
5481   } else {
5482     if (Is64) {
5483       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5484       Ops.push_back(Unmerge.getReg(0));
5485       Ops.push_back(Unmerge.getReg(1));
5486     } else {
5487       Ops.push_back(NodePtr);
5488     }
5489     Ops.push_back(RayExtent);
5490 
5491     auto packLanes = [&Ops, &S32, &B](Register Src) {
5492       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5493       Ops.push_back(Unmerge.getReg(0));
5494       Ops.push_back(Unmerge.getReg(1));
5495       Ops.push_back(Unmerge.getReg(2));
5496     };
5497 
5498     packLanes(RayOrigin);
5499     if (IsA16) {
5500       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5501       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5502       Register R1 = MRI.createGenericVirtualRegister(S32);
5503       Register R2 = MRI.createGenericVirtualRegister(S32);
5504       Register R3 = MRI.createGenericVirtualRegister(S32);
5505       B.buildMergeLikeInstr(R1,
5506                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5507       B.buildMergeLikeInstr(
5508           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5509       B.buildMergeLikeInstr(
5510           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5511       Ops.push_back(R1);
5512       Ops.push_back(R2);
5513       Ops.push_back(R3);
5514     } else {
5515       packLanes(RayDir);
5516       packLanes(RayInvDir);
5517     }
5518   }
5519 
5520   if (!UseNSA) {
5521     // Build a single vector containing all the operands so far prepared.
5522     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5523     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
5524     Ops.clear();
5525     Ops.push_back(MergedOps);
5526   }
5527 
5528   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5529     .addDef(DstReg)
5530     .addImm(Opcode);
5531 
5532   for (Register R : Ops) {
5533     MIB.addUse(R);
5534   }
5535 
5536   MIB.addUse(TDescr)
5537      .addImm(IsA16 ? 1 : 0)
5538      .cloneMemRefs(MI);
5539 
5540   MI.eraseFromParent();
5541   return true;
5542 }
5543 
5544 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5545                                                MachineIRBuilder &B) const {
5546   unsigned Opc;
5547   int RoundMode = MI.getOperand(2).getImm();
5548 
5549   if (RoundMode == (int)RoundingMode::TowardPositive)
5550     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5551   else if (RoundMode == (int)RoundingMode::TowardNegative)
5552     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5553   else
5554     return false;
5555 
5556   B.buildInstr(Opc)
5557       .addDef(MI.getOperand(0).getReg())
5558       .addUse(MI.getOperand(1).getReg());
5559 
5560   MI.eraseFromParent();
5561 
5562   return true;
5563 }
5564 
5565 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5566                                             MachineInstr &MI) const {
5567   MachineIRBuilder &B = Helper.MIRBuilder;
5568   MachineRegisterInfo &MRI = *B.getMRI();
5569 
5570   // Replace the G_BRCOND use of these intrinsics with the exec-manipulating
  // branch pseudos.
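  // Roughly: the G_BRCOND that consumes the i1 result of amdgcn.if/else/loop is
  // erased and replaced with SI_IF/SI_ELSE/SI_LOOP, which consume the original
  // condition or mask, define the saved exec mask where applicable, and branch
  // to what was previously the unconditional successor.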
5571   auto IntrID = MI.getIntrinsicID();
5572   switch (IntrID) {
5573   case Intrinsic::amdgcn_if:
5574   case Intrinsic::amdgcn_else: {
5575     MachineInstr *Br = nullptr;
5576     MachineBasicBlock *UncondBrTarget = nullptr;
5577     bool Negated = false;
5578     if (MachineInstr *BrCond =
5579             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5580       const SIRegisterInfo *TRI
5581         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5582 
5583       Register Def = MI.getOperand(1).getReg();
5584       Register Use = MI.getOperand(3).getReg();
5585 
5586       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5587 
5588       if (Negated)
5589         std::swap(CondBrTarget, UncondBrTarget);
5590 
5591       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5592       if (IntrID == Intrinsic::amdgcn_if) {
5593         B.buildInstr(AMDGPU::SI_IF)
5594           .addDef(Def)
5595           .addUse(Use)
5596           .addMBB(UncondBrTarget);
5597       } else {
5598         B.buildInstr(AMDGPU::SI_ELSE)
5599             .addDef(Def)
5600             .addUse(Use)
5601             .addMBB(UncondBrTarget);
5602       }
5603 
5604       if (Br) {
5605         Br->getOperand(0).setMBB(CondBrTarget);
5606       } else {
5607         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5608         // since we're swapping branch targets it needs to be reinserted.
5609         // FIXME: IRTranslator should probably not do this
5610         B.buildBr(*CondBrTarget);
5611       }
5612 
5613       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5614       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5615       MI.eraseFromParent();
5616       BrCond->eraseFromParent();
5617       return true;
5618     }
5619 
5620     return false;
5621   }
5622   case Intrinsic::amdgcn_loop: {
5623     MachineInstr *Br = nullptr;
5624     MachineBasicBlock *UncondBrTarget = nullptr;
5625     bool Negated = false;
5626     if (MachineInstr *BrCond =
5627             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5628       const SIRegisterInfo *TRI
5629         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5630 
5631       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5632       Register Reg = MI.getOperand(2).getReg();
5633 
5634       if (Negated)
5635         std::swap(CondBrTarget, UncondBrTarget);
5636 
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
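    // The wavefront size is a subtarget constant, so fold the intrinsic to a
    // constant.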
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
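  // Buffer load/store variants: the boolean flags select the format and
  // typed (tbuffer) forms of the expansion.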
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
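  // All buffer atomics share one expansion; the intrinsic ID selects the
  // corresponding target buffer-atomic pseudo.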
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
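    // Image intrinsics are handled via the image dimension intrinsic table;
    // anything else needs no custom legalization.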
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}