1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of vector elements up to the next power of two.
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
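// Illustrative example: for <3 x s16>, Log2_32_Ceil(3) == 2, so the element
// count is rounded up to 1 << 2 == 4 and the result is <4 x s16>.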
54 
55 // Round the size in bits up to the next power of two, returning a scalar type.
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
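// Illustrative example: for s48, Log2_32_Ceil(48) == 6, so the result is
// LLT::scalar(1 << 6) == s64.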
61 
62 /// \returns true if this is an odd sized vector which should be widened by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
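// Illustrative examples: <3 x s16> (odd count, 16-bit elements, 48 bits total)
// satisfies this predicate, while <4 x s16> (even count) and <3 x s32>
// (elements already 32 bits wide) do not.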
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
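// Illustrative example: for <4 x s32> (128 bits), Pieces == (128 + 63) / 64 == 2
// and NewNumElts == (4 + 1) / 2 == 2, so the type is reduced to <2 x s32>.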
112 
113 // Increase the number of vector elements so that the total size reaches the
114 // next multiple of 32 bits.
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
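// Illustrative example: for <3 x s8> (24 bits), NextMul32 == 1 and
// NewNumElts == (32 * 1 + 8 - 1) / 8 == 4, so the type becomes <4 x s8> (32 bits).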
130 
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
134   LLT CoercedTy;
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
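// Illustrative example for sizes above 32 bits: <6 x s16> (96 bits) maps to
// LLT::scalarOrVector(96 / 32, 32), i.e. <3 x s32>.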
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
207 // Any combination of 32 or 64-bit elements up to the maximum register size, and
208 // multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226   return [=](const LegalityQuery &Query) {
227     const LLT Ty = Query.Types[TypeIdx];
228     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230   };
231 }
232 
233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234 // handle some operations by just promoting the register during
235 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237                                     bool IsLoad) {
238   switch (AS) {
239   case AMDGPUAS::PRIVATE_ADDRESS:
240     // FIXME: Private element size.
241     return ST.enableFlatScratch() ? 128 : 32;
242   case AMDGPUAS::LOCAL_ADDRESS:
243     return ST.useDS128() ? 128 : 64;
244   case AMDGPUAS::GLOBAL_ADDRESS:
245   case AMDGPUAS::CONSTANT_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
247     // Treat constant and global as identical. SMRD loads are sometimes usable for
248     // global loads (ideally constant address space should be eliminated)
249     // depending on the context. Legality cannot be context dependent, but
250     // RegBankSelect can split the load as necessary depending on the pointer
251     // register bank/uniformity and if the memory is invariant or not written in a
252     // kernel.
253     return IsLoad ? 512 : 128;
254   default:
255     // Flat addresses may contextually need to be split to 32-bit parts if they
256     // may alias scratch depending on the subtarget.
257     return 128;
258   }
259 }
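// For example, with these limits a 512-bit (dwordx16) global or constant load is
// still treated as a single legal memory access here, while a 512-bit store to
// the same address spaces is capped at 128 bits and gets split by the load/store
// rules built below.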
260 
261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262                                  const LegalityQuery &Query,
263                                  unsigned Opcode) {
264   const LLT Ty = Query.Types[0];
265 
266   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267   const bool IsLoad = Opcode != AMDGPU::G_STORE;
268 
269   unsigned RegSize = Ty.getSizeInBits();
270   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272   unsigned AS = Query.Types[1].getAddressSpace();
273 
274   // All of these need to be custom lowered to cast the pointer operand.
275   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276     return false;
277 
278   // TODO: We should be able to widen loads if the alignment is high enough, but
279   // we also need to modify the memory access size.
280 #if 0
281   // Accept widening loads based on alignment.
282   if (IsLoad && MemSize < Size)
283     MemSize = std::max(MemSize, Align);
284 #endif
285 
286   // Only 1-byte and 2-byte to 32-bit extloads are valid.
287   if (MemSize != RegSize && RegSize != 32)
288     return false;
289 
290   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291     return false;
292 
293   switch (MemSize) {
294   case 8:
295   case 16:
296   case 32:
297   case 64:
298   case 128:
299     break;
300   case 96:
301     if (!ST.hasDwordx3LoadStores())
302       return false;
303     break;
304   case 256:
305   case 512:
306     // These may contextually need to be broken down.
307     break;
308   default:
309     return false;
310   }
311 
312   assert(RegSize >= MemSize);
313 
314   if (AlignBits < MemSize) {
315     const SITargetLowering *TLI = ST.getTargetLowering();
316     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317                                                  Align(AlignBits / 8)))
318       return false;
319   }
320 
321   return true;
322 }
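// Illustrative example, assuming sufficient alignment: an s32 extending load of
// 8 bits from global memory is accepted (RegSize == 32), while an s64 extending
// load of 8 bits is rejected because MemSize != RegSize and RegSize != 32.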
323 
324 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
325 // work around this. Eventually it should ignore the type for loads and only care
326 // about the size. Return true in cases where we will work around this for now by
327 // bitcasting.
328 static bool loadStoreBitcastWorkaround(const LLT Ty) {
329   if (EnableNewLegality)
330     return false;
331 
332   const unsigned Size = Ty.getSizeInBits();
333   if (Size <= 64)
334     return false;
335   if (!Ty.isVector())
336     return true;
337 
338   LLT EltTy = Ty.getElementType();
339   if (EltTy.isPointer())
340     return true;
341 
342   unsigned EltSize = EltTy.getSizeInBits();
343   return EltSize != 32 && EltSize != 64;
344 }
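// Illustrative examples: <8 x s16> (128 bits, 16-bit elements) and s96 both hit
// the workaround and return true, while <4 x s32> does not.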
345 
346 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347                              unsigned Opcode) {
348   const LLT Ty = Query.Types[0];
349   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350          !loadStoreBitcastWorkaround(Ty);
351 }
352 
353 /// Return true if a load or store of the type should be lowered with a bitcast
354 /// to a different type.
355 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356                                        const unsigned MemSizeInBits) {
357   const unsigned Size = Ty.getSizeInBits();
358   if (Size != MemSizeInBits)
359     return Size <= 32 && Ty.isVector();
360 
361   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362     return true;
363   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364          !isRegisterVectorElementType(Ty.getElementType());
365 }
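// Illustrative example: a <6 x s16> load with a 96-bit memory access is bitcast
// because loadStoreBitcastWorkaround returns true and <6 x s16> is a register
// type (96 bits, even number of 16-bit elements); getBitcastRegisterType then
// selects <3 x s32>.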
366 
367 /// Return true if we should legalize a load by widening an odd sized memory
368 /// access up to the alignment. Note that in this case it is the memory access
369 /// itself that changes, not the size of the result register.
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371                             unsigned AlignInBits, unsigned AddrSpace,
372                             unsigned Opcode) {
373   // We don't want to widen cases that are naturally legal.
374   if (isPowerOf2_32(SizeInBits))
375     return false;
376 
377   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378   // end up widening these for a scalar load during RegBankSelect, since there
379   // aren't 96-bit scalar loads.
380   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381     return false;
382 
383   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384     return false;
385 
386   // A load is known dereferenceable up to the alignment, so it's legal to widen
387   // to it.
388   //
389   // TODO: Could check dereferenceable for less aligned cases.
390   unsigned RoundedSize = NextPowerOf2(SizeInBits);
391   if (AlignInBits < RoundedSize)
392     return false;
393 
394   // Do not widen if it would introduce a slow unaligned load.
395   const SITargetLowering *TLI = ST.getTargetLowering();
396   bool Fast = false;
397   return TLI->allowsMisalignedMemoryAccessesImpl(
398              RoundedSize, AddrSpace, Align(AlignInBits / 8),
399              MachineMemOperand::MOLoad, &Fast) &&
400          Fast;
401 }
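// Illustrative example, assuming a subtarget without 96-bit (dwordx3) loads: a
// non-atomic 96-bit load known to be 128-bit aligned is widened, since
// NextPowerOf2(96) == 128 fits within the alignment, provided the target reports
// the widened access as fast.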
402 
403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404                             unsigned Opcode) {
405   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406     return false;
407 
408   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409                          Query.MMODescrs[0].AlignInBits,
410                          Query.Types[1].getAddressSpace(), Opcode);
411 }
412 
413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414                                          const GCNTargetMachine &TM)
415   :  ST(ST_) {
416   using namespace TargetOpcode;
417 
418   auto GetAddrSpacePtr = [&TM](unsigned AS) {
419     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420   };
421 
422   const LLT S1 = LLT::scalar(1);
423   const LLT S8 = LLT::scalar(8);
424   const LLT S16 = LLT::scalar(16);
425   const LLT S32 = LLT::scalar(32);
426   const LLT S64 = LLT::scalar(64);
427   const LLT S128 = LLT::scalar(128);
428   const LLT S256 = LLT::scalar(256);
429   const LLT S512 = LLT::scalar(512);
430   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432   const LLT V2S8 = LLT::vector(2, 8);
433   const LLT V2S16 = LLT::vector(2, 16);
434   const LLT V4S16 = LLT::vector(4, 16);
435 
436   const LLT V2S32 = LLT::vector(2, 32);
437   const LLT V3S32 = LLT::vector(3, 32);
438   const LLT V4S32 = LLT::vector(4, 32);
439   const LLT V5S32 = LLT::vector(5, 32);
440   const LLT V6S32 = LLT::vector(6, 32);
441   const LLT V7S32 = LLT::vector(7, 32);
442   const LLT V8S32 = LLT::vector(8, 32);
443   const LLT V9S32 = LLT::vector(9, 32);
444   const LLT V10S32 = LLT::vector(10, 32);
445   const LLT V11S32 = LLT::vector(11, 32);
446   const LLT V12S32 = LLT::vector(12, 32);
447   const LLT V13S32 = LLT::vector(13, 32);
448   const LLT V14S32 = LLT::vector(14, 32);
449   const LLT V15S32 = LLT::vector(15, 32);
450   const LLT V16S32 = LLT::vector(16, 32);
451   const LLT V32S32 = LLT::vector(32, 32);
452 
453   const LLT V2S64 = LLT::vector(2, 64);
454   const LLT V3S64 = LLT::vector(3, 64);
455   const LLT V4S64 = LLT::vector(4, 64);
456   const LLT V5S64 = LLT::vector(5, 64);
457   const LLT V6S64 = LLT::vector(6, 64);
458   const LLT V7S64 = LLT::vector(7, 64);
459   const LLT V8S64 = LLT::vector(8, 64);
460   const LLT V16S64 = LLT::vector(16, 64);
461 
462   std::initializer_list<LLT> AllS32Vectors =
463     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465   std::initializer_list<LLT> AllS64Vectors =
466     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467 
468   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475 
476   const LLT CodePtr = FlatPtr;
477 
478   const std::initializer_list<LLT> AddrSpaces64 = {
479     GlobalPtr, ConstantPtr, FlatPtr
480   };
481 
482   const std::initializer_list<LLT> AddrSpaces32 = {
483     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484   };
485 
486   const std::initializer_list<LLT> FPTypesBase = {
487     S32, S64
488   };
489 
490   const std::initializer_list<LLT> FPTypes16 = {
491     S32, S64, S16
492   };
493 
494   const std::initializer_list<LLT> FPTypesPK16 = {
495     S32, S64, S16, V2S16
496   };
497 
498   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499 
500   setAction({G_BRCOND, S1}, Legal); // VCC branches
501   setAction({G_BRCOND, S32}, Legal); // SCC branches
502 
503   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504   // elements for v3s16
505   getActionDefinitionsBuilder(G_PHI)
506     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507     .legalFor(AllS32Vectors)
508     .legalFor(AllS64Vectors)
509     .legalFor(AddrSpaces64)
510     .legalFor(AddrSpaces32)
511     .legalIf(isPointer(0))
512     .clampScalar(0, S16, S256)
513     .widenScalarToNextPow2(0, 32)
514     .clampMaxNumElements(0, S32, 16)
515     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516     .scalarize(0);
517 
518   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519     // Full set of gfx9 features.
520     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521       .legalFor({S32, S16, V2S16})
522       .clampScalar(0, S16, S32)
523       .clampMaxNumElements(0, S16, 2)
524       .scalarize(0)
525       .widenScalarToNextPow2(0, 32);
526 
527     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528       .legalFor({S32, S16, V2S16}) // Clamp modifier
529       .minScalarOrElt(0, S16)
530       .clampMaxNumElements(0, S16, 2)
531       .scalarize(0)
532       .widenScalarToNextPow2(0, 32)
533       .lower();
534   } else if (ST.has16BitInsts()) {
535     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536       .legalFor({S32, S16})
537       .clampScalar(0, S16, S32)
538       .scalarize(0)
539       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540 
541     // Technically the saturating operations require clamp bit support, but this
542     // was introduced at the same time as 16-bit operations.
543     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544       .legalFor({S32, S16}) // Clamp modifier
545       .minScalar(0, S16)
546       .scalarize(0)
547       .widenScalarToNextPow2(0, 16)
548       .lower();
549 
550     // We're just lowering this, but it helps get a better result to try to
551     // coerce to the desired type first.
552     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553       .minScalar(0, S16)
554       .scalarize(0)
555       .lower();
556   } else {
557     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558       .legalFor({S32})
559       .clampScalar(0, S32, S32)
560       .scalarize(0);
561 
562     if (ST.hasIntClamp()) {
563       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564         .legalFor({S32}) // Clamp modifier.
565         .scalarize(0)
566         .minScalarOrElt(0, S32)
567         .lower();
568     } else {
569       // Clamp bit support was added in VI, along with 16-bit operations.
570       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571         .minScalar(0, S32)
572         .scalarize(0)
573         .lower();
574     }
575 
576     // FIXME: DAG expansion gets better results. The widening uses the smaller
577     // range values and goes for the min/max lowering directly.
578     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579       .minScalar(0, S32)
580       .scalarize(0)
581       .lower();
582   }
583 
584   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585     .customFor({S32, S64})
586     .clampScalar(0, S32, S64)
587     .widenScalarToNextPow2(0, 32)
588     .scalarize(0);
589 
590   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591                    .legalFor({S32})
592                    .maxScalarOrElt(0, S32);
593 
594   if (ST.hasVOP3PInsts()) {
595     Mulh
596       .clampMaxNumElements(0, S8, 2)
597       .lowerFor({V2S8});
598   }
599 
600   Mulh
601     .scalarize(0)
602     .lower();
603 
604   // Report legal for any types we can handle anywhere. For the cases only legal
605   // on the SALU, RegBankSelect will be able to re-legalize.
606   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608     .clampScalar(0, S32, S64)
609     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611     .widenScalarToNextPow2(0)
612     .scalarize(0);
613 
614   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616     .legalFor({{S32, S1}, {S32, S32}})
617     .minScalar(0, S32)
618     // TODO: .scalarize(0)
619     .lower();
620 
621   getActionDefinitionsBuilder(G_BITCAST)
622     // Don't worry about the size constraint.
623     .legalIf(all(isRegisterType(0), isRegisterType(1)))
624     .lower();
625 
626 
627   getActionDefinitionsBuilder(G_CONSTANT)
628     .legalFor({S1, S32, S64, S16, GlobalPtr,
629                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
630     .legalIf(isPointer(0))
631     .clampScalar(0, S32, S64)
632     .widenScalarToNextPow2(0);
633 
634   getActionDefinitionsBuilder(G_FCONSTANT)
635     .legalFor({S32, S64, S16})
636     .clampScalar(0, S16, S64);
637 
638   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639       .legalIf(isRegisterType(0))
640       // s1 and s16 are special cases because they have legal operations on
641       // them, but don't really occupy registers in the normal way.
642       .legalFor({S1, S16})
643       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644       .clampScalarOrElt(0, S32, MaxScalar)
645       .widenScalarToNextPow2(0, 32)
646       .clampMaxNumElements(0, S32, 16);
647 
648   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649 
650   // If the amount is divergent, we have to do a wave reduction to get the
651   // maximum value, so this is expanded during RegBankSelect.
652   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653     .legalFor({{PrivatePtr, S32}});
654 
655   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656     .customIf(typeIsNot(0, PrivatePtr));
657 
658   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659 
660   auto &FPOpActions = getActionDefinitionsBuilder(
661     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662     .legalFor({S32, S64});
663   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664     .customFor({S32, S64});
665   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666     .customFor({S32, S64});
667 
668   if (ST.has16BitInsts()) {
669     if (ST.hasVOP3PInsts())
670       FPOpActions.legalFor({S16, V2S16});
671     else
672       FPOpActions.legalFor({S16});
673 
674     TrigActions.customFor({S16});
675     FDIVActions.customFor({S16});
676   }
677 
678   auto &MinNumMaxNum = getActionDefinitionsBuilder({
679       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680 
681   if (ST.hasVOP3PInsts()) {
682     MinNumMaxNum.customFor(FPTypesPK16)
683       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
684       .clampMaxNumElements(0, S16, 2)
685       .clampScalar(0, S16, S64)
686       .scalarize(0);
687   } else if (ST.has16BitInsts()) {
688     MinNumMaxNum.customFor(FPTypes16)
689       .clampScalar(0, S16, S64)
690       .scalarize(0);
691   } else {
692     MinNumMaxNum.customFor(FPTypesBase)
693       .clampScalar(0, S32, S64)
694       .scalarize(0);
695   }
696 
697   if (ST.hasVOP3PInsts())
698     FPOpActions.clampMaxNumElements(0, S16, 2);
699 
700   FPOpActions
701     .scalarize(0)
702     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703 
704   TrigActions
705     .scalarize(0)
706     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707 
708   FDIVActions
709     .scalarize(0)
710     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711 
712   getActionDefinitionsBuilder({G_FNEG, G_FABS})
713     .legalFor(FPTypesPK16)
714     .clampMaxNumElements(0, S16, 2)
715     .scalarize(0)
716     .clampScalar(0, S16, S64);
717 
718   if (ST.has16BitInsts()) {
719     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720       .legalFor({S32, S64, S16})
721       .scalarize(0)
722       .clampScalar(0, S16, S64);
723   } else {
724     getActionDefinitionsBuilder(G_FSQRT)
725       .legalFor({S32, S64})
726       .scalarize(0)
727       .clampScalar(0, S32, S64);
728 
729     if (ST.hasFractBug()) {
730       getActionDefinitionsBuilder(G_FFLOOR)
731         .customFor({S64})
732         .legalFor({S32, S64})
733         .scalarize(0)
734         .clampScalar(0, S32, S64);
735     } else {
736       getActionDefinitionsBuilder(G_FFLOOR)
737         .legalFor({S32, S64})
738         .scalarize(0)
739         .clampScalar(0, S32, S64);
740     }
741   }
742 
743   getActionDefinitionsBuilder(G_FPTRUNC)
744     .legalFor({{S32, S64}, {S16, S32}})
745     .scalarize(0)
746     .lower();
747 
748   getActionDefinitionsBuilder(G_FPEXT)
749     .legalFor({{S64, S32}, {S32, S16}})
750     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751     .scalarize(0);
752 
753   getActionDefinitionsBuilder(G_FSUB)
754       // Use actual fsub instruction
755       .legalFor({S32})
756       // Must use fadd + fneg
757       .lowerFor({S64, S16, V2S16})
758       .scalarize(0)
759       .clampScalar(0, S32, S64);
760 
761   // Whether this is legal depends on the floating point mode for the function.
762   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764     FMad.customFor({S32, S16});
765   else if (ST.hasMadMacF32Insts())
766     FMad.customFor({S32});
767   else if (ST.hasMadF16())
768     FMad.customFor({S16});
769   FMad.scalarize(0)
770       .lower();
771 
772   auto &FRem = getActionDefinitionsBuilder(G_FREM);
773   if (ST.has16BitInsts()) {
774     FRem.customFor({S16, S32, S64});
775   } else {
776     FRem.minScalar(0, S32)
777         .customFor({S32, S64});
778   }
779   FRem.scalarize(0);
780 
781   // TODO: Do we need to clamp maximum bitwidth?
782   getActionDefinitionsBuilder(G_TRUNC)
783     .legalIf(isScalar(0))
784     .legalFor({{V2S16, V2S32}})
785     .clampMaxNumElements(0, S16, 2)
786     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787     // situations (like an invalid implicit use), we don't want to infinite loop
788     // in the legalizer.
789     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
790     .alwaysLegal();
791 
792   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794                {S32, S1}, {S64, S1}, {S16, S1}})
795     .scalarize(0)
796     .clampScalar(0, S32, S64)
797     .widenScalarToNextPow2(1, 32);
798 
799   // TODO: Split s1->s64 during regbankselect for VALU.
800   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802     .lowerFor({{S32, S64}})
803     .lowerIf(typeIs(1, S1))
804     .customFor({{S64, S64}});
805   if (ST.has16BitInsts())
806     IToFP.legalFor({{S16, S16}});
807   IToFP.clampScalar(1, S32, S64)
808        .minScalar(0, S32)
809        .scalarize(0)
810        .widenScalarToNextPow2(1);
811 
812   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814     .customFor({{S64, S64}})
815     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816   if (ST.has16BitInsts())
817     FPToI.legalFor({{S16, S16}});
818   else
819     FPToI.minScalar(1, S32);
820 
821   FPToI.minScalar(0, S32)
822        .widenScalarToNextPow2(0, 32)
823        .scalarize(0)
824        .lower();
825 
826   // Lower roundeven into G_FRINT
827   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
828     .scalarize(0)
829     .lower();
830 
831   if (ST.has16BitInsts()) {
832     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
833       .legalFor({S16, S32, S64})
834       .clampScalar(0, S16, S64)
835       .scalarize(0);
836   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
837     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
838       .legalFor({S32, S64})
839       .clampScalar(0, S32, S64)
840       .scalarize(0);
841   } else {
842     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
843       .legalFor({S32})
844       .customFor({S64})
845       .clampScalar(0, S32, S64)
846       .scalarize(0);
847   }
848 
849   getActionDefinitionsBuilder(G_PTR_ADD)
850     .legalIf(all(isPointer(0), sameSize(0, 1)))
851     .scalarize(0)
852     .scalarSameSizeAs(1, 0);
853 
854   getActionDefinitionsBuilder(G_PTRMASK)
855     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
856     .scalarSameSizeAs(1, 0)
857     .scalarize(0);
858 
859   auto &CmpBuilder =
860     getActionDefinitionsBuilder(G_ICMP)
861     // The compare output type differs based on the register bank of the output,
862     // so make both s1 and s32 legal.
863     //
864     // Scalar compares producing output in scc will be promoted to s32, as that
865     // is the allocatable register type that will be needed for the copy from
866     // scc. This will be promoted during RegBankSelect, and we assume something
867     // before that won't try to use s32 result types.
868     //
869     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
870     // bank.
871     .legalForCartesianProduct(
872       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
873     .legalForCartesianProduct(
874       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
875   if (ST.has16BitInsts()) {
876     CmpBuilder.legalFor({{S1, S16}});
877   }
878 
879   CmpBuilder
880     .widenScalarToNextPow2(1)
881     .clampScalar(1, S32, S64)
882     .scalarize(0)
883     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
884 
885   getActionDefinitionsBuilder(G_FCMP)
886     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
887     .widenScalarToNextPow2(1)
888     .clampScalar(1, S32, S64)
889     .scalarize(0);
890 
891   // FIXME: fpow has a selection pattern that should move to custom lowering.
892   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
893   if (ST.has16BitInsts())
894     Exp2Ops.legalFor({S32, S16});
895   else
896     Exp2Ops.legalFor({S32});
897   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
898   Exp2Ops.scalarize(0);
899 
900   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
901   if (ST.has16BitInsts())
902     ExpOps.customFor({{S32}, {S16}});
903   else
904     ExpOps.customFor({S32});
905   ExpOps.clampScalar(0, MinScalarFPTy, S32)
906         .scalarize(0);
907 
908   getActionDefinitionsBuilder(G_FPOWI)
909     .clampScalar(0, MinScalarFPTy, S32)
910     .lower();
911 
912   // The 64-bit versions produce 32-bit results, but only on the SALU.
913   getActionDefinitionsBuilder(G_CTPOP)
914     .legalFor({{S32, S32}, {S32, S64}})
915     .clampScalar(0, S32, S32)
916     .clampScalar(1, S32, S64)
917     .scalarize(0)
918     .widenScalarToNextPow2(0, 32)
919     .widenScalarToNextPow2(1, 32);
920 
921   // The hardware instructions return a different result on 0 than the generic
922   // instructions expect. The hardware produces -1, but these produce the
923   // bitwidth.
924   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
925     .scalarize(0)
926     .clampScalar(0, S32, S32)
927     .clampScalar(1, S32, S64)
928     .widenScalarToNextPow2(0, 32)
929     .widenScalarToNextPow2(1, 32)
930     .lower();
931 
932   // The 64-bit versions produce 32-bit results, but only on the SALU.
933   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
934     .legalFor({{S32, S32}, {S32, S64}})
935     .clampScalar(0, S32, S32)
936     .clampScalar(1, S32, S64)
937     .scalarize(0)
938     .widenScalarToNextPow2(0, 32)
939     .widenScalarToNextPow2(1, 32);
940 
941   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
942   // RegBankSelect.
943   getActionDefinitionsBuilder(G_BITREVERSE)
944     .legalFor({S32, S64})
945     .clampScalar(0, S32, S64)
946     .scalarize(0)
947     .widenScalarToNextPow2(0);
948 
949   if (ST.has16BitInsts()) {
950     getActionDefinitionsBuilder(G_BSWAP)
951       .legalFor({S16, S32, V2S16})
952       .clampMaxNumElements(0, S16, 2)
953       // FIXME: Fixing non-power-of-2 before clamp is workaround for
954       // narrowScalar limitation.
955       .widenScalarToNextPow2(0)
956       .clampScalar(0, S16, S32)
957       .scalarize(0);
958 
959     if (ST.hasVOP3PInsts()) {
960       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
961         .legalFor({S32, S16, V2S16})
962         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
963         .clampMaxNumElements(0, S16, 2)
964         .minScalar(0, S16)
965         .widenScalarToNextPow2(0)
966         .scalarize(0)
967         .lower();
968     } else {
969       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
970         .legalFor({S32, S16})
971         .widenScalarToNextPow2(0)
972         .minScalar(0, S16)
973         .scalarize(0)
974         .lower();
975     }
976   } else {
977     // TODO: Should have same legality without v_perm_b32
978     getActionDefinitionsBuilder(G_BSWAP)
979       .legalFor({S32})
980       .lowerIf(scalarNarrowerThan(0, 32))
981       // FIXME: Fixing non-power-of-2 before clamp is workaround for
982       // narrowScalar limitation.
983       .widenScalarToNextPow2(0)
984       .maxScalar(0, S32)
985       .scalarize(0)
986       .lower();
987 
988     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
989       .legalFor({S32})
990       .minScalar(0, S32)
991       .widenScalarToNextPow2(0)
992       .scalarize(0)
993       .lower();
994   }
995 
996   getActionDefinitionsBuilder(G_INTTOPTR)
997     // List the common cases
998     .legalForCartesianProduct(AddrSpaces64, {S64})
999     .legalForCartesianProduct(AddrSpaces32, {S32})
1000     .scalarize(0)
1001     // Accept any address space as long as the size matches
1002     .legalIf(sameSize(0, 1))
1003     .widenScalarIf(smallerThan(1, 0),
1004       [](const LegalityQuery &Query) {
1005         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1006       })
1007     .narrowScalarIf(largerThan(1, 0),
1008       [](const LegalityQuery &Query) {
1009         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1010       });
1011 
1012   getActionDefinitionsBuilder(G_PTRTOINT)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
1019     .widenScalarIf(smallerThan(0, 1),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1022       })
1023     .narrowScalarIf(
1024       largerThan(0, 1),
1025       [](const LegalityQuery &Query) {
1026         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1027       });
1028 
1029   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1030     .scalarize(0)
1031     .custom();
1032 
1033   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1034                                     bool IsLoad) -> bool {
1035     const LLT DstTy = Query.Types[0];
1036 
1037     // Split vector extloads.
1038     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1039     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1040 
1041     if (MemSize < DstTy.getSizeInBits())
1042       MemSize = std::max(MemSize, AlignBits);
1043 
1044     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1045       return true;
1046 
1047     const LLT PtrTy = Query.Types[1];
1048     unsigned AS = PtrTy.getAddressSpace();
1049     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1050       return true;
1051 
1052     // Catch weird sized loads that don't evenly divide into the access sizes
1053     // TODO: May be able to widen depending on alignment etc.
1054     unsigned NumRegs = (MemSize + 31) / 32;
1055     if (NumRegs == 3) {
1056       if (!ST.hasDwordx3LoadStores())
1057         return true;
1058     } else {
1059       // If the alignment allows, these should have been widened.
1060       if (!isPowerOf2_32(NumRegs))
1061         return true;
1062     }
1063 
1064     if (AlignBits < MemSize) {
1065       const SITargetLowering *TLI = ST.getTargetLowering();
1066       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1067                                                       Align(AlignBits / 8));
1068     }
1069 
1070     return false;
1071   };
1072 
1073   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1074   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1075   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1076 
1077   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1078   // LDS
1079   // TODO: Unsupported flat for SI.
1080 
1081   for (unsigned Op : {G_LOAD, G_STORE}) {
1082     const bool IsStore = Op == G_STORE;
1083 
1084     auto &Actions = getActionDefinitionsBuilder(Op);
1085     // Explicitly list some common cases.
1086     // TODO: Does this help compile time at all?
1087     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1088                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1089                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1090                                       {S64, GlobalPtr, 64, GlobalAlign32},
1091                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1092                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1093                                       {S32, GlobalPtr, 8, GlobalAlign8},
1094                                       {S32, GlobalPtr, 16, GlobalAlign16},
1095 
1096                                       {S32, LocalPtr, 32, 32},
1097                                       {S64, LocalPtr, 64, 32},
1098                                       {V2S32, LocalPtr, 64, 32},
1099                                       {S32, LocalPtr, 8, 8},
1100                                       {S32, LocalPtr, 16, 16},
1101                                       {V2S16, LocalPtr, 32, 32},
1102 
1103                                       {S32, PrivatePtr, 32, 32},
1104                                       {S32, PrivatePtr, 8, 8},
1105                                       {S32, PrivatePtr, 16, 16},
1106                                       {V2S16, PrivatePtr, 32, 32},
1107 
1108                                       {S32, ConstantPtr, 32, GlobalAlign32},
1109                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1110                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1111                                       {S64, ConstantPtr, 64, GlobalAlign32},
1112                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1113     Actions.legalIf(
1114       [=](const LegalityQuery &Query) -> bool {
1115         return isLoadStoreLegal(ST, Query, Op);
1116       });
1117 
1118     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1119     // 64-bits.
1120     //
1121     // TODO: Should generalize bitcast action into coerce, which will also cover
1122     // inserting addrspacecasts.
1123     Actions.customIf(typeIs(1, Constant32Ptr));
1124 
1125     // Turn any illegal element vectors into something easier to deal
1126     // with. These will ultimately produce 32-bit scalar shifts to extract the
1127     // parts anyway.
1128     //
1129     // For odd 16-bit element vectors, prefer to split those into pieces with
1130     // 16-bit vector parts.
1131     Actions.bitcastIf(
1132       [=](const LegalityQuery &Query) -> bool {
1133         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1134                                           Query.MMODescrs[0].SizeInBits);
1135       }, bitcastToRegisterType(0));
1136 
1137     if (!IsStore) {
1138       // Widen suitably aligned loads by loading extra bytes. The standard
1139       // legalization actions can't properly express widening memory operands.
1140       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1141         return shouldWidenLoad(ST, Query, G_LOAD);
1142       });
1143     }
1144 
1145     // FIXME: load/store narrowing should be moved to lower action
1146     Actions
1147         .narrowScalarIf(
1148             [=](const LegalityQuery &Query) -> bool {
1149               return !Query.Types[0].isVector() &&
1150                      needToSplitMemOp(Query, Op == G_LOAD);
1151             },
1152             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1153               const LLT DstTy = Query.Types[0];
1154               const LLT PtrTy = Query.Types[1];
1155 
1156               const unsigned DstSize = DstTy.getSizeInBits();
1157               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1158 
1159               // Split extloads.
1160               if (DstSize > MemSize)
1161                 return std::make_pair(0, LLT::scalar(MemSize));
1162 
1163               if (!isPowerOf2_32(DstSize)) {
1164                 // We're probably decomposing an odd sized store. Try to split
1165                 // to the widest type. TODO: Account for alignment. As-is it
1166                 // should be OK, since the new parts will be further legalized.
1167                 unsigned FloorSize = PowerOf2Floor(DstSize);
1168                 return std::make_pair(0, LLT::scalar(FloorSize));
1169               }
1170 
1171               if (DstSize > 32 && (DstSize % 32 != 0)) {
1172                 // FIXME: Need a way to specify non-extload of larger size if
1173                 // suitably aligned.
1174                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1175               }
1176 
1177               unsigned MaxSize = maxSizeForAddrSpace(ST,
1178                                                      PtrTy.getAddressSpace(),
1179                                                      Op == G_LOAD);
1180               if (MemSize > MaxSize)
1181                 return std::make_pair(0, LLT::scalar(MaxSize));
1182 
1183               unsigned Align = Query.MMODescrs[0].AlignInBits;
1184               return std::make_pair(0, LLT::scalar(Align));
1185             })
1186         .fewerElementsIf(
1187             [=](const LegalityQuery &Query) -> bool {
1188               return Query.Types[0].isVector() &&
1189                      needToSplitMemOp(Query, Op == G_LOAD);
1190             },
1191             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1192               const LLT DstTy = Query.Types[0];
1193               const LLT PtrTy = Query.Types[1];
1194 
1195               LLT EltTy = DstTy.getElementType();
1196               unsigned MaxSize = maxSizeForAddrSpace(ST,
1197                                                      PtrTy.getAddressSpace(),
1198                                                      Op == G_LOAD);
1199 
1200               // FIXME: Handle widened to power of 2 results better. This ends
1201               // up scalarizing.
1202               // FIXME: 3 element stores scalarized on SI
1203 
1204               // Split if it's too large for the address space.
1205               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1206                 unsigned NumElts = DstTy.getNumElements();
1207                 unsigned EltSize = EltTy.getSizeInBits();
1208 
1209                 if (MaxSize % EltSize == 0) {
1210                   return std::make_pair(
1211                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1212                 }
1213 
1214                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1215 
1216                 // FIXME: Refine when odd breakdowns handled
1217                 // The scalars will need to be re-legalized.
1218                 if (NumPieces == 1 || NumPieces >= NumElts ||
1219                     NumElts % NumPieces != 0)
1220                   return std::make_pair(0, EltTy);
1221 
1222                 return std::make_pair(0,
1223                                       LLT::vector(NumElts / NumPieces, EltTy));
1224               }
1225 
1226               // FIXME: We could probably handle weird extending loads better.
1227               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1228               if (DstTy.getSizeInBits() > MemSize)
1229                 return std::make_pair(0, EltTy);
1230 
1231               unsigned EltSize = EltTy.getSizeInBits();
1232               unsigned DstSize = DstTy.getSizeInBits();
1233               if (!isPowerOf2_32(DstSize)) {
1234                 // We're probably decomposing an odd sized store. Try to split
1235                 // to the widest type. TODO: Account for alignment. As-is it
1236                 // should be OK, since the new parts will be further legalized.
1237                 unsigned FloorSize = PowerOf2Floor(DstSize);
1238                 return std::make_pair(
1239                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1240               }
1241 
1242               // Need to split because of alignment.
1243               unsigned Align = Query.MMODescrs[0].AlignInBits;
1244               if (EltSize > Align &&
1245                   (EltSize / Align < DstTy.getNumElements())) {
1246                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1247               }
1248 
1249               // May need relegalization for the scalars.
1250               return std::make_pair(0, EltTy);
1251             })
1252     .lowerIfMemSizeNotPow2()
1253     .minScalar(0, S32);
1254 
1255     if (IsStore)
1256       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1257 
1258     Actions
1259         .widenScalarToNextPow2(0)
1260         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1261         .lower();
1262   }
1263 
1264   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1265                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1266                                                   {S32, GlobalPtr, 16, 2 * 8},
1267                                                   {S32, LocalPtr, 8, 8},
1268                                                   {S32, LocalPtr, 16, 16},
1269                                                   {S32, PrivatePtr, 8, 8},
1270                                                   {S32, PrivatePtr, 16, 16},
1271                                                   {S32, ConstantPtr, 8, 8},
1272                                                   {S32, ConstantPtr, 16, 2 * 8}});
1273   if (ST.hasFlatAddressSpace()) {
1274     ExtLoads.legalForTypesWithMemDesc(
1275         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1276   }
1277 
1278   ExtLoads.clampScalar(0, S32, S32)
1279           .widenScalarToNextPow2(0)
1280           .unsupportedIfMemSizeNotPow2()
1281           .lower();
1282 
1283   auto &Atomics = getActionDefinitionsBuilder(
1284     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287      G_ATOMICRMW_UMIN})
1288     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289                {S64, GlobalPtr}, {S64, LocalPtr},
1290                {S32, RegionPtr}, {S64, RegionPtr}});
1291   if (ST.hasFlatAddressSpace()) {
1292     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293   }
1294 
1295   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296   if (ST.hasLDSFPAtomics()) {
1297     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298     if (ST.hasGFX90AInsts())
1299       Atomic.legalFor({{S64, LocalPtr}});
1300   }
1301   if (ST.hasAtomicFaddInsts())
1302     Atomic.legalFor({{S32, GlobalPtr}});
1303 
1304   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1305   // demarshalling
1306   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1307     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1308                 {S32, FlatPtr}, {S64, FlatPtr}})
1309     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1310                {S32, RegionPtr}, {S64, RegionPtr}});
1311   // TODO: Pointer types, any 32-bit or 64-bit vector
1312 
1313   // Condition should be s32 for scalar, s1 for vector.
1314   getActionDefinitionsBuilder(G_SELECT)
1315     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1316           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1317           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1318     .clampScalar(0, S16, S64)
1319     .scalarize(1)
1320     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1321     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1322     .clampMaxNumElements(0, S32, 2)
1323     .clampMaxNumElements(0, LocalPtr, 2)
1324     .clampMaxNumElements(0, PrivatePtr, 2)
1325     .scalarize(0)
1326     .widenScalarToNextPow2(0)
1327     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1328 
1329   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1330   // be more flexible with the shift amount type.
1331   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1332     .legalFor({{S32, S32}, {S64, S32}});
1333   if (ST.has16BitInsts()) {
1334     if (ST.hasVOP3PInsts()) {
1335       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1336             .clampMaxNumElements(0, S16, 2);
1337     } else
1338       Shifts.legalFor({{S16, S16}});
1339 
1340     // TODO: Support 16-bit shift amounts for all types
1341     Shifts.widenScalarIf(
1342       [=](const LegalityQuery &Query) {
1343         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1344         // 32-bit amount.
1345         const LLT ValTy = Query.Types[0];
1346         const LLT AmountTy = Query.Types[1];
1347         return ValTy.getSizeInBits() <= 16 &&
1348                AmountTy.getSizeInBits() < 16;
1349       }, changeTo(1, S16));
1350     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1351     Shifts.clampScalar(1, S32, S32);
1352     Shifts.clampScalar(0, S16, S64);
1353     Shifts.widenScalarToNextPow2(0, 16);
1354 
1355     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1356       .minScalar(0, S16)
1357       .scalarize(0)
1358       .lower();
1359   } else {
1360     // Make sure we legalize the shift amount type first, as the general
1361     // expansion for the shifted type will produce much worse code if it hasn't
1362     // been truncated already.
1363     Shifts.clampScalar(1, S32, S32);
1364     Shifts.clampScalar(0, S32, S64);
1365     Shifts.widenScalarToNextPow2(0, 32);
1366 
1367     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1368       .minScalar(0, S32)
1369       .scalarize(0)
1370       .lower();
1371   }
1372   Shifts.scalarize(0);
1373 
1374   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1375     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1376     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1377     unsigned IdxTypeIdx = 2;
1378 
1379     getActionDefinitionsBuilder(Op)
1380       .customIf([=](const LegalityQuery &Query) {
1381           const LLT EltTy = Query.Types[EltTypeIdx];
1382           const LLT VecTy = Query.Types[VecTypeIdx];
1383           const LLT IdxTy = Query.Types[IdxTypeIdx];
1384           const unsigned EltSize = EltTy.getSizeInBits();
1385           return (EltSize == 32 || EltSize == 64) &&
1386                   VecTy.getSizeInBits() % 32 == 0 &&
1387                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1388                   IdxTy.getSizeInBits() == 32;
1389         })
1390       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1391                  bitcastToVectorElement32(VecTypeIdx))
1392       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1393       .bitcastIf(
1394         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1395         [=](const LegalityQuery &Query) {
1396           // For > 64-bit element types, try to turn this into a 64-bit
1397           // element vector since we may be able to do better indexing
1398           // if this is scalar. If not, fall back to 32.
1399           const LLT EltTy = Query.Types[EltTypeIdx];
1400           const LLT VecTy = Query.Types[VecTypeIdx];
1401           const unsigned DstEltSize = EltTy.getSizeInBits();
1402           const unsigned VecSize = VecTy.getSizeInBits();
1403 
1404           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1405           return std::make_pair(
1406             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1407         })
1408       .clampScalar(EltTypeIdx, S32, S64)
1409       .clampScalar(VecTypeIdx, S32, S64)
1410       .clampScalar(IdxTypeIdx, S32, S32)
1411       .clampMaxNumElements(VecTypeIdx, S32, 32)
1412       // TODO: Clamp elements for 64-bit vectors?
1413       // It should only be necessary with variable indexes.
1414       // As a last resort, lower to the stack
1415       .lower();
1416   }
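  // Sketch of the bitcast path above: a G_EXTRACT_VECTOR_ELT from <8 x s16>
  // has sub-32-bit elements, so the source is bitcast to <4 x s32> and the
  // s16 piece is recovered from the selected 32-bit element. 32/64-bit
  // element cases take the custom path (legalizeExtractVectorElt /
  // legalizeInsertVectorElt below), which folds constant indices and leaves
  // dynamic ones for register indexing during selection.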
1417 
1418   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1419     .unsupportedIf([=](const LegalityQuery &Query) {
1420         const LLT &EltTy = Query.Types[1].getElementType();
1421         return Query.Types[0] != EltTy;
1422       });
1423 
1424   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1425     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1426     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1427 
1428     // FIXME: Doesn't handle extract of illegal sizes.
1429     getActionDefinitionsBuilder(Op)
1430       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1431       // FIXME: Multiples of 16 should not be legal.
1432       .legalIf([=](const LegalityQuery &Query) {
1433           const LLT BigTy = Query.Types[BigTyIdx];
1434           const LLT LitTy = Query.Types[LitTyIdx];
1435           return (BigTy.getSizeInBits() % 32 == 0) &&
1436                  (LitTy.getSizeInBits() % 16 == 0);
1437         })
1438       .widenScalarIf(
1439         [=](const LegalityQuery &Query) {
1440           const LLT BigTy = Query.Types[BigTyIdx];
1441           return (BigTy.getScalarSizeInBits() < 16);
1442         },
1443         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1444       .widenScalarIf(
1445         [=](const LegalityQuery &Query) {
1446           const LLT LitTy = Query.Types[LitTyIdx];
1447           return (LitTy.getScalarSizeInBits() < 16);
1448         },
1449         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1450       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1451       .widenScalarToNextPow2(BigTyIdx, 32);
1452 
1453   }
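  // Roughly, this keeps extracts/inserts of 16-bit multiples out of 32-bit
  // multiples legal (e.g. s32 from s64), while an s16 piece of an s32 source
  // is lowered to shift/truncate style code via the lowerIf rule above.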
1454 
1455   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1456     .legalForCartesianProduct(AllS32Vectors, {S32})
1457     .legalForCartesianProduct(AllS64Vectors, {S64})
1458     .clampNumElements(0, V16S32, V32S32)
1459     .clampNumElements(0, V2S64, V16S64)
1460     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1461 
1462   if (ST.hasScalarPackInsts()) {
1463     BuildVector
1464       // FIXME: Should probably widen s1 vectors straight to s32
1465       .minScalarOrElt(0, S16)
1466       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1467       .minScalar(1, S32);
1468 
1469     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1470       .legalFor({V2S16, S32})
1471       .lower();
1472     BuildVector.minScalarOrElt(0, S32);
1473   } else {
1474     BuildVector.customFor({V2S16, S16});
1475     BuildVector.minScalarOrElt(0, S32);
1476 
1477     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1478       .customFor({V2S16, S32})
1479       .lower();
1480   }
1481 
1482   BuildVector.legalIf(isRegisterType(0));
1483 
1484   // FIXME: Clamp maximum size
1485   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1486     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1487     .clampMaxNumElements(0, S32, 32)
1488     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1489     .clampMaxNumElements(0, S16, 64);
1490 
1491   // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
1492   // pre-legalize.
1493   if (ST.hasVOP3PInsts()) {
1494     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1495       .customFor({V2S16, V2S16})
1496       .lower();
1497   } else
1498     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1499 
1500   // Merge/Unmerge
1501   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1502     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1503     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1504 
1505     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1506       const LLT Ty = Query.Types[TypeIdx];
1507       if (Ty.isVector()) {
1508         const LLT &EltTy = Ty.getElementType();
1509         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1510           return true;
1511         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1512           return true;
1513       }
1514       return false;
1515     };
1516 
1517     auto &Builder = getActionDefinitionsBuilder(Op)
1518       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1519       .lowerFor({{S16, V2S16}})
1520       .lowerIf([=](const LegalityQuery &Query) {
1521           const LLT BigTy = Query.Types[BigTyIdx];
1522           return BigTy.getSizeInBits() == 32;
1523         })
1524       // Try to widen to s16 first for small types.
1525       // TODO: Only do this on targets with legal s16 shifts
1526       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1527       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1528       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1529       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1530                            elementTypeIs(1, S16)),
1531                        changeTo(1, V2S16))
1532       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1533       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1534       // valid.
1535       .clampScalar(LitTyIdx, S32, S512)
1536       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1537       // Break up vectors with weird elements into scalars
1538       .fewerElementsIf(
1539         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1540         scalarize(0))
1541       .fewerElementsIf(
1542         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1543         scalarize(1))
1544       .clampScalar(BigTyIdx, S32, MaxScalar);
1545 
1546     if (Op == G_MERGE_VALUES) {
1547       Builder.widenScalarIf(
1548         // TODO: Use 16-bit shifts if legal for 8-bit values?
1549         [=](const LegalityQuery &Query) {
1550           const LLT Ty = Query.Types[LitTyIdx];
1551           return Ty.getSizeInBits() < 32;
1552         },
1553         changeTo(LitTyIdx, S32));
1554     }
1555 
1556     Builder.widenScalarIf(
1557       [=](const LegalityQuery &Query) {
1558         const LLT Ty = Query.Types[BigTyIdx];
1559         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1560           Ty.getSizeInBits() % 16 != 0;
1561       },
1562       [=](const LegalityQuery &Query) {
1563         // Pick the next power of 2, or a multiple of 64 over 128.
1564         // Whichever is smaller.
1565         const LLT &Ty = Query.Types[BigTyIdx];
1566         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1567         if (NewSizeInBits >= 256) {
1568           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1569           if (RoundedTo < NewSizeInBits)
1570             NewSizeInBits = RoundedTo;
1571         }
1572         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1573       })
1574       // Any vectors left are the wrong size. Scalarize them.
1575       .scalarize(0)
1576       .scalarize(1);
1577   }
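  // Worked example of the widening rule above: an s88 big type is neither a
  // power of 2 nor a multiple of 16, so it is widened to the next power of 2
  // (s128). An s280 type would round up to 512, but since that is >= 256 the
  // next multiple of 64 (320) is smaller and is used instead.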
1578 
1579   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1580   // RegBankSelect.
1581   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1582     .legalFor({{S32}, {S64}});
1583 
1584   if (ST.hasVOP3PInsts()) {
1585     SextInReg.lowerFor({{V2S16}})
1586       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1587       // get more vector shift opportunities, since we'll get those when
1588       // expanded.
1589       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1590   } else if (ST.has16BitInsts()) {
1591     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1592   } else {
1593     // Prefer to promote to s32 before lowering if we don't have 16-bit
1594     // shifts. This avoids a lot of intermediate truncate and extend operations.
1595     SextInReg.lowerFor({{S32}, {S64}});
1596   }
1597 
1598   SextInReg
1599     .scalarize(0)
1600     .clampScalar(0, S32, S64)
1601     .lower();
1602 
1603   // TODO: Only try to form v2s16 with legal packed instructions.
1604   getActionDefinitionsBuilder(G_FSHR)
1605     .legalFor({{S32, S32}})
1606     .lowerFor({{V2S16, V2S16}})
1607     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1608     .scalarize(0)
1609     .lower();
1610 
1611   if (ST.hasVOP3PInsts()) {
1612     getActionDefinitionsBuilder(G_FSHL)
1613       .lowerFor({{V2S16, V2S16}})
1614       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1615       .scalarize(0)
1616       .lower();
1617   } else {
1618     getActionDefinitionsBuilder(G_FSHL)
1619       .scalarize(0)
1620       .lower();
1621   }
1622 
1623   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1624     .legalFor({S64});
1625 
1626   getActionDefinitionsBuilder(G_FENCE)
1627     .alwaysLegal();
1628 
1629   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1630       .scalarize(0)
1631       .minScalar(0, S32)
1632       .lower();
1633 
1634   getActionDefinitionsBuilder({
1635       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1636       G_FCOPYSIGN,
1637 
1638       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1639       G_ATOMICRMW_NAND,
1640       G_ATOMICRMW_FSUB,
1641       G_READ_REGISTER,
1642       G_WRITE_REGISTER,
1643 
1644       G_SADDO, G_SSUBO,
1645 
1646        // TODO: Implement
1647       G_FMINIMUM, G_FMAXIMUM}).lower();
1648 
1649   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1650         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1651         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1652     .unsupported();
1653 
1654   computeTables();
1655   verify(*ST.getInstrInfo());
1656 }
1657 
1658 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1659                                          MachineInstr &MI) const {
1660   MachineIRBuilder &B = Helper.MIRBuilder;
1661   MachineRegisterInfo &MRI = *B.getMRI();
1662 
1663   switch (MI.getOpcode()) {
1664   case TargetOpcode::G_ADDRSPACE_CAST:
1665     return legalizeAddrSpaceCast(MI, MRI, B);
1666   case TargetOpcode::G_FRINT:
1667     return legalizeFrint(MI, MRI, B);
1668   case TargetOpcode::G_FCEIL:
1669     return legalizeFceil(MI, MRI, B);
1670   case TargetOpcode::G_FREM:
1671     return legalizeFrem(MI, MRI, B);
1672   case TargetOpcode::G_INTRINSIC_TRUNC:
1673     return legalizeIntrinsicTrunc(MI, MRI, B);
1674   case TargetOpcode::G_SITOFP:
1675     return legalizeITOFP(MI, MRI, B, true);
1676   case TargetOpcode::G_UITOFP:
1677     return legalizeITOFP(MI, MRI, B, false);
1678   case TargetOpcode::G_FPTOSI:
1679     return legalizeFPTOI(MI, MRI, B, true);
1680   case TargetOpcode::G_FPTOUI:
1681     return legalizeFPTOI(MI, MRI, B, false);
1682   case TargetOpcode::G_FMINNUM:
1683   case TargetOpcode::G_FMAXNUM:
1684   case TargetOpcode::G_FMINNUM_IEEE:
1685   case TargetOpcode::G_FMAXNUM_IEEE:
1686     return legalizeMinNumMaxNum(Helper, MI);
1687   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1688     return legalizeExtractVectorElt(MI, MRI, B);
1689   case TargetOpcode::G_INSERT_VECTOR_ELT:
1690     return legalizeInsertVectorElt(MI, MRI, B);
1691   case TargetOpcode::G_SHUFFLE_VECTOR:
1692     return legalizeShuffleVector(MI, MRI, B);
1693   case TargetOpcode::G_FSIN:
1694   case TargetOpcode::G_FCOS:
1695     return legalizeSinCos(MI, MRI, B);
1696   case TargetOpcode::G_GLOBAL_VALUE:
1697     return legalizeGlobalValue(MI, MRI, B);
1698   case TargetOpcode::G_LOAD:
1699     return legalizeLoad(Helper, MI);
1700   case TargetOpcode::G_FMAD:
1701     return legalizeFMad(MI, MRI, B);
1702   case TargetOpcode::G_FDIV:
1703     return legalizeFDIV(MI, MRI, B);
1704   case TargetOpcode::G_UDIV:
1705   case TargetOpcode::G_UREM:
1706     return legalizeUDIV_UREM(MI, MRI, B);
1707   case TargetOpcode::G_SDIV:
1708   case TargetOpcode::G_SREM:
1709     return legalizeSDIV_SREM(MI, MRI, B);
1710   case TargetOpcode::G_ATOMIC_CMPXCHG:
1711     return legalizeAtomicCmpXChg(MI, MRI, B);
1712   case TargetOpcode::G_FLOG:
1713     return legalizeFlog(MI, B, numbers::ln2f);
1714   case TargetOpcode::G_FLOG10:
1715     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1716   case TargetOpcode::G_FEXP:
1717     return legalizeFExp(MI, B);
1718   case TargetOpcode::G_FPOW:
1719     return legalizeFPow(MI, B);
1720   case TargetOpcode::G_FFLOOR:
1721     return legalizeFFloor(MI, MRI, B);
1722   case TargetOpcode::G_BUILD_VECTOR:
1723     return legalizeBuildVector(MI, MRI, B);
1724   default:
1725     return false;
1726   }
1727 
1728   llvm_unreachable("expected switch to return");
1729 }
1730 
1731 Register AMDGPULegalizerInfo::getSegmentAperture(
1732   unsigned AS,
1733   MachineRegisterInfo &MRI,
1734   MachineIRBuilder &B) const {
1735   MachineFunction &MF = B.getMF();
1736   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1737   const LLT S32 = LLT::scalar(32);
1738 
1739   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1740 
1741   if (ST.hasApertureRegs()) {
1742     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1743     // getreg.
1744     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1745         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1746         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1747     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1748         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1749         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1750     unsigned Encoding =
1751         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1752         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1753         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1754 
1755     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1756 
1757     B.buildInstr(AMDGPU::S_GETREG_B32)
1758       .addDef(GetReg)
1759       .addImm(Encoding);
1760     MRI.setType(GetReg, S32);
1761 
1762     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1763     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1764   }
1765 
1766   Register QueuePtr = MRI.createGenericVirtualRegister(
1767     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1768 
1769   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1770     return Register();
1771 
1772   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1773   // private_segment_aperture_base_hi.
1774   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1775 
1776   // TODO: can we be smarter about machine pointer info?
1777   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1778   MachineMemOperand *MMO = MF.getMachineMemOperand(
1779       PtrInfo,
1780       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1781           MachineMemOperand::MOInvariant,
1782       4, commonAlignment(Align(64), StructOffset));
1783 
1784   Register LoadAddr;
1785 
1786   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1787   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1788 }
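// The value computed above is the 32-bit "aperture", i.e. the high half of
// the 64-bit base address of the LDS or private segment. legalizeAddrSpaceCast
// below merges it with a 32-bit segment offset to form a flat pointer.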
1789 
1790 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1791   MachineInstr &MI, MachineRegisterInfo &MRI,
1792   MachineIRBuilder &B) const {
1793   MachineFunction &MF = B.getMF();
1794 
1795   const LLT S32 = LLT::scalar(32);
1796   Register Dst = MI.getOperand(0).getReg();
1797   Register Src = MI.getOperand(1).getReg();
1798 
1799   LLT DstTy = MRI.getType(Dst);
1800   LLT SrcTy = MRI.getType(Src);
1801   unsigned DestAS = DstTy.getAddressSpace();
1802   unsigned SrcAS = SrcTy.getAddressSpace();
1803 
1804   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1805   // vector element.
1806   assert(!DstTy.isVector());
1807 
1808   const AMDGPUTargetMachine &TM
1809     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1810 
1811   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1812     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1813     return true;
1814   }
1815 
1816   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1817     // Truncate.
1818     B.buildExtract(Dst, Src, 0);
1819     MI.eraseFromParent();
1820     return true;
1821   }
1822 
1823   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1824     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1825     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1826 
1827     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1828     // another. Merge operands are required to be the same type, but creating an
1829     // extra ptrtoint would be kind of pointless.
1830     auto HighAddr = B.buildConstant(
1831       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1832     B.buildMerge(Dst, {Src, HighAddr});
1833     MI.eraseFromParent();
1834     return true;
1835   }
1836 
1837   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1838     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1839            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1840     unsigned NullVal = TM.getNullPointerValue(DestAS);
1841 
1842     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1843     auto FlatNull = B.buildConstant(SrcTy, 0);
1844 
1845     // Extract low 32-bits of the pointer.
1846     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1847 
1848     auto CmpRes =
1849         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1850     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1851 
1852     MI.eraseFromParent();
1853     return true;
1854   }
1855 
1856   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1857     return false;
1858 
1859   if (!ST.hasFlatAddressSpace())
1860     return false;
1861 
1862   auto SegmentNull =
1863       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1864   auto FlatNull =
1865       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1866 
1867   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1868   if (!ApertureReg.isValid())
1869     return false;
1870 
1871   auto CmpRes =
1872       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1873 
1874   // Coerce the type of the low half of the result so we can use merge_values.
1875   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1876 
1877   // TODO: Should we allow mismatched types but matching sizes in merges to
1878   // avoid the ptrtoint?
1879   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1880   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1881 
1882   MI.eraseFromParent();
1883   return true;
1884 }
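// In effect, the local/private -> flat direction above expands to roughly
//   flat = (src == segment_null) ? flat_null : merge(src_as_i32, aperture)
// while the flat -> local/private direction extracts the low 32 bits of the
// pointer, guarded by the analogous null check.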
1885 
1886 bool AMDGPULegalizerInfo::legalizeFrint(
1887   MachineInstr &MI, MachineRegisterInfo &MRI,
1888   MachineIRBuilder &B) const {
1889   Register Src = MI.getOperand(1).getReg();
1890   LLT Ty = MRI.getType(Src);
1891   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1892 
1893   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1894   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1895 
1896   auto C1 = B.buildFConstant(Ty, C1Val);
1897   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1898 
1899   // TODO: Should this propagate fast-math-flags?
1900   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1901   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1902 
1903   auto C2 = B.buildFConstant(Ty, C2Val);
1904   auto Fabs = B.buildFAbs(Ty, Src);
1905 
1906   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1907   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1908   MI.eraseFromParent();
1909   return true;
1910 }
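// This works because doubles have a 52-bit fraction: for |src| small enough,
// adding and then subtracting copysign(2^52, src) rounds src to an integer in
// the default round-to-nearest mode. Magnitudes above 0x1.fffffffffffffp+51
// are already integral, so the final select passes them through unchanged.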
1911 
1912 bool AMDGPULegalizerInfo::legalizeFceil(
1913   MachineInstr &MI, MachineRegisterInfo &MRI,
1914   MachineIRBuilder &B) const {
1915 
1916   const LLT S1 = LLT::scalar(1);
1917   const LLT S64 = LLT::scalar(64);
1918 
1919   Register Src = MI.getOperand(1).getReg();
1920   assert(MRI.getType(Src) == S64);
1921 
1922   // result = trunc(src)
1923   // if (src > 0.0 && src != result)
1924   //   result += 1.0
1925 
1926   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1927 
1928   const auto Zero = B.buildFConstant(S64, 0.0);
1929   const auto One = B.buildFConstant(S64, 1.0);
1930   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1931   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1932   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1933   auto Add = B.buildSelect(S64, And, One, Zero);
1934 
1935   // TODO: Should this propagate fast-math-flags?
1936   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1937   return true;
1938 }
1939 
1940 bool AMDGPULegalizerInfo::legalizeFrem(
1941   MachineInstr &MI, MachineRegisterInfo &MRI,
1942   MachineIRBuilder &B) const {
1943     Register DstReg = MI.getOperand(0).getReg();
1944     Register Src0Reg = MI.getOperand(1).getReg();
1945     Register Src1Reg = MI.getOperand(2).getReg();
1946     auto Flags = MI.getFlags();
1947     LLT Ty = MRI.getType(DstReg);
1948 
1949     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1950     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1951     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1952     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1953     MI.eraseFromParent();
1954     return true;
1955 }
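// i.e. frem(x, y) is expanded as x - trunc(x / y) * y, with the multiply and
// subtract folded into the single fma built above.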
1956 
1957 static MachineInstrBuilder extractF64Exponent(Register Hi,
1958                                               MachineIRBuilder &B) {
1959   const unsigned FractBits = 52;
1960   const unsigned ExpBits = 11;
1961   LLT S32 = LLT::scalar(32);
1962 
1963   auto Const0 = B.buildConstant(S32, FractBits - 32);
1964   auto Const1 = B.buildConstant(S32, ExpBits);
1965 
1966   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1967     .addUse(Hi)
1968     .addUse(Const0.getReg(0))
1969     .addUse(Const1.getReg(0));
1970 
1971   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1972 }
1973 
1974 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1975   MachineInstr &MI, MachineRegisterInfo &MRI,
1976   MachineIRBuilder &B) const {
1977   const LLT S1 = LLT::scalar(1);
1978   const LLT S32 = LLT::scalar(32);
1979   const LLT S64 = LLT::scalar(64);
1980 
1981   Register Src = MI.getOperand(1).getReg();
1982   assert(MRI.getType(Src) == S64);
1983 
1984   // TODO: Should this use extract since the low half is unused?
1985   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1986   Register Hi = Unmerge.getReg(1);
1987 
1988   // Extract the upper half, since this is where we will find the sign and
1989   // exponent.
1990   auto Exp = extractF64Exponent(Hi, B);
1991 
1992   const unsigned FractBits = 52;
1993 
1994   // Extract the sign bit.
1995   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1996   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1997 
1998   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1999 
2000   const auto Zero32 = B.buildConstant(S32, 0);
2001 
2002   // Extend back to 64-bits.
2003   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2004 
2005   auto Shr = B.buildAShr(S64, FractMask, Exp);
2006   auto Not = B.buildNot(S64, Shr);
2007   auto Tmp0 = B.buildAnd(S64, Src, Not);
2008   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2009 
2010   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2011   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2012 
2013   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2014   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2015   MI.eraseFromParent();
2016   return true;
2017 }
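// The expansion above clears the fractional mantissa bits directly: the
// exponent determines how many of the 52 fraction bits are fractional and
// those are masked off; exponents below 0 produce a signed zero, and
// exponents above 51 mean the value is already integral and is returned
// unchanged.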
2018 
2019 bool AMDGPULegalizerInfo::legalizeITOFP(
2020   MachineInstr &MI, MachineRegisterInfo &MRI,
2021   MachineIRBuilder &B, bool Signed) const {
2022 
2023   Register Dst = MI.getOperand(0).getReg();
2024   Register Src = MI.getOperand(1).getReg();
2025 
2026   const LLT S64 = LLT::scalar(64);
2027   const LLT S32 = LLT::scalar(32);
2028 
2029   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2030 
2031   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2032 
2033   auto CvtHi = Signed ?
2034     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2035     B.buildUITOFP(S64, Unmerge.getReg(1));
2036 
2037   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2038 
2039   auto ThirtyTwo = B.buildConstant(S32, 32);
2040   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2041     .addUse(CvtHi.getReg(0))
2042     .addUse(ThirtyTwo.getReg(0));
2043 
2044   // TODO: Should this propagate fast-math-flags?
2045   B.buildFAdd(Dst, LdExp, CvtLo);
2046   MI.eraseFromParent();
2047   return true;
2048 }
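// i.e. the 64-bit integer is converted in two 32-bit halves:
//   result = fp(hi) * 2^32 + fp(lo)
// with the 2^32 scaling done through the amdgcn.ldexp intrinsic and the low
// half always converted as unsigned.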
2049 
2050 // TODO: Copied from DAG implementation. Verify logic and document how this
2051 // actually works.
2052 bool AMDGPULegalizerInfo::legalizeFPTOI(
2053   MachineInstr &MI, MachineRegisterInfo &MRI,
2054   MachineIRBuilder &B, bool Signed) const {
2055 
2056   Register Dst = MI.getOperand(0).getReg();
2057   Register Src = MI.getOperand(1).getReg();
2058 
2059   const LLT S64 = LLT::scalar(64);
2060   const LLT S32 = LLT::scalar(32);
2061 
2062   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2063 
2064   unsigned Flags = MI.getFlags();
2065 
2066   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2067   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2068   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2069 
2070   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2071   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2072   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2073 
2074   auto Hi = Signed ?
2075     B.buildFPTOSI(S32, FloorMul) :
2076     B.buildFPTOUI(S32, FloorMul);
2077   auto Lo = B.buildFPTOUI(S32, Fma);
2078 
2079   B.buildMerge(Dst, { Lo, Hi });
2080   MI.eraseFromParent();
2081 
2082   return true;
2083 }
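// K0 is 2^-32 and K1 is -2^32: the truncated value is split as
//   hi = floor(trunc * 2^-32), lo = fma(hi, -2^32, trunc)
// and each half is converted to a 32-bit integer separately, with only the
// high half honoring the signedness of the conversion.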
2084 
2085 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2086                                                MachineInstr &MI) const {
2087   MachineFunction &MF = Helper.MIRBuilder.getMF();
2088   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2089 
2090   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2091                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2092 
2093   // With ieee_mode disabled, the instructions have the correct behavior
2094   // already for G_FMINNUM/G_FMAXNUM
2095   if (!MFI->getMode().IEEE)
2096     return !IsIEEEOp;
2097 
2098   if (IsIEEEOp)
2099     return true;
2100 
2101   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2102 }
2103 
2104 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2105   MachineInstr &MI, MachineRegisterInfo &MRI,
2106   MachineIRBuilder &B) const {
2107   // TODO: Should move some of this into LegalizerHelper.
2108 
2109   // TODO: Promote dynamic indexing of s16 to s32
2110 
2111   // FIXME: Artifact combiner probably should have replaced the truncated
2112   // constant before this, so we shouldn't need
2113   // getConstantVRegValWithLookThrough.
2114   Optional<ValueAndVReg> MaybeIdxVal =
2115       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2116   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2117     return true;
2118   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2119 
2120   Register Dst = MI.getOperand(0).getReg();
2121   Register Vec = MI.getOperand(1).getReg();
2122 
2123   LLT VecTy = MRI.getType(Vec);
2124   LLT EltTy = VecTy.getElementType();
2125   assert(EltTy == MRI.getType(Dst));
2126 
2127   if (IdxVal < VecTy.getNumElements())
2128     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2129   else
2130     B.buildUndef(Dst);
2131 
2132   MI.eraseFromParent();
2133   return true;
2134 }
2135 
2136 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2137   MachineInstr &MI, MachineRegisterInfo &MRI,
2138   MachineIRBuilder &B) const {
2139   // TODO: Should move some of this into LegalizerHelper.
2140 
2141   // TODO: Promote dynamic indexing of s16 to s32
2142 
2143   // FIXME: Artifact combiner probably should have replaced the truncated
2144   // constant before this, so we shouldn't need
2145   // getConstantVRegValWithLookThrough.
2146   Optional<ValueAndVReg> MaybeIdxVal =
2147       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2148   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2149     return true;
2150 
2151   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2152   Register Dst = MI.getOperand(0).getReg();
2153   Register Vec = MI.getOperand(1).getReg();
2154   Register Ins = MI.getOperand(2).getReg();
2155 
2156   LLT VecTy = MRI.getType(Vec);
2157   LLT EltTy = VecTy.getElementType();
2158   assert(EltTy == MRI.getType(Ins));
2159 
2160   if (IdxVal < VecTy.getNumElements())
2161     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2162   else
2163     B.buildUndef(Dst);
2164 
2165   MI.eraseFromParent();
2166   return true;
2167 }
2168 
2169 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2170   MachineInstr &MI, MachineRegisterInfo &MRI,
2171   MachineIRBuilder &B) const {
2172   const LLT V2S16 = LLT::vector(2, 16);
2173 
2174   Register Dst = MI.getOperand(0).getReg();
2175   Register Src0 = MI.getOperand(1).getReg();
2176   LLT DstTy = MRI.getType(Dst);
2177   LLT SrcTy = MRI.getType(Src0);
2178 
2179   if (SrcTy == V2S16 && DstTy == V2S16 &&
2180       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2181     return true;
2182 
2183   MachineIRBuilder HelperBuilder(MI);
2184   GISelObserverWrapper DummyObserver;
2185   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2186   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::legalizeSinCos(
2190   MachineInstr &MI, MachineRegisterInfo &MRI,
2191   MachineIRBuilder &B) const {
2192 
2193   Register DstReg = MI.getOperand(0).getReg();
2194   Register SrcReg = MI.getOperand(1).getReg();
2195   LLT Ty = MRI.getType(DstReg);
2196   unsigned Flags = MI.getFlags();
2197 
2198   Register TrigVal;
2199   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2200   if (ST.hasTrigReducedRange()) {
2201     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2202     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2203       .addUse(MulVal.getReg(0))
2204       .setMIFlags(Flags).getReg(0);
2205   } else
2206     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2207 
2208   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2209     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2210   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2211     .addUse(TrigVal)
2212     .setMIFlags(Flags);
2213   MI.eraseFromParent();
2214   return true;
2215 }
2216 
2217 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2218                                                   MachineIRBuilder &B,
2219                                                   const GlobalValue *GV,
2220                                                   int64_t Offset,
2221                                                   unsigned GAFlags) const {
2222   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2223   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2224   // to the following code sequence:
2225   //
2226   // For constant address space:
2227   //   s_getpc_b64 s[0:1]
2228   //   s_add_u32 s0, s0, $symbol
2229   //   s_addc_u32 s1, s1, 0
2230   //
2231   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2232   //   a fixup or relocation is emitted to replace $symbol with a literal
2233   //   constant, which is a pc-relative offset from the encoding of the $symbol
2234   //   operand to the global variable.
2235   //
2236   // For global address space:
2237   //   s_getpc_b64 s[0:1]
2238   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2239   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2240   //
2241   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2242   //   fixups or relocations are emitted to replace $symbol@*@lo and
2243   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2244   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2245   //   operand to the global variable.
2246   //
2247   // What we want here is an offset from the value returned by s_getpc
2248   // (which is the address of the s_add_u32 instruction) to the global
2249   // variable, but since the encoding of $symbol starts 4 bytes after the start
2250   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2251   // small. This requires us to add 4 to the global variable offset in order to
2252   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2253   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2254   // instruction.
2255 
2256   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2257 
2258   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2259     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2260 
2261   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2262     .addDef(PCReg);
2263 
2264   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2265   if (GAFlags == SIInstrInfo::MO_NONE)
2266     MIB.addImm(0);
2267   else
2268     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2269 
2270   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2271 
2272   if (PtrTy.getSizeInBits() == 32)
2273     B.buildExtract(DstReg, PCReg, 0);
2274   return true;
2275  }
2276 
2277 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2278   MachineInstr &MI, MachineRegisterInfo &MRI,
2279   MachineIRBuilder &B) const {
2280   Register DstReg = MI.getOperand(0).getReg();
2281   LLT Ty = MRI.getType(DstReg);
2282   unsigned AS = Ty.getAddressSpace();
2283 
2284   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2285   MachineFunction &MF = B.getMF();
2286   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2287 
2288   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2289     if (!MFI->isModuleEntryFunction() &&
2290         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2291       const Function &Fn = MF.getFunction();
2292       DiagnosticInfoUnsupported BadLDSDecl(
2293         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2294         DS_Warning);
2295       Fn.getContext().diagnose(BadLDSDecl);
2296 
2297       // We currently don't have a way to correctly allocate LDS objects that
2298       // aren't directly associated with a kernel. We do force inlining of
2299       // functions that use local objects. However, if these dead functions are
2300       // not eliminated, we don't want a compile time error. Just emit a warning
2301       // and a trap, since there should be no callable path here.
2302       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2303       B.buildUndef(DstReg);
2304       MI.eraseFromParent();
2305       return true;
2306     }
2307 
2308     // TODO: We could emit code to handle the initialization somewhere.
2309     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2310       const SITargetLowering *TLI = ST.getTargetLowering();
2311       if (!TLI->shouldUseLDSConstAddress(GV)) {
2312         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2313         return true; // Leave in place;
2314       }
2315 
2316       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2317         Type *Ty = GV->getValueType();
2318         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2319         // zero-sized type in other languages to declare the dynamic shared
2320         // memory whose size is not known at compile time. They will be
2321         // allocated by the runtime and placed directly after the statically
2322         // allocated ones. They all share the same offset.
2323         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2324           // Adjust alignment for that dynamic shared memory array.
2325           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2326           LLT S32 = LLT::scalar(32);
2327           auto Sz =
2328               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2329           B.buildIntToPtr(DstReg, Sz);
2330           MI.eraseFromParent();
2331           return true;
2332         }
2333       }
2334 
2335       B.buildConstant(
2336           DstReg,
2337           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2338       MI.eraseFromParent();
2339       return true;
2340     }
2341 
2342     const Function &Fn = MF.getFunction();
2343     DiagnosticInfoUnsupported BadInit(
2344       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2345     Fn.getContext().diagnose(BadInit);
2346     return true;
2347   }
2348 
2349   const SITargetLowering *TLI = ST.getTargetLowering();
2350 
2351   if (TLI->shouldEmitFixup(GV)) {
2352     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2353     MI.eraseFromParent();
2354     return true;
2355   }
2356 
2357   if (TLI->shouldEmitPCReloc(GV)) {
2358     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2359     MI.eraseFromParent();
2360     return true;
2361   }
2362 
2363   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2364   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2365 
2366   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2367       MachinePointerInfo::getGOT(MF),
2368       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2369           MachineMemOperand::MOInvariant,
2370       8 /*Size*/, Align(8));
2371 
2372   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2373 
2374   if (Ty.getSizeInBits() == 32) {
2375     // Truncate if this is a 32-bit constant address.
2376     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2377     B.buildExtract(DstReg, Load, 0);
2378   } else
2379     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2380 
2381   MI.eraseFromParent();
2382   return true;
2383 }
2384 
2385 static LLT widenToNextPowerOf2(LLT Ty) {
2386   if (Ty.isVector())
2387     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2388   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2389 }
2390 
2391 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2392                                        MachineInstr &MI) const {
2393   MachineIRBuilder &B = Helper.MIRBuilder;
2394   MachineRegisterInfo &MRI = *B.getMRI();
2395   GISelChangeObserver &Observer = Helper.Observer;
2396 
2397   Register PtrReg = MI.getOperand(1).getReg();
2398   LLT PtrTy = MRI.getType(PtrReg);
2399   unsigned AddrSpace = PtrTy.getAddressSpace();
2400 
2401   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2402     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2403     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2404     Observer.changingInstr(MI);
2405     MI.getOperand(1).setReg(Cast.getReg(0));
2406     Observer.changedInstr(MI);
2407     return true;
2408   }
2409 
2410   Register ValReg = MI.getOperand(0).getReg();
2411   LLT ValTy = MRI.getType(ValReg);
2412 
2413   MachineMemOperand *MMO = *MI.memoperands_begin();
2414   const unsigned ValSize = ValTy.getSizeInBits();
2415   const unsigned MemSize = 8 * MMO->getSize();
2416   const Align MemAlign = MMO->getAlign();
2417   const unsigned AlignInBits = 8 * MemAlign.value();
2418 
2419   // Widen non-power-of-2 loads to the alignment if needed
2420   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2421     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2422 
2423     // This was already the correct extending load result type, so just adjust
2424     // the memory type.
2425     if (WideMemSize == ValSize) {
2426       MachineFunction &MF = B.getMF();
2427 
2428       MachineMemOperand *WideMMO =
2429           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2430       Observer.changingInstr(MI);
2431       MI.setMemRefs(MF, {WideMMO});
2432       Observer.changedInstr(MI);
2433       return true;
2434     }
2435 
2436     // Don't bother handling an edge case that should probably never be produced.
2437     if (ValSize > WideMemSize)
2438       return false;
2439 
2440     LLT WideTy = widenToNextPowerOf2(ValTy);
2441 
2442     Register WideLoad;
2443     if (!WideTy.isVector()) {
2444       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2445       B.buildTrunc(ValReg, WideLoad).getReg(0);
2446     } else {
2447       // Extract the subvector.
2448 
2449       if (isRegisterType(ValTy)) {
2450         // If this a case where G_EXTRACT is legal, use it.
2451         // (e.g. <3 x s32> -> <4 x s32>)
2452         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2453         B.buildExtract(ValReg, WideLoad, 0);
2454       } else {
2455         // For cases where the widened type isn't a nice register value, unmerge
2456         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2457         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2458         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2459         B.setInsertPt(B.getMBB(), MI.getIterator());
2460         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2461       }
2462     }
2463 
2464     MI.eraseFromParent();
2465     return true;
2466   }
2467 
2468   return false;
2469 }
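// For example, a sufficiently aligned <3 x s32> load is widened here to a
// <4 x s32> load followed by a G_EXTRACT of the low 96 bits, while a type
// like <3 x s16> is widened to <4 x s16> and split back with an unmerge.
// Whether widening happens at all is decided by shouldWidenLoad above.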
2470 
2471 bool AMDGPULegalizerInfo::legalizeFMad(
2472   MachineInstr &MI, MachineRegisterInfo &MRI,
2473   MachineIRBuilder &B) const {
2474   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2475   assert(Ty.isScalar());
2476 
2477   MachineFunction &MF = B.getMF();
2478   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2479 
2480   // TODO: Always legal with future ftz flag.
2481   // FIXME: Do we need just output?
2482   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2483     return true;
2484   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2485     return true;
2486 
2487   MachineIRBuilder HelperBuilder(MI);
2488   GISelObserverWrapper DummyObserver;
2489   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2490   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2491 }
2492 
2493 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2494   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2495   Register DstReg = MI.getOperand(0).getReg();
2496   Register PtrReg = MI.getOperand(1).getReg();
2497   Register CmpVal = MI.getOperand(2).getReg();
2498   Register NewVal = MI.getOperand(3).getReg();
2499 
2500   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2501          "this should not have been custom lowered");
2502 
2503   LLT ValTy = MRI.getType(CmpVal);
2504   LLT VecTy = LLT::vector(2, ValTy);
2505 
2506   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2507 
2508   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2509     .addDef(DstReg)
2510     .addUse(PtrReg)
2511     .addUse(PackedVal)
2512     .setMemRefs(MI.memoperands());
2513 
2514   MI.eraseFromParent();
2515   return true;
2516 }
2517 
2518 bool AMDGPULegalizerInfo::legalizeFlog(
2519   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2520   Register Dst = MI.getOperand(0).getReg();
2521   Register Src = MI.getOperand(1).getReg();
2522   LLT Ty = B.getMRI()->getType(Dst);
2523   unsigned Flags = MI.getFlags();
2524 
2525   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2526   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2527 
2528   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2529   MI.eraseFromParent();
2530   return true;
2531 }
2532 
2533 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2534                                        MachineIRBuilder &B) const {
2535   Register Dst = MI.getOperand(0).getReg();
2536   Register Src = MI.getOperand(1).getReg();
2537   unsigned Flags = MI.getFlags();
2538   LLT Ty = B.getMRI()->getType(Dst);
2539 
2540   auto K = B.buildFConstant(Ty, numbers::log2e);
2541   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2542   B.buildFExp2(Dst, Mul, Flags);
2543   MI.eraseFromParent();
2544   return true;
2545 }
2546 
2547 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2548                                        MachineIRBuilder &B) const {
2549   Register Dst = MI.getOperand(0).getReg();
2550   Register Src0 = MI.getOperand(1).getReg();
2551   Register Src1 = MI.getOperand(2).getReg();
2552   unsigned Flags = MI.getFlags();
2553   LLT Ty = B.getMRI()->getType(Dst);
2554   const LLT S16 = LLT::scalar(16);
2555   const LLT S32 = LLT::scalar(32);
2556 
2557   if (Ty == S32) {
2558     auto Log = B.buildFLog2(S32, Src0, Flags);
2559     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2560       .addUse(Log.getReg(0))
2561       .addUse(Src1)
2562       .setMIFlags(Flags);
2563     B.buildFExp2(Dst, Mul, Flags);
2564   } else if (Ty == S16) {
2565     // There's no f16 fmul_legacy, so we need to convert for it.
2566     auto Log = B.buildFLog2(S16, Src0, Flags);
2567     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2568     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2569     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2570       .addUse(Ext0.getReg(0))
2571       .addUse(Ext1.getReg(0))
2572       .setMIFlags(Flags);
2573 
2574     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2575   } else
2576     return false;
2577 
2578   MI.eraseFromParent();
2579   return true;
2580 }
2581 
2582 // Find a source register, ignoring any possible source modifiers.
2583 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2584   Register ModSrc = OrigSrc;
2585   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2586     ModSrc = SrcFNeg->getOperand(1).getReg();
2587     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2588       ModSrc = SrcFAbs->getOperand(1).getReg();
2589   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2590     ModSrc = SrcFAbs->getOperand(1).getReg();
2591   return ModSrc;
2592 }
2593 
2594 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2595                                          MachineRegisterInfo &MRI,
2596                                          MachineIRBuilder &B) const {
2597 
2598   const LLT S1 = LLT::scalar(1);
2599   const LLT S64 = LLT::scalar(64);
2600   Register Dst = MI.getOperand(0).getReg();
2601   Register OrigSrc = MI.getOperand(1).getReg();
2602   unsigned Flags = MI.getFlags();
2603   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2604          "this should not have been custom lowered");
2605 
2606   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2607   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2608   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2609   // V_FRACT bug is:
2610   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2611   //
2612   // Convert floor(x) to (x - fract(x))
2613 
2614   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2615     .addUse(OrigSrc)
2616     .setMIFlags(Flags);
2617 
2618   // Give source modifier matching some assistance before obscuring a foldable
2619   // pattern.
2620 
2621   // TODO: We can avoid the neg on the fract? The input sign to fract
2622   // shouldn't matter?
2623   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2624 
2625   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2626 
2627   Register Min = MRI.createGenericVirtualRegister(S64);
2628 
2629   // We don't need to concern ourselves with the snan handling difference, so
2630   // use the one which will directly select.
2631   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2632   if (MFI->getMode().IEEE)
2633     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2634   else
2635     B.buildFMinNum(Min, Fract, Const, Flags);
2636 
2637   Register CorrectedFract = Min;
2638   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2639     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2640     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2641   }
2642 
2643   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2644   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2645 
2646   MI.eraseFromParent();
2647   return true;
2648 }
2649 
2650 // Turn an illegal packed v2s16 build vector into bit operations.
2651 // TODO: This should probably be a bitcast action in LegalizerHelper.
2652 bool AMDGPULegalizerInfo::legalizeBuildVector(
2653   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2654   Register Dst = MI.getOperand(0).getReg();
2655   const LLT S32 = LLT::scalar(32);
2656   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2657 
2658   Register Src0 = MI.getOperand(1).getReg();
2659   Register Src1 = MI.getOperand(2).getReg();
2660   assert(MRI.getType(Src0) == LLT::scalar(16));
2661 
2662   auto Merge = B.buildMerge(S32, {Src0, Src1});
2663   B.buildBitcast(Dst, Merge);
2664 
2665   MI.eraseFromParent();
2666   return true;
2667 }
2668 
2669 // Check that this is a G_XOR x, -1
2670 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2671   if (MI.getOpcode() != TargetOpcode::G_XOR)
2672     return false;
2673   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2674   return ConstVal && *ConstVal == -1;
2675 }
2676 
2677 // Return the use branch instruction, otherwise null if the usage is invalid.
2678 static MachineInstr *
2679 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2680                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2681   Register CondDef = MI.getOperand(0).getReg();
2682   if (!MRI.hasOneNonDBGUse(CondDef))
2683     return nullptr;
2684 
2685   MachineBasicBlock *Parent = MI.getParent();
2686   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2687 
2688   if (isNot(MRI, *UseMI)) {
2689     Register NegatedCond = UseMI->getOperand(0).getReg();
2690     if (!MRI.hasOneNonDBGUse(NegatedCond))
2691       return nullptr;
2692 
2693     // We're deleting the def of this value, so we need to remove it.
2694     UseMI->eraseFromParent();
2695 
2696     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2697     Negated = true;
2698   }
2699 
2700   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2701     return nullptr;
2702 
2703   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2704   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2705   if (Next == Parent->end()) {
2706     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2707     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2708       return nullptr;
2709     UncondBrTarget = &*NextMBB;
2710   } else {
2711     if (Next->getOpcode() != AMDGPU::G_BR)
2712       return nullptr;
2713     Br = &*Next;
2714     UncondBrTarget = Br->getOperand(0).getMBB();
2715   }
2716 
2717   return UseMI;
2718 }
2719 
2720 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2721                                          const ArgDescriptor *Arg,
2722                                          const TargetRegisterClass *ArgRC,
2723                                          LLT ArgTy) const {
2724   MCRegister SrcReg = Arg->getRegister();
2725   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2726   assert(DstReg.isVirtual() && "Virtual register expected");
2727 
2728   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2729                                              ArgTy);
2730   if (Arg->isMasked()) {
2731     // TODO: Should we try to emit this once in the entry block?
2732     const LLT S32 = LLT::scalar(32);
2733     const unsigned Mask = Arg->getMask();
2734     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2735 
2736     Register AndMaskSrc = LiveIn;
2737 
2738     if (Shift != 0) {
2739       auto ShiftAmt = B.buildConstant(S32, Shift);
2740       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2741     }
2742 
2743     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2744   } else {
2745     B.buildCopy(DstReg, LiveIn);
2746   }
2747 
2748   return true;
2749 }
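// A masked argument describes several small values packed into one input
// register (e.g. the packed workitem IDs); the shift/and above extracts just
// the component named by the ArgDescriptor instead of copying the whole
// register.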
2750 
2751 bool AMDGPULegalizerInfo::loadInputValue(
2752     Register DstReg, MachineIRBuilder &B,
2753     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2754   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2755   const ArgDescriptor *Arg;
2756   const TargetRegisterClass *ArgRC;
2757   LLT ArgTy;
2758   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2759 
2760   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2761     return false; // TODO: Handle these
2762   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2763 }
2764 
2765 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2766     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2767     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2768   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2769     return false;
2770 
2771   MI.eraseFromParent();
2772   return true;
2773 }
2774 
2775 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2776                                        MachineRegisterInfo &MRI,
2777                                        MachineIRBuilder &B) const {
2778   Register Dst = MI.getOperand(0).getReg();
2779   LLT DstTy = MRI.getType(Dst);
2780   LLT S16 = LLT::scalar(16);
2781   LLT S32 = LLT::scalar(32);
2782   LLT S64 = LLT::scalar(64);
2783 
2784   if (DstTy == S16)
2785     return legalizeFDIV16(MI, MRI, B);
2786   if (DstTy == S32)
2787     return legalizeFDIV32(MI, MRI, B);
2788   if (DstTy == S64)
2789     return legalizeFDIV64(MI, MRI, B);
2790 
2791   return false;
2792 }
2793 
2794 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2795                                                   Register DstReg,
2796                                                   Register X,
2797                                                   Register Y,
2798                                                   bool IsDiv) const {
2799   const LLT S1 = LLT::scalar(1);
2800   const LLT S32 = LLT::scalar(32);
2801 
2802   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2803   // algorithm used here.
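  //
  // For intuition, a scalar model of the expansion built below (illustrative
  // only: umulhi() stands in for the high 32 bits of a 64-bit product and is
  // not a helper defined in this file; 1.0f / (float)y models the hardware
  // rcp estimate; 0x1.fffffcp+31f is the same constant as
  // BitsToFloat(0x4f7ffffe)):
  //
  //   uint32_t udiv32(uint32_t x, uint32_t y) {
  //     uint32_t z = (uint32_t)(0x1.fffffcp+31f * (1.0f / (float)y));
  //     z += umulhi(z, 0u - y * z);   // one Newton-Raphson refinement
  //     uint32_t q = umulhi(x, z);    // quotient estimate
  //     uint32_t r = x - q * y;       // remainder estimate
  //     if (r >= y) { ++q; r -= y; }  // first correction
  //     if (r >= y) { ++q; r -= y; }  // second correction
  //     return q;                     // (UREM returns r instead)
  //   }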
2804 
2805   // Initial estimate of inv(y).
2806   auto FloatY = B.buildUITOFP(S32, Y);
2807   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2808   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2809   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2810   auto Z = B.buildFPTOUI(S32, ScaledY);
2811 
2812   // One round of UNR (unsigned Newton-Raphson) refinement.
2813   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2814   auto NegYZ = B.buildMul(S32, NegY, Z);
2815   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2816 
2817   // Quotient/remainder estimate.
2818   auto Q = B.buildUMulH(S32, X, Z);
2819   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2820 
2821   // First quotient/remainder refinement.
2822   auto One = B.buildConstant(S32, 1);
2823   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2824   if (IsDiv)
2825     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2826   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2827 
2828   // Second quotient/remainder refinement.
2829   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2830   if (IsDiv)
2831     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2832   else
2833     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2834 }
2835 
2836 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2837 //
2838 // Return lo, hi of result
2839 //
2840 // %cvt.lo = G_UITOFP Val.lo
2841 // %cvt.hi = G_UITOFP Val.hi
2842 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2843 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2844 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2845 // %mul2 = G_FMUL %mul1, 2**(-32)
2846 // %trunc = G_INTRINSIC_TRUNC %mul2
2847 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2848 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
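//
// The float constants used in the sequence, read as IEEE-754 single-precision
// bit patterns: 0x4f800000 = 2^32, 0x5f7ffffc is a few ULPs below 2^64,
// 0x2f800000 = 2^-32, 0xcf800000 = -2^32.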
2849 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2850                                                        Register Val) {
2851   const LLT S32 = LLT::scalar(32);
2852   auto Unmerge = B.buildUnmerge(S32, Val);
2853 
2854   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2855   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2856 
2857   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2858                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2859 
2860   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2861   auto Mul1 =
2862       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2863 
2864   // 2**(-32)
2865   auto Mul2 =
2866       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2867   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2868 
2869   // -(2**32)
2870   auto Mad2 = B.buildFMAD(S32, Trunc,
2871                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2872 
2873   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2874   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2875 
2876   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2877 }
2878 
2879 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2880                                                   Register DstReg,
2881                                                   Register Numer,
2882                                                   Register Denom,
2883                                                   bool IsDiv) const {
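  // Same reciprocal-based expansion as the 32-bit case, but on a 64-bit
  // fixed-point estimate of 2^64/Denom: two multiply-high refinement steps, a
  // quotient estimate from the high half of Numer * Rcp, then at most two
  // conditional corrections by Denom.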
2884   const LLT S32 = LLT::scalar(32);
2885   const LLT S64 = LLT::scalar(64);
2886   const LLT S1 = LLT::scalar(1);
2887   Register RcpLo, RcpHi;
2888 
2889   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2890 
2891   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2892 
2893   auto Zero64 = B.buildConstant(S64, 0);
2894   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2895 
2896   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2897   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2898 
2899   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2900   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2901   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2902 
2903   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2904   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2905   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2906   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2907 
2908   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2909   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2910   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2911   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2912   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2913 
2914   auto Zero32 = B.buildConstant(S32, 0);
2915   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2916   auto Add2_HiC =
2917       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2918   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2919   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2920 
2921   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2922   Register NumerLo = UnmergeNumer.getReg(0);
2923   Register NumerHi = UnmergeNumer.getReg(1);
2924 
2925   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2926   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2927   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2928   Register Mul3_Lo = UnmergeMul3.getReg(0);
2929   Register Mul3_Hi = UnmergeMul3.getReg(1);
2930   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2931   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2932   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2933   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2934 
2935   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2936   Register DenomLo = UnmergeDenom.getReg(0);
2937   Register DenomHi = UnmergeDenom.getReg(1);
2938 
2939   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2940   auto C1 = B.buildSExt(S32, CmpHi);
2941 
2942   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2943   auto C2 = B.buildSExt(S32, CmpLo);
2944 
2945   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2946   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2947 
2948   // TODO: Here and below portions of the code can be enclosed into if/endif.
2949   // Currently control flow is unconditional and we have 4 selects after
2950   // the potential endif to substitute for PHIs.
2951 
2952   // if C3 != 0 ...
2953   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2954   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2955   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2956   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2957 
2958   auto One64 = B.buildConstant(S64, 1);
2959   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2960 
2961   auto C4 =
2962       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2963   auto C5 =
2964       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2965   auto C6 = B.buildSelect(
2966       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2967 
2968   // if (C6 != 0)
2969   auto Add4 = B.buildAdd(S64, Add3, One64);
2970   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2971 
2972   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2973   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2974   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2975 
2976   // endif C6
2977   // endif C3
2978 
2979   if (IsDiv) {
2980     auto Sel1 = B.buildSelect(
2981         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2982     B.buildSelect(DstReg,
2983                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2984   } else {
2985     auto Sel2 = B.buildSelect(
2986         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2987     B.buildSelect(DstReg,
2988                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2989   }
2990 }
2991 
2992 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2993                                             MachineRegisterInfo &MRI,
2994                                             MachineIRBuilder &B) const {
2995   const LLT S64 = LLT::scalar(64);
2996   const LLT S32 = LLT::scalar(32);
2997   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2998   Register DstReg = MI.getOperand(0).getReg();
2999   Register Num = MI.getOperand(1).getReg();
3000   Register Den = MI.getOperand(2).getReg();
3001   LLT Ty = MRI.getType(DstReg);
3002 
3003   if (Ty == S32)
3004     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3005   else if (Ty == S64)
3006     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3007   else
3008     return false;
3009 
3010   MI.eraseFromParent();
3011   return true;
3012 
3013 }
3014 
3015 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3016                                             MachineRegisterInfo &MRI,
3017                                             MachineIRBuilder &B) const {
3018   const LLT S64 = LLT::scalar(64);
3019   const LLT S32 = LLT::scalar(32);
3020 
3021   Register DstReg = MI.getOperand(0).getReg();
3022   const LLT Ty = MRI.getType(DstReg);
3023   if (Ty != S32 && Ty != S64)
3024     return false;
3025 
3026   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3027 
3028   Register LHS = MI.getOperand(1).getReg();
3029   Register RHS = MI.getOperand(2).getReg();
3030 
3031   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3032   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3033   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3034 
3035   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3036   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3037 
3038   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3039   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3040 
3041   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3042   if (Ty == S32)
3043     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3044   else
3045     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3046 
3047   Register Sign;
3048   if (IsDiv)
3049     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3050   else
3051     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3052 
3053   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3054   B.buildSub(DstReg, UDivRem, Sign);
3055 
3056   MI.eraseFromParent();
3057   return true;
3058 }
3059 
3060 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3061                                                  MachineRegisterInfo &MRI,
3062                                                  MachineIRBuilder &B) const {
3063   Register Res = MI.getOperand(0).getReg();
3064   Register LHS = MI.getOperand(1).getReg();
3065   Register RHS = MI.getOperand(2).getReg();
3066   uint16_t Flags = MI.getFlags();
3067   LLT ResTy = MRI.getType(Res);
3068 
3069   const MachineFunction &MF = B.getMF();
3070   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3071                             MI.getFlag(MachineInstr::FmAfn);
3072 
3073   if (!AllowInaccurateRcp)
3074     return false;
3075 
3076   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3077     // 1 / x -> RCP(x)
3078     if (CLHS->isExactlyValue(1.0)) {
3079       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3080         .addUse(RHS)
3081         .setMIFlags(Flags);
3082 
3083       MI.eraseFromParent();
3084       return true;
3085     }
3086 
3087     // -1 / x -> RCP( FNEG(x) )
3088     if (CLHS->isExactlyValue(-1.0)) {
3089       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3090       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3091         .addUse(FNeg.getReg(0))
3092         .setMIFlags(Flags);
3093 
3094       MI.eraseFromParent();
3095       return true;
3096     }
3097   }
3098 
3099   // x / y -> x * (1.0 / y)
3100   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3101     .addUse(RHS)
3102     .setMIFlags(Flags);
3103   B.buildFMul(Res, LHS, RCP, Flags);
3104 
3105   MI.eraseFromParent();
3106   return true;
3107 }
3108 
3109 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3110                                                    MachineRegisterInfo &MRI,
3111                                                    MachineIRBuilder &B) const {
3112   Register Res = MI.getOperand(0).getReg();
3113   Register X = MI.getOperand(1).getReg();
3114   Register Y = MI.getOperand(2).getReg();
3115   uint16_t Flags = MI.getFlags();
3116   LLT ResTy = MRI.getType(Res);
3117 
3118   const MachineFunction &MF = B.getMF();
3119   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3120                             MI.getFlag(MachineInstr::FmAfn);
3121 
3122   if (!AllowInaccurateRcp)
3123     return false;
3124 
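  // Approximate 1/Y with the hardware rcp, sharpen it with two Newton-Raphson
  // iterations (r = fma(fma(-y, r, 1), r, r)), then form X * r and apply one
  // final FMA-based residual correction to the product.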
3125   auto NegY = B.buildFNeg(ResTy, Y);
3126   auto One = B.buildFConstant(ResTy, 1.0);
3127 
3128   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3129     .addUse(Y)
3130     .setMIFlags(Flags);
3131 
3132   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3133   R = B.buildFMA(ResTy, Tmp0, R, R);
3134 
3135   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3136   R = B.buildFMA(ResTy, Tmp1, R, R);
3137 
3138   auto Ret = B.buildFMul(ResTy, X, R);
3139   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3140 
3141   B.buildFMA(Res, Tmp2, R, Ret);
3142   MI.eraseFromParent();
3143   return true;
3144 }
3145 
3146 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3147                                          MachineRegisterInfo &MRI,
3148                                          MachineIRBuilder &B) const {
3149   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3150     return true;
3151 
3152   Register Res = MI.getOperand(0).getReg();
3153   Register LHS = MI.getOperand(1).getReg();
3154   Register RHS = MI.getOperand(2).getReg();
3155 
3156   uint16_t Flags = MI.getFlags();
3157 
3158   LLT S16 = LLT::scalar(16);
3159   LLT S32 = LLT::scalar(32);
3160 
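  // Expand f16 division by computing an f32 reciprocal-based estimate, then
  // let amdgcn.div.fixup apply the final fixup and handle the special cases.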
3161   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3162   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3163 
3164   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3165     .addUse(RHSExt.getReg(0))
3166     .setMIFlags(Flags);
3167 
3168   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3169   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3170 
3171   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3172     .addUse(RDst.getReg(0))
3173     .addUse(RHS)
3174     .addUse(LHS)
3175     .setMIFlags(Flags);
3176 
3177   MI.eraseFromParent();
3178   return true;
3179 }
3180 
3181 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3182 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3183 static void toggleSPDenormMode(bool Enable,
3184                                MachineIRBuilder &B,
3185                                const GCNSubtarget &ST,
3186                                AMDGPU::SIModeRegisterDefaults Mode) {
3187   // Set SP denorm mode to this value.
3188   unsigned SPDenormMode =
3189     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3190 
3191   if (ST.hasDenormModeInst()) {
3192     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3193     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3194 
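    // S_DENORM_MODE takes a 4-bit immediate: bits [1:0] select the FP32 denorm
    // mode and bits [3:2] the FP64/FP16 denorm mode, hence the shift below.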
3195     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3196     B.buildInstr(AMDGPU::S_DENORM_MODE)
3197       .addImm(NewDenormModeValue);
3198 
3199   } else {
3200     // Select FP32 bit field in mode register.
3201     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3202                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3203                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3204 
3205     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3206       .addImm(SPDenormMode)
3207       .addImm(SPDenormModeBitField);
3208   }
3209 }
3210 
3211 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3212                                          MachineRegisterInfo &MRI,
3213                                          MachineIRBuilder &B) const {
3214   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3215     return true;
3216 
3217   Register Res = MI.getOperand(0).getReg();
3218   Register LHS = MI.getOperand(1).getReg();
3219   Register RHS = MI.getOperand(2).getReg();
3220   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3221   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3222 
3223   uint16_t Flags = MI.getFlags();
3224 
3225   LLT S32 = LLT::scalar(32);
3226   LLT S1 = LLT::scalar(1);
3227 
3228   auto One = B.buildFConstant(S32, 1.0f);
3229 
3230   auto DenominatorScaled =
3231     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3232       .addUse(LHS)
3233       .addUse(RHS)
3234       .addImm(0)
3235       .setMIFlags(Flags);
3236   auto NumeratorScaled =
3237     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3238       .addUse(LHS)
3239       .addUse(RHS)
3240       .addImm(1)
3241       .setMIFlags(Flags);
3242 
3243   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3244     .addUse(DenominatorScaled.getReg(0))
3245     .setMIFlags(Flags);
3246   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3247 
3248   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3249   // aren't modeled as reading it.
3250   if (!Mode.allFP32Denormals())
3251     toggleSPDenormMode(true, B, ST, Mode);
3252 
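  // Refine the scaled reciprocal with Newton-Raphson FMAs, form the scaled
  // quotient, and compute the residuals that div_fmas/div_fixup consume to
  // produce the final result.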
3253   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3254   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3255   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3256   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3257   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3258   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3259 
3260   if (!Mode.allFP32Denormals())
3261     toggleSPDenormMode(false, B, ST, Mode);
3262 
3263   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3264     .addUse(Fma4.getReg(0))
3265     .addUse(Fma1.getReg(0))
3266     .addUse(Fma3.getReg(0))
3267     .addUse(NumeratorScaled.getReg(1))
3268     .setMIFlags(Flags);
3269 
3270   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3271     .addUse(Fmas.getReg(0))
3272     .addUse(RHS)
3273     .addUse(LHS)
3274     .setMIFlags(Flags);
3275 
3276   MI.eraseFromParent();
3277   return true;
3278 }
3279 
3280 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3281                                          MachineRegisterInfo &MRI,
3282                                          MachineIRBuilder &B) const {
3283   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3284     return true;
3285 
3286   Register Res = MI.getOperand(0).getReg();
3287   Register LHS = MI.getOperand(1).getReg();
3288   Register RHS = MI.getOperand(2).getReg();
3289 
3290   uint16_t Flags = MI.getFlags();
3291 
3292   LLT S64 = LLT::scalar(64);
3293   LLT S1 = LLT::scalar(1);
3294 
3295   auto One = B.buildFConstant(S64, 1.0);
3296 
3297   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3298     .addUse(LHS)
3299     .addUse(RHS)
3300     .addImm(0)
3301     .setMIFlags(Flags);
3302 
3303   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3304 
3305   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3306     .addUse(DivScale0.getReg(0))
3307     .setMIFlags(Flags);
3308 
3309   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3310   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3311   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3312 
3313   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3314     .addUse(LHS)
3315     .addUse(RHS)
3316     .addImm(1)
3317     .setMIFlags(Flags);
3318 
3319   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3320   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3321   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3322 
3323   Register Scale;
3324   if (!ST.hasUsableDivScaleConditionOutput()) {
3325     // Workaround a hardware bug on SI where the condition output from div_scale
3326     // is not usable.
3327 
3328     LLT S32 = LLT::scalar(32);
3329 
3330     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3331     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3332     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3333     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3334 
3335     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3336                               Scale1Unmerge.getReg(1));
3337     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3338                               Scale0Unmerge.getReg(1));
3339     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3340   } else {
3341     Scale = DivScale1.getReg(1);
3342   }
3343 
3344   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3345     .addUse(Fma4.getReg(0))
3346     .addUse(Fma3.getReg(0))
3347     .addUse(Mul.getReg(0))
3348     .addUse(Scale)
3349     .setMIFlags(Flags);
3350 
3351   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3352     .addUse(Fmas.getReg(0))
3353     .addUse(RHS)
3354     .addUse(LHS)
3355     .setMIFlags(Flags);
3356 
3357   MI.eraseFromParent();
3358   return true;
3359 }
3360 
3361 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3362                                                  MachineRegisterInfo &MRI,
3363                                                  MachineIRBuilder &B) const {
3364   Register Res = MI.getOperand(0).getReg();
3365   Register LHS = MI.getOperand(2).getReg();
3366   Register RHS = MI.getOperand(3).getReg();
3367   uint16_t Flags = MI.getFlags();
3368 
3369   LLT S32 = LLT::scalar(32);
3370   LLT S1 = LLT::scalar(1);
3371 
3372   auto Abs = B.buildFAbs(S32, RHS, Flags);
3373   const APFloat C0Val(1.0f);
3374 
3375   auto C0 = B.buildConstant(S32, 0x6f800000);
3376   auto C1 = B.buildConstant(S32, 0x2f800000);
3377   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3378 
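  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: if |RHS| is very large,
  // pre-scale it by 2^-32 so the reciprocal stays in range, then apply the
  // same scale factor to the final product to compensate.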
3379   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3380   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3381 
3382   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3383 
3384   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3385     .addUse(Mul0.getReg(0))
3386     .setMIFlags(Flags);
3387 
3388   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3389 
3390   B.buildFMul(Res, Sel, Mul1, Flags);
3391 
3392   MI.eraseFromParent();
3393   return true;
3394 }
3395 
3396 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3397 // FIXME: Why do we handle this one but not other removed instructions?
3398 //
3399 // Reciprocal square root.  The clamp prevents infinite results, clamping
3400 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3401 // +-max_float.
3402 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3403                                                     MachineRegisterInfo &MRI,
3404                                                     MachineIRBuilder &B) const {
3405   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3406     return true;
3407 
3408   Register Dst = MI.getOperand(0).getReg();
3409   Register Src = MI.getOperand(2).getReg();
3410   auto Flags = MI.getFlags();
3411 
3412   LLT Ty = MRI.getType(Dst);
3413 
3414   const fltSemantics *FltSemantics;
3415   if (Ty == LLT::scalar(32))
3416     FltSemantics = &APFloat::IEEEsingle();
3417   else if (Ty == LLT::scalar(64))
3418     FltSemantics = &APFloat::IEEEdouble();
3419   else
3420     return false;
3421 
3422   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3423     .addUse(Src)
3424     .setMIFlags(Flags);
3425 
3426   // We don't need to concern ourselves with the snan handling difference, since
3427   // the rsq already quieted (or not) any snan; use the variant that will directly select.
3428   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3429   const bool UseIEEE = MFI->getMode().IEEE;
3430 
3431   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3432   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3433                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3434 
3435   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3436 
3437   if (UseIEEE)
3438     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3439   else
3440     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3441   MI.eraseFromParent();
3442   return true;
3443 }
3444 
3445 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3446   switch (IID) {
3447   case Intrinsic::amdgcn_ds_fadd:
3448     return AMDGPU::G_ATOMICRMW_FADD;
3449   case Intrinsic::amdgcn_ds_fmin:
3450     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3451   case Intrinsic::amdgcn_ds_fmax:
3452     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3453   default:
3454     llvm_unreachable("not a DS FP intrinsic");
3455   }
3456 }
3457 
3458 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3459                                                       MachineInstr &MI,
3460                                                       Intrinsic::ID IID) const {
3461   GISelChangeObserver &Observer = Helper.Observer;
3462   Observer.changingInstr(MI);
3463 
3464   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3465 
3466   // The remaining operands were used to set fields in the MemOperand on
3467   // construction.
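  // (Operands 4-6 of the intrinsic are the ordering, scope and isVolatile
  // immediates, which the MMO already captures.)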
3468   for (int I = 6; I > 3; --I)
3469     MI.RemoveOperand(I);
3470 
3471   MI.RemoveOperand(1); // Remove the intrinsic ID.
3472   Observer.changedInstr(MI);
3473   return true;
3474 }
3475 
3476 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3477                                             MachineRegisterInfo &MRI,
3478                                             MachineIRBuilder &B) const {
3479   uint64_t Offset =
3480     ST.getTargetLowering()->getImplicitParameterOffset(
3481       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3482   LLT DstTy = MRI.getType(DstReg);
3483   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3484 
3485   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3486   if (!loadInputValue(KernargPtrReg, B,
3487                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3488     return false;
3489 
3490   // FIXME: This should be nuw
3491   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3492   return true;
3493 }
3494 
3495 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3496                                                  MachineRegisterInfo &MRI,
3497                                                  MachineIRBuilder &B) const {
3498   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3499   if (!MFI->isEntryFunction()) {
3500     return legalizePreloadedArgIntrin(MI, MRI, B,
3501                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3502   }
3503 
3504   Register DstReg = MI.getOperand(0).getReg();
3505   if (!getImplicitArgPtr(DstReg, MRI, B))
3506     return false;
3507 
3508   MI.eraseFromParent();
3509   return true;
3510 }
3511 
3512 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3513                                               MachineRegisterInfo &MRI,
3514                                               MachineIRBuilder &B,
3515                                               unsigned AddrSpace) const {
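  // A generic (flat) pointer lies in the queried address space iff the high 32
  // bits of its address equal that segment's aperture base.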
3516   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3517   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3518   Register Hi32 = Unmerge.getReg(1);
3519 
3520   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3521   MI.eraseFromParent();
3522   return true;
3523 }
3524 
3525 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3526 // offset (the offset that is included in bounds checking and swizzling, to be
3527 // split between the instruction's voffset and immoffset fields) and soffset
3528 // (the offset that is excluded from bounds checking and swizzling, to go in
3529 // the instruction's soffset field).  This function takes the first kind of
3530 // offset and figures out how to split it between voffset and immoffset.
3531 std::tuple<Register, unsigned, unsigned>
3532 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3533                                         Register OrigOffset) const {
3534   const unsigned MaxImm = 4095;
3535   Register BaseReg;
3536   unsigned TotalConstOffset;
3537   const LLT S32 = LLT::scalar(32);
3538   MachineRegisterInfo &MRI = *B.getMRI();
3539 
3540   std::tie(BaseReg, TotalConstOffset) =
3541       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3542 
3543   unsigned ImmOffset = TotalConstOffset;
3544 
3545   // If BaseReg is a pointer, convert it to int.
3546   if (MRI.getType(BaseReg).isPointer())
3547     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3548 
3549   // If the immediate value is too big for the immoffset field, keep only the
3550   // low 12 bits (value & 4095) in the immoffset field so that the value that is
3551   // copied/added for the voffset field is a multiple of 4096, and it stands more
3552   // chance of being CSEd with the copy/add for another similar load/store.
3553   // However, do not do that rounding down to a multiple of 4096 if that is a
3554   // negative number, as it appears to be illegal to have a negative offset
3555   // in the vgpr, even if adding the immediate offset makes it positive.
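  // For example (illustrative values): a constant offset of 5000 has
  // Overflow = 5000 & ~4095 = 4096, leaving ImmOffset = 904 and adding 4096 to
  // the voffset; an offset of 100 stays entirely in ImmOffset.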
3556   unsigned Overflow = ImmOffset & ~MaxImm;
3557   ImmOffset -= Overflow;
3558   if ((int32_t)Overflow < 0) {
3559     Overflow += ImmOffset;
3560     ImmOffset = 0;
3561   }
3562 
3563   if (Overflow != 0) {
3564     if (!BaseReg) {
3565       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3566     } else {
3567       auto OverflowVal = B.buildConstant(S32, Overflow);
3568       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3569     }
3570   }
3571 
3572   if (!BaseReg)
3573     BaseReg = B.buildConstant(S32, 0).getReg(0);
3574 
3575   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3576 }
3577 
3578 /// Handle register layout difference for f16 images for some subtargets.
3579 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3580                                              MachineRegisterInfo &MRI,
3581                                              Register Reg,
3582                                              bool ImageStore) const {
3583   const LLT S16 = LLT::scalar(16);
3584   const LLT S32 = LLT::scalar(32);
3585   LLT StoreVT = MRI.getType(Reg);
3586   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3587 
3588   if (ST.hasUnpackedD16VMem()) {
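    // Unpacked subtargets keep each 16-bit element in the low half of a 32-bit
    // register, so e.g. a <4 x s16> source becomes <4 x s32> with each element
    // any-extended.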
3589     auto Unmerge = B.buildUnmerge(S16, Reg);
3590 
3591     SmallVector<Register, 4> WideRegs;
3592     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3593       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3594 
3595     int NumElts = StoreVT.getNumElements();
3596 
3597     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3598   }
3599 
3600   if (ImageStore && ST.hasImageStoreD16Bug()) {
3601     if (StoreVT.getNumElements() == 2) {
3602       SmallVector<Register, 4> PackedRegs;
3603       Reg = B.buildBitcast(S32, Reg).getReg(0);
3604       PackedRegs.push_back(Reg);
3605       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3606       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3607     }
3608 
3609     if (StoreVT.getNumElements() == 3) {
3610       SmallVector<Register, 4> PackedRegs;
3611       auto Unmerge = B.buildUnmerge(S16, Reg);
3612       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3613         PackedRegs.push_back(Unmerge.getReg(I));
3614       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3615       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3616       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3617     }
3618 
3619     if (StoreVT.getNumElements() == 4) {
3620       SmallVector<Register, 4> PackedRegs;
3621       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3622       auto Unmerge = B.buildUnmerge(S32, Reg);
3623       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3624         PackedRegs.push_back(Unmerge.getReg(I));
3625       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3626       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3627     }
3628 
3629     llvm_unreachable("invalid data type");
3630   }
3631 
3632   return Reg;
3633 }
3634 
3635 Register AMDGPULegalizerInfo::fixStoreSourceType(
3636   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3637   MachineRegisterInfo *MRI = B.getMRI();
3638   LLT Ty = MRI->getType(VData);
3639 
3640   const LLT S16 = LLT::scalar(16);
3641 
3642   // Fixup illegal register types for i8 stores.
3643   if (Ty == LLT::scalar(8) || Ty == S16) {
3644     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3645     return AnyExt;
3646   }
3647 
3648   if (Ty.isVector()) {
3649     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3650       if (IsFormat)
3651         return handleD16VData(B, *MRI, VData);
3652     }
3653   }
3654 
3655   return VData;
3656 }
3657 
3658 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3659                                               MachineRegisterInfo &MRI,
3660                                               MachineIRBuilder &B,
3661                                               bool IsTyped,
3662                                               bool IsFormat) const {
3663   Register VData = MI.getOperand(1).getReg();
3664   LLT Ty = MRI.getType(VData);
3665   LLT EltTy = Ty.getScalarType();
3666   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3667   const LLT S32 = LLT::scalar(32);
3668 
3669   VData = fixStoreSourceType(B, VData, IsFormat);
3670   Register RSrc = MI.getOperand(2).getReg();
3671 
3672   MachineMemOperand *MMO = *MI.memoperands_begin();
3673   const int MemSize = MMO->getSize();
3674 
3675   unsigned ImmOffset;
3676   unsigned TotalOffset;
3677 
3678   // The typed intrinsics add an immediate after the registers.
3679   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3680 
3681   // The struct intrinsic variants add one additional operand over raw.
3682   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3683   Register VIndex;
3684   int OpOffset = 0;
3685   if (HasVIndex) {
3686     VIndex = MI.getOperand(3).getReg();
3687     OpOffset = 1;
3688   }
3689 
3690   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3691   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3692 
3693   unsigned Format = 0;
3694   if (IsTyped) {
3695     Format = MI.getOperand(5 + OpOffset).getImm();
3696     ++OpOffset;
3697   }
3698 
3699   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3700 
3701   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3702   if (TotalOffset != 0)
3703     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3704 
3705   unsigned Opc;
3706   if (IsTyped) {
3707     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3708                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3709   } else if (IsFormat) {
3710     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3711                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3712   } else {
3713     switch (MemSize) {
3714     case 1:
3715       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3716       break;
3717     case 2:
3718       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3719       break;
3720     default:
3721       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3722       break;
3723     }
3724   }
3725 
3726   if (!VIndex)
3727     VIndex = B.buildConstant(S32, 0).getReg(0);
3728 
3729   auto MIB = B.buildInstr(Opc)
3730     .addUse(VData)              // vdata
3731     .addUse(RSrc)               // rsrc
3732     .addUse(VIndex)             // vindex
3733     .addUse(VOffset)            // voffset
3734     .addUse(SOffset)            // soffset
3735     .addImm(ImmOffset);         // offset(imm)
3736 
3737   if (IsTyped)
3738     MIB.addImm(Format);
3739 
3740   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3741      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3742      .addMemOperand(MMO);
3743 
3744   MI.eraseFromParent();
3745   return true;
3746 }
3747 
3748 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3749                                              MachineRegisterInfo &MRI,
3750                                              MachineIRBuilder &B,
3751                                              bool IsFormat,
3752                                              bool IsTyped) const {
3753   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3754   MachineMemOperand *MMO = *MI.memoperands_begin();
3755   const int MemSize = MMO->getSize();
3756   const LLT S32 = LLT::scalar(32);
3757 
3758   Register Dst = MI.getOperand(0).getReg();
3759   Register RSrc = MI.getOperand(2).getReg();
3760 
3761   // The typed intrinsics add an immediate after the registers.
3762   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3763 
3764   // The struct intrinsic variants add one additional operand over raw.
3765   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3766   Register VIndex;
3767   int OpOffset = 0;
3768   if (HasVIndex) {
3769     VIndex = MI.getOperand(3).getReg();
3770     OpOffset = 1;
3771   }
3772 
3773   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3774   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3775 
3776   unsigned Format = 0;
3777   if (IsTyped) {
3778     Format = MI.getOperand(5 + OpOffset).getImm();
3779     ++OpOffset;
3780   }
3781 
3782   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3783   unsigned ImmOffset;
3784   unsigned TotalOffset;
3785 
3786   LLT Ty = MRI.getType(Dst);
3787   LLT EltTy = Ty.getScalarType();
3788   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3789   const bool Unpacked = ST.hasUnpackedD16VMem();
3790 
3791   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3792   if (TotalOffset != 0)
3793     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3794 
3795   unsigned Opc;
3796 
3797   if (IsTyped) {
3798     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3799                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3800   } else if (IsFormat) {
3801     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3802                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3803   } else {
3804     switch (MemSize) {
3805     case 1:
3806       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3807       break;
3808     case 2:
3809       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3810       break;
3811     default:
3812       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3813       break;
3814     }
3815   }
3816 
3817   Register LoadDstReg;
3818 
3819   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3820   LLT UnpackedTy = Ty.changeElementSize(32);
3821 
3822   if (IsExtLoad)
3823     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3824   else if (Unpacked && IsD16 && Ty.isVector())
3825     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3826   else
3827     LoadDstReg = Dst;
3828 
3829   if (!VIndex)
3830     VIndex = B.buildConstant(S32, 0).getReg(0);
3831 
3832   auto MIB = B.buildInstr(Opc)
3833     .addDef(LoadDstReg)         // vdata
3834     .addUse(RSrc)               // rsrc
3835     .addUse(VIndex)             // vindex
3836     .addUse(VOffset)            // voffset
3837     .addUse(SOffset)            // soffset
3838     .addImm(ImmOffset);         // offset(imm)
3839 
3840   if (IsTyped)
3841     MIB.addImm(Format);
3842 
3843   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3844      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3845      .addMemOperand(MMO);
3846 
3847   if (LoadDstReg != Dst) {
3848     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3849 
3850     // The result was widened for an extending load; truncate back to the original type.
3851     if (IsExtLoad)
3852       B.buildTrunc(Dst, LoadDstReg);
3853     else {
3854       // Repack to original 16-bit vector result
3855       // FIXME: G_TRUNC should work, but legalization currently fails
3856       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3857       SmallVector<Register, 4> Repack;
3858       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3859         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3860       B.buildMerge(Dst, Repack);
3861     }
3862   }
3863 
3864   MI.eraseFromParent();
3865   return true;
3866 }
3867 
3868 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3869                                                MachineIRBuilder &B,
3870                                                bool IsInc) const {
3871   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3872                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3873   B.buildInstr(Opc)
3874     .addDef(MI.getOperand(0).getReg())
3875     .addUse(MI.getOperand(2).getReg())
3876     .addUse(MI.getOperand(3).getReg())
3877     .cloneMemRefs(MI);
3878   MI.eraseFromParent();
3879   return true;
3880 }
3881 
3882 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3883   switch (IntrID) {
3884   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3885   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3886     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3887   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3888   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3889     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3890   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3891   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3892     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3893   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3894   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3895     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3896   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3897   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3898     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3899   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3900   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3901     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3902   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3903   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3904     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3905   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3906   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3907     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3908   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3909   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3910     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3911   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3912   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3913     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3914   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3915   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3916     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3917   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3918   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3919     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3920   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3921   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3922     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3923   case Intrinsic::amdgcn_buffer_atomic_fadd:
3924   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3925   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3926     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3927   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3928   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3929     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3930   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3931   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3932     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3933   default:
3934     llvm_unreachable("unhandled atomic opcode");
3935   }
3936 }
3937 
3938 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3939                                                MachineIRBuilder &B,
3940                                                Intrinsic::ID IID) const {
3941   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3942                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3943   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3944 
3945   Register Dst;
3946 
3947   int OpOffset = 0;
3948   if (HasReturn) {
3949     // A few FP atomics do not support return values.
3950     Dst = MI.getOperand(0).getReg();
3951   } else {
3952     OpOffset = -1;
3953   }
3954 
3955   Register VData = MI.getOperand(2 + OpOffset).getReg();
3956   Register CmpVal;
3957 
3958   if (IsCmpSwap) {
3959     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3960     ++OpOffset;
3961   }
3962 
3963   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3964   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3965 
3966   // The struct intrinsic variants add one additional operand over raw.
3967   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3968   Register VIndex;
3969   if (HasVIndex) {
3970     VIndex = MI.getOperand(4 + OpOffset).getReg();
3971     ++OpOffset;
3972   }
3973 
3974   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3975   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3976   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3977 
3978   MachineMemOperand *MMO = *MI.memoperands_begin();
3979 
3980   unsigned ImmOffset;
3981   unsigned TotalOffset;
3982   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3983   if (TotalOffset != 0)
3984     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3985 
3986   if (!VIndex)
3987     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3988 
3989   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3990 
3991   if (HasReturn)
3992     MIB.addDef(Dst);
3993 
3994   MIB.addUse(VData); // vdata
3995 
3996   if (IsCmpSwap)
3997     MIB.addReg(CmpVal);
3998 
3999   MIB.addUse(RSrc)               // rsrc
4000      .addUse(VIndex)             // vindex
4001      .addUse(VOffset)            // voffset
4002      .addUse(SOffset)            // soffset
4003      .addImm(ImmOffset)          // offset(imm)
4004      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4005      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4006      .addMemOperand(MMO);
4007 
4008   MI.eraseFromParent();
4009   return true;
4010 }
4011 
4012 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4013 /// vector with s16 typed elements.
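/// For example, with A16 a 2D sample's (u, v) coordinates become a single
/// <2 x s16> register, and a trailing odd coordinate or derivative is padded
/// with undef.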
4014 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4015                                       SmallVectorImpl<Register> &PackedAddrs,
4016                                       unsigned ArgOffset,
4017                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4018                                       bool IsA16, bool IsG16) {
4019   const LLT S16 = LLT::scalar(16);
4020   const LLT V2S16 = LLT::vector(2, 16);
4021   auto EndIdx = Intr->VAddrEnd;
4022 
4023   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4024     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4025     if (!SrcOp.isReg())
4026       continue; // _L to _LZ may have eliminated this.
4027 
4028     Register AddrReg = SrcOp.getReg();
4029 
4030     if (I < Intr->GradientStart) {
4031       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4032       PackedAddrs.push_back(AddrReg);
4033     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4034                (I >= Intr->CoordStart && !IsA16)) {
4035       // Handle any gradient or coordinate operands that should not be packed
4036       PackedAddrs.push_back(AddrReg);
4037     } else {
4038       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4039       // derivatives dx/dh and dx/dv are packed with undef.
4040       if (((I + 1) >= EndIdx) ||
4041           ((Intr->NumGradients / 2) % 2 == 1 &&
4042            (I == static_cast<unsigned>(Intr->GradientStart +
4043                                        (Intr->NumGradients / 2) - 1) ||
4044             I == static_cast<unsigned>(Intr->GradientStart +
4045                                        Intr->NumGradients - 1))) ||
4046           // Check for _L to _LZ optimization
4047           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4048         PackedAddrs.push_back(
4049             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4050                 .getReg(0));
4051       } else {
4052         PackedAddrs.push_back(
4053             B.buildBuildVector(
4054                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4055                 .getReg(0));
4056         ++I;
4057       }
4058     }
4059   }
4060 }
4061 
4062 /// Convert from separate vaddr components to a single vector address register,
4063 /// and replace the remaining operands with $noreg.
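/// For example, three s32 address operands become one <3 x s32> build_vector
/// in the first vaddr slot, and the remaining slots are set to $noreg.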
4064 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4065                                      int DimIdx, int NumVAddrs) {
4066   const LLT S32 = LLT::scalar(32);
4067 
4068   SmallVector<Register, 8> AddrRegs;
4069   for (int I = 0; I != NumVAddrs; ++I) {
4070     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4071     if (SrcOp.isReg()) {
4072       AddrRegs.push_back(SrcOp.getReg());
4073       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4074     }
4075   }
4076 
4077   int NumAddrRegs = AddrRegs.size();
4078   if (NumAddrRegs != 1) {
4079     // Round up to 8 elements for v5-v7
4080     // FIXME: Missing intermediate sized register classes and instructions.
4081     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4082       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4083       auto Undef = B.buildUndef(S32);
4084       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4085       NumAddrRegs = RoundedNumRegs;
4086     }
4087 
4088     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4089     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4090   }
4091 
4092   for (int I = 1; I != NumVAddrs; ++I) {
4093     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4094     if (SrcOp.isReg())
4095       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4096   }
4097 }
4098 
4099 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4100 ///
4101 /// Depending on the subtarget, load/store with 16-bit element data need to be
4102 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4103 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4104 /// registers.
4105 ///
4106 /// We don't want to directly select image instructions just yet, but also want
4107 /// to expose all register repacking to the legalizer/combiners. We also don't
4108 /// want a selected instruction entering RegBankSelect. In order to avoid
4109 /// defining a multitude of intermediate image instructions, directly hack on
4110 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4111 /// now-unnecessary arguments with $noreg.
4112 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4113     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4114     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4115 
4116   const unsigned NumDefs = MI.getNumExplicitDefs();
4117   const unsigned ArgOffset = NumDefs + 1;
4118   bool IsTFE = NumDefs == 2;
4119   // We are only processing the operands of d16 image operations on subtargets
4120   // that use the unpacked register layout, or need to repack the TFE result.
4121 
4122   // TODO: Do we need to guard against already legalized intrinsics?
4123   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4124       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4125 
4126   MachineRegisterInfo *MRI = B.getMRI();
4127   const LLT S32 = LLT::scalar(32);
4128   const LLT S16 = LLT::scalar(16);
4129   const LLT V2S16 = LLT::vector(2, 16);
4130 
4131   unsigned DMask = 0;
4132 
4133   // Check for 16 bit addresses and pack if true.
4134   LLT GradTy =
4135       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4136   LLT AddrTy =
4137       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4138   const bool IsG16 = GradTy == S16;
4139   const bool IsA16 = AddrTy == S16;
4140 
4141   int DMaskLanes = 0;
4142   if (!BaseOpcode->Atomic) {
4143     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4144     if (BaseOpcode->Gather4) {
4145       DMaskLanes = 4;
4146     } else if (DMask != 0) {
4147       DMaskLanes = countPopulation(DMask);
4148     } else if (!IsTFE && !BaseOpcode->Store) {
4149       // If dmask is 0, this is a no-op load. This can be eliminated.
4150       B.buildUndef(MI.getOperand(0));
4151       MI.eraseFromParent();
4152       return true;
4153     }
4154   }
4155 
4156   Observer.changingInstr(MI);
4157   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4158 
4159   unsigned NewOpcode = NumDefs == 0 ?
4160     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4161 
4162   // Track that we legalized this
4163   MI.setDesc(B.getTII().get(NewOpcode));
4164 
  // Expecting to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4167   if (IsTFE && DMask == 0) {
4168     DMask = 0x1;
4169     DMaskLanes = 1;
4170     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4171   }
4172 
4173   if (BaseOpcode->Atomic) {
4174     Register VData0 = MI.getOperand(2).getReg();
4175     LLT Ty = MRI->getType(VData0);
4176 
4177     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4178     if (Ty.isVector())
4179       return false;
4180 
4181     if (BaseOpcode->AtomicX2) {
4182       Register VData1 = MI.getOperand(3).getReg();
4183       // The two values are packed in one register.
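      // e.g. an s32 atomic cmpswap's data and compare values become a single
      // v2s32 operand, and the old second data operand is cleared to $noreg.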
4184       LLT PackedTy = LLT::vector(2, Ty);
4185       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4186       MI.getOperand(2).setReg(Concat.getReg(0));
4187       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4188     }
4189   }
4190 
4191   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4192 
4193   // Optimize _L to _LZ when _L is zero
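  // (e.g. a sample_l with a constant zero or negative LOD becomes the
  // corresponding sample_lz, and the LOD vaddr operand is removed below).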
4194   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4195           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4196     const ConstantFP *ConstantLod;
4197 
4198     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4199                  m_GFCst(ConstantLod))) {
4200       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4201         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4202         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4203             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4204                                                       Intr->Dim);
4205 
4206         // The starting indexes should remain in the same place.
4207         --CorrectedNumVAddrs;
4208 
4209         MI.getOperand(MI.getNumExplicitDefs())
4210             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4211         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4212         Intr = NewImageDimIntr;
4213       }
4214     }
4215   }
4216 
  // Optimize _mip away when 'lod' is zero.
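  // (e.g. an image load_mip whose mip level is a constant 0 keeps the operand
  // but folds it to an immediate 0 for now; see the TODO below).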
4218   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4219     int64_t ConstantLod;
4220     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4221                  m_ICst(ConstantLod))) {
4222       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4225         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4226         --CorrectedNumVAddrs;
4227       }
4228     }
4229   }
4230 
4231   // Rewrite the addressing register layout before doing anything else.
4232   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4233     // 16 bit gradients are supported, but are tied to the A16 control
4234     // so both gradients and addresses must be 16 bit
4235     return false;
4236   }
4237 
4238   if (IsA16 && !ST.hasA16()) {
4239     // A16 not supported
4240     return false;
4241   }
4242 
4243   if (IsA16 || IsG16) {
4244     if (Intr->NumVAddrs > 1) {
4245       SmallVector<Register, 4> PackedRegs;
4246 
4247       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4248                                 IsG16);
4249 
4250       // See also below in the non-a16 branch
4251       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4252 
4253       if (!UseNSA && PackedRegs.size() > 1) {
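        // e.g. two v2s16 address registers are concatenated into a single
        // v4s16 vaddr operand when NSA is not used.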
4254         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4255         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4256         PackedRegs[0] = Concat.getReg(0);
4257         PackedRegs.resize(1);
4258       }
4259 
4260       const unsigned NumPacked = PackedRegs.size();
4261       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4262         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4263         if (!SrcOp.isReg()) {
4264           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4265           continue;
4266         }
4267 
4268         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4269 
4270         if (I - Intr->VAddrStart < NumPacked)
4271           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4272         else
4273           SrcOp.setReg(AMDGPU::NoRegister);
4274       }
4275     }
4276   } else {
4277     // If the register allocator cannot place the address registers contiguously
4278     // without introducing moves, then using the non-sequential address encoding
4279     // is always preferable, since it saves VALU instructions and is usually a
4280     // wash in terms of code size or even better.
4281     //
4282     // However, we currently have no way of hinting to the register allocator
4283     // that MIMG addresses should be placed contiguously when it is possible to
4284     // do so, so force non-NSA for the common 2-address case as a heuristic.
4285     //
4286     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4287     // allocation when possible.
4288     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4289 
4290     if (!UseNSA && Intr->NumVAddrs > 1)
4291       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4292                                Intr->NumVAddrs);
4293   }
4294 
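  // Record the A16/G16 state in a trailing immediate flag operand:
  // bit 0 = 16-bit addresses (A16), bit 1 = 16-bit gradients (G16).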
4295   int Flags = 0;
4296   if (IsA16)
4297     Flags |= 1;
4298   if (IsG16)
4299     Flags |= 2;
4300   MI.addOperand(MachineOperand::CreateImm(Flags));
4301 
4302   if (BaseOpcode->Store) { // No TFE for stores?
4303     // TODO: Handle dmask trim
4304     Register VData = MI.getOperand(1).getReg();
4305     LLT Ty = MRI->getType(VData);
4306     if (!Ty.isVector() || Ty.getElementType() != S16)
4307       return true;
4308 
4309     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4310     if (RepackedReg != VData) {
4311       MI.getOperand(1).setReg(RepackedReg);
4312     }
4313 
4314     return true;
4315   }
4316 
4317   Register DstReg = MI.getOperand(0).getReg();
4318   LLT Ty = MRI->getType(DstReg);
4319   const LLT EltTy = Ty.getScalarType();
4320   const bool IsD16 = Ty.getScalarType() == S16;
4321   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4322 
4323   // Confirm that the return type is large enough for the dmask specified
4324   if (NumElts < DMaskLanes)
4325     return false;
4326 
4327   if (NumElts > 4 || DMaskLanes > 4)
4328     return false;
4329 
4330   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4331   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4332 
4333   // The raw dword aligned data component of the load. The only legal cases
4334   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4336   LLT RoundedTy;
4337 
  // S32 vector to cover all data, plus the TFE result element.
4339   LLT TFETy;
4340 
4341   // Register type to use for each loaded component. Will be S32 or V2S16.
4342   LLT RegTy;
4343 
4344   if (IsD16 && ST.hasUnpackedD16VMem()) {
4345     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4346     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4347     RegTy = S32;
4348   } else {
4349     unsigned EltSize = EltTy.getSizeInBits();
4350     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4351     unsigned RoundedSize = 32 * RoundedElts;
4352     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4353     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4354     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
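    // e.g. for a packed d16 load with 3 active dmask lanes: AdjustedTy is
    // <3 x s16>, RoundedTy is <4 x s16>, TFETy is <3 x s32>, and RegTy is
    // v2s16 (or s32 when TFE is used).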
4355   }
4356 
4357   // The return type does not need adjustment.
4358   // TODO: Should we change s16 case to s32 or <2 x s16>?
4359   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4360     return true;
4361 
4362   Register Dst1Reg;
4363 
4364   // Insert after the instruction.
4365   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4366 
4367   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4368   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4369   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4370   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4371 
4372   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4373 
4374   MI.getOperand(0).setReg(NewResultReg);
4375 
4376   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
4378   // register, with one additional dword beyond the loaded data. Rewrite the
4379   // return type to use a single register result.
4380 
4381   if (IsTFE) {
4382     Dst1Reg = MI.getOperand(1).getReg();
4383     if (MRI->getType(Dst1Reg) != S32)
4384       return false;
4385 
4386     // TODO: Make sure the TFE operand bit is set.
4387     MI.RemoveOperand(1);
4388 
4389     // Handle the easy case that requires no repack instructions.
4390     if (Ty == S32) {
4391       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4392       return true;
4393     }
4394   }
4395 
4396   // Now figure out how to copy the new result register back into the old
4397   // result.
4398   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4399 
4400   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4401 
4402   if (ResultNumRegs == 1) {
4403     assert(!IsTFE);
4404     ResultRegs[0] = NewResultReg;
4405   } else {
4406     // We have to repack into a new vector of some kind.
4407     for (int I = 0; I != NumDataRegs; ++I)
4408       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4409     B.buildUnmerge(ResultRegs, NewResultReg);
4410 
4411     // Drop the final TFE element to get the data part. The TFE result is
4412     // directly written to the right place already.
4413     if (IsTFE)
4414       ResultRegs.resize(NumDataRegs);
4415   }
4416 
4417   // For an s16 scalar result, we form an s32 result with a truncate regardless
4418   // of packed vs. unpacked.
4419   if (IsD16 && !Ty.isVector()) {
4420     B.buildTrunc(DstReg, ResultRegs[0]);
4421     return true;
4422   }
4423 
4424   // Avoid a build/concat_vector of 1 entry.
4425   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4426     B.buildBitcast(DstReg, ResultRegs[0]);
4427     return true;
4428   }
4429 
4430   assert(Ty.isVector());
4431 
4432   if (IsD16) {
4433     // For packed D16 results with TFE enabled, all the data components are
4434     // S32. Cast back to the expected type.
4435     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4438     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4439       for (Register &Reg : ResultRegs)
4440         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4441     } else if (ST.hasUnpackedD16VMem()) {
4442       for (Register &Reg : ResultRegs)
4443         Reg = B.buildTrunc(S16, Reg).getReg(0);
4444     }
4445   }
4446 
4447   auto padWithUndef = [&](LLT Ty, int NumElts) {
4448     if (NumElts == 0)
4449       return;
4450     Register Undef = B.buildUndef(Ty).getReg(0);
4451     for (int I = 0; I != NumElts; ++I)
4452       ResultRegs.push_back(Undef);
4453   };
4454 
4455   // Pad out any elements eliminated due to the dmask.
4456   LLT ResTy = MRI->getType(ResultRegs[0]);
4457   if (!ResTy.isVector()) {
4458     padWithUndef(ResTy, NumElts - ResultRegs.size());
4459     B.buildBuildVector(DstReg, ResultRegs);
4460     return true;
4461   }
4462 
4463   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4464   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4465 
4466   // Deal with the one annoying legal case.
4467   const LLT V3S16 = LLT::vector(3, 16);
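  // A v3s16 result covers two s32 registers. Pad the v2s16 pieces with undef
  // up to three registers, concatenate to v6s16, then unmerge and discard the
  // unused high v3s16 half.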
4468   if (Ty == V3S16) {
4469     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4470     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4471     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4472     return true;
4473   }
4474 
4475   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4476   B.buildConcatVectors(DstReg, ResultRegs);
4477   return true;
4478 }
4479 
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  LegalizerHelper &Helper, MachineInstr &MI) const {
4482   MachineIRBuilder &B = Helper.MIRBuilder;
4483   GISelChangeObserver &Observer = Helper.Observer;
4484 
4485   Register Dst = MI.getOperand(0).getReg();
4486   LLT Ty = B.getMRI()->getType(Dst);
4487   unsigned Size = Ty.getSizeInBits();
4488   MachineFunction &MF = B.getMF();
4489 
4490   Observer.changingInstr(MI);
4491 
4492   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4493     Ty = getBitcastRegisterType(Ty);
4494     Helper.bitcastDst(MI, Ty, 0);
4495     Dst = MI.getOperand(0).getReg();
4496     B.setInsertPt(B.getMBB(), MI);
4497   }
4498 
4499   // FIXME: We don't really need this intermediate instruction. The intrinsic
4500   // should be fixed to have a memory operand. Since it's readnone, we're not
4501   // allowed to add one.
4502   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4503   MI.RemoveOperand(1); // Remove intrinsic ID
4504 
4505   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4506   // TODO: Should this use datalayout alignment?
4507   const unsigned MemSize = (Size + 7) / 8;
4508   const Align MemAlign(4);
4509   MachineMemOperand *MMO = MF.getMachineMemOperand(
4510       MachinePointerInfo(),
4511       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4512           MachineMemOperand::MOInvariant,
4513       MemSize, MemAlign);
4514   MI.addMemOperand(MF, MMO);
4515 
4516   // There are no 96-bit result scalar loads, but widening to 128-bit should
4517   // always be legal. We may need to restore this to a 96-bit result if it turns
4518   // out this needs to be converted to a vector load during RegBankSelect.
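  // e.g. an s96 result widens to s128, and a <3 x s32> result to <4 x s32>.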
4519   if (!isPowerOf2_32(Size)) {
4520     if (Ty.isVector())
4521       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4522     else
4523       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4524   }
4525 
4526   Observer.changedInstr(MI);
4527   return true;
4528 }
4529 
4530 // TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
4534   if (!ST.isTrapHandlerEnabled() ||
4535       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4536     return legalizeTrapEndpgm(MI, MRI, B);
4537 
4538   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4539     switch (*HsaAbiVer) {
4540     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4541     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4542       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4543     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4544       return ST.supportsGetDoorbellID() ?
4545           legalizeTrapHsa(MI, MRI, B) :
4546           legalizeTrapHsaQueuePtr(MI, MRI, B);
4547     }
4548   }
4549 
4550   llvm_unreachable("Unknown trap handler");
4551 }
4552 
bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4555   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4556   MI.eraseFromParent();
4557   return true;
4558 }
4559 
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4562   // Pass queue pointer to trap handler as input, and insert trap instruction
4563   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4564   Register LiveIn =
4565     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4566   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4567     return false;
4568 
4569   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4570   B.buildCopy(SGPR01, LiveIn);
4571   B.buildInstr(AMDGPU::S_TRAP)
4572       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4573       .addReg(SGPR01, RegState::Implicit);
4574 
4575   MI.eraseFromParent();
4576   return true;
4577 }
4578 
bool AMDGPULegalizerInfo::legalizeTrapHsa(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4581   B.buildInstr(AMDGPU::S_TRAP)
4582       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4583   MI.eraseFromParent();
4584   return true;
4585 }
4586 
bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4591   if (!ST.isTrapHandlerEnabled() ||
4592       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4593     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4594                                      "debugtrap handler not supported",
4595                                      MI.getDebugLoc(), DS_Warning);
4596     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4597     Ctx.diagnose(NoTrap);
4598   } else {
4599     // Insert debug-trap instruction
4600     B.buildInstr(AMDGPU::S_TRAP)
4601         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4602   }
4603 
4604   MI.eraseFromParent();
4605   return true;
4606 }
4607 
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
4610   MachineRegisterInfo &MRI = *B.getMRI();
4611   const LLT S16 = LLT::scalar(16);
4612   const LLT S32 = LLT::scalar(32);
4613 
4614   Register DstReg = MI.getOperand(0).getReg();
4615   Register NodePtr = MI.getOperand(2).getReg();
4616   Register RayExtent = MI.getOperand(3).getReg();
4617   Register RayOrigin = MI.getOperand(4).getReg();
4618   Register RayDir = MI.getOperand(5).getReg();
4619   Register RayInvDir = MI.getOperand(6).getReg();
4620   Register TDescr = MI.getOperand(7).getReg();
4621 
4622   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4623   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
4624   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4625                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4626                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4627                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4628 
4629   SmallVector<Register, 12> Ops;
4630   if (Is64) {
4631     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4632     Ops.push_back(Unmerge.getReg(0));
4633     Ops.push_back(Unmerge.getReg(1));
4634   } else {
4635     Ops.push_back(NodePtr);
4636   }
4637   Ops.push_back(RayExtent);
4638 
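  // Helper to push the x, y and z lanes of a 128-bit ray operand; only the
  // first three s32 lanes of the unmerge are forwarded.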
4639   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4640     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4641     Ops.push_back(Unmerge.getReg(0));
4642     Ops.push_back(Unmerge.getReg(1));
4643     Ops.push_back(Unmerge.getReg(2));
4644   };
4645 
4646   packLanes(RayOrigin);
4647   if (IsA16) {
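    // With 16-bit components, repack the six direction values into three
    // dwords: (dir.x, dir.y), (dir.z, invdir.x), (invdir.y, invdir.z).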
4648     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4649     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4650     Register R1 = MRI.createGenericVirtualRegister(S32);
4651     Register R2 = MRI.createGenericVirtualRegister(S32);
4652     Register R3 = MRI.createGenericVirtualRegister(S32);
4653     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4654     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4655     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4656     Ops.push_back(R1);
4657     Ops.push_back(R2);
4658     Ops.push_back(R3);
4659   } else {
4660     packLanes(RayDir);
4661     packLanes(RayInvDir);
4662   }
4663 
4664   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4665     .addDef(DstReg)
4666     .addImm(Opcode);
4667 
4668   for (Register R : Ops) {
4669     MIB.addUse(R);
4670   }
4671 
4672   MIB.addUse(TDescr)
4673      .addImm(IsA16 ? 1 : 0)
4674      .cloneMemRefs(MI);
4675 
4676   MI.eraseFromParent();
4677   return true;
4678 }
4679 
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
4682   MachineIRBuilder &B = Helper.MIRBuilder;
4683   MachineRegisterInfo &MRI = *B.getMRI();
4684 
  // Replace the G_BRCOND use of these control-flow intrinsics with the exec
  // manipulation and branch pseudos.
4686   auto IntrID = MI.getIntrinsicID();
4687   switch (IntrID) {
4688   case Intrinsic::amdgcn_if:
4689   case Intrinsic::amdgcn_else: {
4690     MachineInstr *Br = nullptr;
4691     MachineBasicBlock *UncondBrTarget = nullptr;
4692     bool Negated = false;
4693     if (MachineInstr *BrCond =
4694             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4695       const SIRegisterInfo *TRI
4696         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4697 
4698       Register Def = MI.getOperand(1).getReg();
4699       Register Use = MI.getOperand(3).getReg();
4700 
4701       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4702 
4703       if (Negated)
4704         std::swap(CondBrTarget, UncondBrTarget);
4705 
4706       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4707       if (IntrID == Intrinsic::amdgcn_if) {
4708         B.buildInstr(AMDGPU::SI_IF)
4709           .addDef(Def)
4710           .addUse(Use)
4711           .addMBB(UncondBrTarget);
4712       } else {
4713         B.buildInstr(AMDGPU::SI_ELSE)
4714             .addDef(Def)
4715             .addUse(Use)
4716             .addMBB(UncondBrTarget);
4717       }
4718 
4719       if (Br) {
4720         Br->getOperand(0).setMBB(CondBrTarget);
4721       } else {
4722         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4723         // since we're swapping branch targets it needs to be reinserted.
4724         // FIXME: IRTranslator should probably not do this
4725         B.buildBr(*CondBrTarget);
4726       }
4727 
4728       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4729       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4730       MI.eraseFromParent();
4731       BrCond->eraseFromParent();
4732       return true;
4733     }
4734 
4735     return false;
4736   }
4737   case Intrinsic::amdgcn_loop: {
4738     MachineInstr *Br = nullptr;
4739     MachineBasicBlock *UncondBrTarget = nullptr;
4740     bool Negated = false;
4741     if (MachineInstr *BrCond =
4742             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4743       const SIRegisterInfo *TRI
4744         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4745 
4746       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4747       Register Reg = MI.getOperand(2).getReg();
4748 
4749       if (Negated)
4750         std::swap(CondBrTarget, UncondBrTarget);
4751 
4752       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4753       B.buildInstr(AMDGPU::SI_LOOP)
4754         .addUse(Reg)
4755         .addMBB(UncondBrTarget);
4756 
4757       if (Br)
4758         Br->getOperand(0).setMBB(CondBrTarget);
4759       else
4760         B.buildBr(*CondBrTarget);
4761 
4762       MI.eraseFromParent();
4763       BrCond->eraseFromParent();
4764       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4765       return true;
4766     }
4767 
4768     return false;
4769   }
4770   case Intrinsic::amdgcn_kernarg_segment_ptr:
4771     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4772       // This only makes sense to call in a kernel, so just lower to null.
4773       B.buildConstant(MI.getOperand(0).getReg(), 0);
4774       MI.eraseFromParent();
4775       return true;
4776     }
4777 
4778     return legalizePreloadedArgIntrin(
4779       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4780   case Intrinsic::amdgcn_implicitarg_ptr:
4781     return legalizeImplicitArgPtr(MI, MRI, B);
4782   case Intrinsic::amdgcn_workitem_id_x:
4783     return legalizePreloadedArgIntrin(MI, MRI, B,
4784                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4785   case Intrinsic::amdgcn_workitem_id_y:
4786     return legalizePreloadedArgIntrin(MI, MRI, B,
4787                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4788   case Intrinsic::amdgcn_workitem_id_z:
4789     return legalizePreloadedArgIntrin(MI, MRI, B,
4790                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4791   case Intrinsic::amdgcn_workgroup_id_x:
4792     return legalizePreloadedArgIntrin(MI, MRI, B,
4793                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4794   case Intrinsic::amdgcn_workgroup_id_y:
4795     return legalizePreloadedArgIntrin(MI, MRI, B,
4796                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4797   case Intrinsic::amdgcn_workgroup_id_z:
4798     return legalizePreloadedArgIntrin(MI, MRI, B,
4799                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4800   case Intrinsic::amdgcn_dispatch_ptr:
4801     return legalizePreloadedArgIntrin(MI, MRI, B,
4802                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4803   case Intrinsic::amdgcn_queue_ptr:
4804     return legalizePreloadedArgIntrin(MI, MRI, B,
4805                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4806   case Intrinsic::amdgcn_implicit_buffer_ptr:
4807     return legalizePreloadedArgIntrin(
4808       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4809   case Intrinsic::amdgcn_dispatch_id:
4810     return legalizePreloadedArgIntrin(MI, MRI, B,
4811                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4812   case Intrinsic::amdgcn_fdiv_fast:
4813     return legalizeFDIVFastIntrin(MI, MRI, B);
4814   case Intrinsic::amdgcn_is_shared:
4815     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4816   case Intrinsic::amdgcn_is_private:
4817     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4818   case Intrinsic::amdgcn_wavefrontsize: {
4819     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4820     MI.eraseFromParent();
4821     return true;
4822   }
4823   case Intrinsic::amdgcn_s_buffer_load:
4824     return legalizeSBufferLoad(Helper, MI);
4825   case Intrinsic::amdgcn_raw_buffer_store:
4826   case Intrinsic::amdgcn_struct_buffer_store:
4827     return legalizeBufferStore(MI, MRI, B, false, false);
4828   case Intrinsic::amdgcn_raw_buffer_store_format:
4829   case Intrinsic::amdgcn_struct_buffer_store_format:
4830     return legalizeBufferStore(MI, MRI, B, false, true);
4831   case Intrinsic::amdgcn_raw_tbuffer_store:
4832   case Intrinsic::amdgcn_struct_tbuffer_store:
4833     return legalizeBufferStore(MI, MRI, B, true, true);
4834   case Intrinsic::amdgcn_raw_buffer_load:
4835   case Intrinsic::amdgcn_struct_buffer_load:
4836     return legalizeBufferLoad(MI, MRI, B, false, false);
4837   case Intrinsic::amdgcn_raw_buffer_load_format:
4838   case Intrinsic::amdgcn_struct_buffer_load_format:
4839     return legalizeBufferLoad(MI, MRI, B, true, false);
4840   case Intrinsic::amdgcn_raw_tbuffer_load:
4841   case Intrinsic::amdgcn_struct_tbuffer_load:
4842     return legalizeBufferLoad(MI, MRI, B, true, true);
4843   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4844   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4845   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4846   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4847   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4848   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4849   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4850   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4851   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4852   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4853   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4854   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4855   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4856   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4857   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4858   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4859   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4860   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4861   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4862   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4863   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4864   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4865   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4866   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4867   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4868   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4869   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4870   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4871   case Intrinsic::amdgcn_buffer_atomic_fadd:
4872   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4873   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4874   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4875   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4876     return legalizeBufferAtomic(MI, B, IntrID);
4877   case Intrinsic::amdgcn_atomic_inc:
4878     return legalizeAtomicIncDec(MI, B, true);
4879   case Intrinsic::amdgcn_atomic_dec:
4880     return legalizeAtomicIncDec(MI, B, false);
4881   case Intrinsic::trap:
4882     return legalizeTrapIntrinsic(MI, MRI, B);
4883   case Intrinsic::debugtrap:
4884     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4885   case Intrinsic::amdgcn_rsq_clamp:
4886     return legalizeRsqClampIntrinsic(MI, MRI, B);
4887   case Intrinsic::amdgcn_ds_fadd:
4888   case Intrinsic::amdgcn_ds_fmin:
4889   case Intrinsic::amdgcn_ds_fmax:
4890     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4891   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4892     return legalizeBVHIntrinsic(MI, B);
4893   default: {
4894     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4895             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4896       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4897     return true;
4898   }
4899   }
4900 
4901   return true;
4902 }
4903