1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of vector elements up to the next power of two.
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the scalar size in bits up to the next power of two.
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd-sized vector which should be widened by
63 /// adding an additional element. This is mostly to handle <3 x s16> ->
64 /// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
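// Predicate for types whose total size is a multiple of 32 bits.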
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
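// Predicate for vectors of 16-bit elements with more than two elements.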
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
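// Mutation that appends one extra element of the same element type.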
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
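// Mutation that reduces the element count so the vector splits into roughly
// 64-bit pieces.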
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements so that the total size reaches the
117 // next multiple of 32 bits.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
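// Pick a same-sized type that maps directly onto 32-bit registers: a plain
// scalar for sizes up to 32 bits, otherwise s32 or a vector of s32.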
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
137   LLT CoercedTy;
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
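// Mutation that bitcasts to a vector of 32-bit elements; the total size must
// already be a multiple of 32 bits (asserted below).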
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
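// Predicate for vector types whose total size is less than Size bits.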
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
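// Predicate for vector types whose total size is greater than Size bits.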
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
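// Predicate for vectors with an odd number of elements.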
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
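// True if a value of this size fills whole 32-bit registers without exceeding
// the maximum register size.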
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
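// Element types that pack cleanly into 32-bit registers: 16-bit elements or
// any multiple of 32 bits.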
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
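// Vector types whose elements map directly into registers; 16-bit elements are
// only allowed in even-length vectors (multiples of v2s16).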
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size,
212 // and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
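// Predicate for vectors whose element type has native operations: s16 or any
// element of at least 32 bits.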
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
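// Check whether the register size, memory size, and alignment of this memory
// access are directly supported for the queried address space.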
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
334 // work around this. Eventually it should ignore the type for loads and only
335 // care about the size. Return true in cases where we will work around this for
336 // now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
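// A load/store is treated as directly legal only if the result is a register
// type, the size/alignment checks pass, and it does not need the bitcast
// workaround above.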
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd-sized memory
380 /// access up to the alignment. Note that in this case the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
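// Query-based overload: never widen atomic accesses; otherwise defer to the
// size/alignment based check above.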
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
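// Construct the legalization rules for the given subtarget.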
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElements(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElements(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElements(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElements(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
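  // G_PTR_ADD / G_PTRMASK: the integer operand must be the same width as the
  // pointer; mismatched widths are resized to match.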
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .scalarize(0)
934     .widenScalarToNextPow2(0, 32)
935     .widenScalarToNextPow2(1, 32);
936 
937   // The hardware instructions return a different result on 0 than the generic
938   // instructions expect. The hardware produces -1, but these produce the
939   // bitwidth.
940   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941     .scalarize(0)
942     .clampScalar(0, S32, S32)
943     .clampScalar(1, S32, S64)
944     .widenScalarToNextPow2(0, 32)
945     .widenScalarToNextPow2(1, 32)
946     .custom();
947 
948   // The 64-bit versions produce 32-bit results, but only on the SALU.
949   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950     .legalFor({{S32, S32}, {S32, S64}})
951     .clampScalar(0, S32, S32)
952     .clampScalar(1, S32, S64)
953     .scalarize(0)
954     .widenScalarToNextPow2(0, 32)
955     .widenScalarToNextPow2(1, 32);
956 
957   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958   // RegBankSelect.
959   getActionDefinitionsBuilder(G_BITREVERSE)
960     .legalFor({S32, S64})
961     .clampScalar(0, S32, S64)
962     .scalarize(0)
963     .widenScalarToNextPow2(0);
964 
965   if (ST.has16BitInsts()) {
966     getActionDefinitionsBuilder(G_BSWAP)
967       .legalFor({S16, S32, V2S16})
968       .clampMaxNumElements(0, S16, 2)
969       // FIXME: Fixing non-power-of-2 before clamp is workaround for
970       // narrowScalar limitation.
971       .widenScalarToNextPow2(0)
972       .clampScalar(0, S16, S32)
973       .scalarize(0);
974 
975     if (ST.hasVOP3PInsts()) {
976       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977         .legalFor({S32, S16, V2S16})
978         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979         .clampMaxNumElements(0, S16, 2)
980         .minScalar(0, S16)
981         .widenScalarToNextPow2(0)
982         .scalarize(0)
983         .lower();
984     } else {
985       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986         .legalFor({S32, S16})
987         .widenScalarToNextPow2(0)
988         .minScalar(0, S16)
989         .scalarize(0)
990         .lower();
991     }
992   } else {
993     // TODO: Should have same legality without v_perm_b32
994     getActionDefinitionsBuilder(G_BSWAP)
995       .legalFor({S32})
996       .lowerIf(scalarNarrowerThan(0, 32))
997       // FIXME: Fixing non-power-of-2 before clamp is workaround for
998       // narrowScalar limitation.
999       .widenScalarToNextPow2(0)
1000       .maxScalar(0, S32)
1001       .scalarize(0)
1002       .lower();
1003 
1004     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005       .legalFor({S32})
1006       .minScalar(0, S32)
1007       .widenScalarToNextPow2(0)
1008       .scalarize(0)
1009       .lower();
1010   }
1011 
1012   getActionDefinitionsBuilder(G_INTTOPTR)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
1019     .widenScalarIf(smallerThan(1, 0),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022       })
1023     .narrowScalarIf(largerThan(1, 0),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_PTRTOINT)
1029     // List the common cases
1030     .legalForCartesianProduct(AddrSpaces64, {S64})
1031     .legalForCartesianProduct(AddrSpaces32, {S32})
1032     .scalarize(0)
1033     // Accept any address space as long as the size matches
1034     .legalIf(sameSize(0, 1))
1035     .widenScalarIf(smallerThan(0, 1),
1036       [](const LegalityQuery &Query) {
1037         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038       })
1039     .narrowScalarIf(
1040       largerThan(0, 1),
1041       [](const LegalityQuery &Query) {
1042         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043       });
1044 
1045   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046     .scalarize(0)
1047     .custom();
1048 
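  // Returns true if a memory access must be split: it is wider than the
  // address space allows, is oddly sized so it does not map onto whole 32-bit
  // registers, or is insufficiently aligned for the subtarget.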
1049   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050                                     bool IsLoad) -> bool {
1051     const LLT DstTy = Query.Types[0];
1052 
1053     // Split vector extloads.
1054     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1056 
1057     if (MemSize < DstTy.getSizeInBits())
1058       MemSize = std::max(MemSize, AlignBits);
1059 
1060     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1061       return true;
1062 
1063     const LLT PtrTy = Query.Types[1];
1064     unsigned AS = PtrTy.getAddressSpace();
1065     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1066       return true;
1067 
1068     // Catch weird sized loads that don't evenly divide into the access sizes
1069     // TODO: May be able to widen depending on alignment etc.
1070     unsigned NumRegs = (MemSize + 31) / 32;
1071     if (NumRegs == 3) {
1072       if (!ST.hasDwordx3LoadStores())
1073         return true;
1074     } else {
1075       // If the alignment allows, these should have been widened.
1076       if (!isPowerOf2_32(NumRegs))
1077         return true;
1078     }
1079 
1080     if (AlignBits < MemSize) {
1081       const SITargetLowering *TLI = ST.getTargetLowering();
1082       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1083                                                       Align(AlignBits / 8));
1084     }
1085 
1086     return false;
1087   };
1088 
1089   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1090   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1091   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1092 
1093   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1094   // LDS
1095   // TODO: Unsupported flat for SI.
1096 
1097   for (unsigned Op : {G_LOAD, G_STORE}) {
1098     const bool IsStore = Op == G_STORE;
1099 
1100     auto &Actions = getActionDefinitionsBuilder(Op);
1101     // Explicitly list some common cases.
1102     // TODO: Does this help compile time at all?
1103     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1104                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1105                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1106                                       {S64, GlobalPtr, S64, GlobalAlign32},
1107                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1108                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1109                                       {S32, GlobalPtr, S8, GlobalAlign8},
1110                                       {S32, GlobalPtr, S16, GlobalAlign16},
1111 
1112                                       {S32, LocalPtr, S32, 32},
1113                                       {S64, LocalPtr, S64, 32},
1114                                       {V2S32, LocalPtr, V2S32, 32},
1115                                       {S32, LocalPtr, S8, 8},
1116                                       {S32, LocalPtr, S16, 16},
1117                                       {V2S16, LocalPtr, S32, 32},
1118 
1119                                       {S32, PrivatePtr, S32, 32},
1120                                       {S32, PrivatePtr, S8, 8},
1121                                       {S32, PrivatePtr, S16, 16},
1122                                       {V2S16, PrivatePtr, S32, 32},
1123 
1124                                       {S32, ConstantPtr, S32, GlobalAlign32},
1125                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1126                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1127                                       {S64, ConstantPtr, S64, GlobalAlign32},
1128                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1129     Actions.legalIf(
1130       [=](const LegalityQuery &Query) -> bool {
1131         return isLoadStoreLegal(ST, Query);
1132       });
1133 
1134     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1135     // 64-bits.
1136     //
1137     // TODO: Should generalize bitcast action into coerce, which will also cover
1138     // inserting addrspacecasts.
1139     Actions.customIf(typeIs(1, Constant32Ptr));
1140 
1141     // Turn any illegal element vectors into something easier to deal
1142     // with. These will ultimately produce 32-bit scalar shifts to extract the
1143     // parts anyway.
1144     //
1145     // For odd 16-bit element vectors, prefer to split those into pieces with
1146     // 16-bit vector parts.
1147     Actions.bitcastIf(
1148       [=](const LegalityQuery &Query) -> bool {
1149         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1150                                           Query.MMODescrs[0].MemoryTy);
1151       }, bitcastToRegisterType(0));
1152 
1153     if (!IsStore) {
1154       // Widen suitably aligned loads by loading extra bytes. The standard
1155       // legalization actions can't properly express widening memory operands.
1156       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1157         return shouldWidenLoad(ST, Query, G_LOAD);
1158       });
1159     }
1160 
1161     // FIXME: load/store narrowing should be moved to lower action
1162     Actions
1163         .narrowScalarIf(
1164             [=](const LegalityQuery &Query) -> bool {
1165               return !Query.Types[0].isVector() &&
1166                      needToSplitMemOp(Query, Op == G_LOAD);
1167             },
1168             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1169               const LLT DstTy = Query.Types[0];
1170               const LLT PtrTy = Query.Types[1];
1171 
1172               const unsigned DstSize = DstTy.getSizeInBits();
1173               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1174 
1175               // Split extloads.
1176               if (DstSize > MemSize)
1177                 return std::make_pair(0, LLT::scalar(MemSize));
1178 
1179               if (!isPowerOf2_32(DstSize)) {
1180                 // We're probably decomposing an odd sized store. Try to split
1181                 // to the widest type. TODO: Account for alignment. As-is it
1182                 // should be OK, since the new parts will be further legalized.
1183                 unsigned FloorSize = PowerOf2Floor(DstSize);
1184                 return std::make_pair(0, LLT::scalar(FloorSize));
1185               }
1186 
1187               if (DstSize > 32 && (DstSize % 32 != 0)) {
1188                 // FIXME: Need a way to specify non-extload of larger size if
1189                 // suitably aligned.
1190                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1191               }
1192 
1193               unsigned MaxSize = maxSizeForAddrSpace(ST,
1194                                                      PtrTy.getAddressSpace(),
1195                                                      Op == G_LOAD);
1196               if (MemSize > MaxSize)
1197                 return std::make_pair(0, LLT::scalar(MaxSize));
1198 
1199               unsigned Align = Query.MMODescrs[0].AlignInBits;
1200               return std::make_pair(0, LLT::scalar(Align));
1201             })
1202         .fewerElementsIf(
1203             [=](const LegalityQuery &Query) -> bool {
1204               return Query.Types[0].isVector() &&
1205                      needToSplitMemOp(Query, Op == G_LOAD);
1206             },
1207             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1208               const LLT DstTy = Query.Types[0];
1209               const LLT PtrTy = Query.Types[1];
1210 
1211               LLT EltTy = DstTy.getElementType();
1212               unsigned MaxSize = maxSizeForAddrSpace(ST,
1213                                                      PtrTy.getAddressSpace(),
1214                                                      Op == G_LOAD);
1215 
1216               // FIXME: Handle widened to power of 2 results better. This ends
1217               // up scalarizing.
1218               // FIXME: 3 element stores scalarized on SI
1219 
1220               // Split if it's too large for the address space.
1221               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1222               if (MemSize > MaxSize) {
1223                 unsigned NumElts = DstTy.getNumElements();
1224                 unsigned EltSize = EltTy.getSizeInBits();
1225 
1226                 if (MaxSize % EltSize == 0) {
1227                   return std::make_pair(
1228                       0, LLT::scalarOrVector(
1229                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1230                 }
1231 
1232                 unsigned NumPieces = MemSize / MaxSize;
1233 
1234                 // FIXME: Refine when odd breakdowns handled
1235                 // The scalars will need to be re-legalized.
1236                 if (NumPieces == 1 || NumPieces >= NumElts ||
1237                     NumElts % NumPieces != 0)
1238                   return std::make_pair(0, EltTy);
1239 
1240                 return std::make_pair(
1241                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1242               }
1243 
1244               // FIXME: We could probably handle weird extending loads better.
1245               if (DstTy.getSizeInBits() > MemSize)
1246                 return std::make_pair(0, EltTy);
1247 
1248               unsigned EltSize = EltTy.getSizeInBits();
1249               unsigned DstSize = DstTy.getSizeInBits();
1250               if (!isPowerOf2_32(DstSize)) {
1251                 // We're probably decomposing an odd sized store. Try to split
1252                 // to the widest type. TODO: Account for alignment. As-is it
1253                 // should be OK, since the new parts will be further legalized.
1254                 unsigned FloorSize = PowerOf2Floor(DstSize);
1255                 return std::make_pair(
1256                     0, LLT::scalarOrVector(
1257                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1258               }
1259 
1260               // Need to split because of alignment.
1261               unsigned Align = Query.MMODescrs[0].AlignInBits;
1262               if (EltSize > Align &&
1263                   (EltSize / Align < DstTy.getNumElements())) {
1264                 return std::make_pair(
1265                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1266               }
1267 
1268               // May need relegalization for the scalars.
1269               return std::make_pair(0, EltTy);
1270             })
1271     .minScalar(0, S32)
1272     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1273     .widenScalarToNextPow2(0)
1274     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1275     .lower();
1276   }
1277 
1278   // FIXME: Unaligned accesses not lowered.
1279   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1280                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1281                                                   {S32, GlobalPtr, S16, 2 * 8},
1282                                                   {S32, LocalPtr, S8, 8},
1283                                                   {S32, LocalPtr, S16, 16},
1284                                                   {S32, PrivatePtr, S8, 8},
1285                                                   {S32, PrivatePtr, S16, 16},
1286                                                   {S32, ConstantPtr, S8, 8},
1287                                                   {S32, ConstantPtr, S16, 2 * 8}})
1288                        .legalIf(
1289                          [=](const LegalityQuery &Query) -> bool {
1290                            return isLoadStoreLegal(ST, Query);
1291                          });
1292 
1293   if (ST.hasFlatAddressSpace()) {
1294     ExtLoads.legalForTypesWithMemDesc(
1295         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1296   }
1297 
1298   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1299   // 64-bits.
1300   //
1301   // TODO: Should generalize bitcast action into coerce, which will also cover
1302   // inserting addrspacecasts.
1303   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1304 
1305   ExtLoads.clampScalar(0, S32, S32)
1306           .widenScalarToNextPow2(0)
1307           .lower();
1308 
1309   auto &Atomics = getActionDefinitionsBuilder(
1310     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1311      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1312      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1313      G_ATOMICRMW_UMIN})
1314     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1315                {S64, GlobalPtr}, {S64, LocalPtr},
1316                {S32, RegionPtr}, {S64, RegionPtr}});
1317   if (ST.hasFlatAddressSpace()) {
1318     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1319   }
1320 
1321   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1322   if (ST.hasLDSFPAtomicAdd()) {
1323     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1324     if (ST.hasGFX90AInsts())
1325       Atomic.legalFor({{S64, LocalPtr}});
1326   }
1327   if (ST.hasAtomicFaddInsts())
1328     Atomic.legalFor({{S32, GlobalPtr}});
1329 
1330   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1331   // demarshalling
1332   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1333     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1334                 {S32, FlatPtr}, {S64, FlatPtr}})
1335     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1336                {S32, RegionPtr}, {S64, RegionPtr}});
1337   // TODO: Pointer types, any 32-bit or 64-bit vector
1338 
1339   // Condition should be s32 for scalar, s1 for vector.
1340   getActionDefinitionsBuilder(G_SELECT)
1341       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1342                                  LocalPtr, FlatPtr, PrivatePtr,
1343                                  LLT::fixed_vector(2, LocalPtr),
1344                                  LLT::fixed_vector(2, PrivatePtr)},
1345                                 {S1, S32})
1346       .clampScalar(0, S16, S64)
1347       .scalarize(1)
1348       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1349       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1350       .clampMaxNumElements(0, S32, 2)
1351       .clampMaxNumElements(0, LocalPtr, 2)
1352       .clampMaxNumElements(0, PrivatePtr, 2)
1353       .scalarize(0)
1354       .widenScalarToNextPow2(0)
1355       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1356 
1357   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1358   // be more flexible with the shift amount type.
1359   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1360     .legalFor({{S32, S32}, {S64, S32}});
1361   if (ST.has16BitInsts()) {
1362     if (ST.hasVOP3PInsts()) {
1363       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1364             .clampMaxNumElements(0, S16, 2);
1365     } else
1366       Shifts.legalFor({{S16, S16}});
1367 
1368     // TODO: Support 16-bit shift amounts for all types
1369     Shifts.widenScalarIf(
1370       [=](const LegalityQuery &Query) {
1371         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1372         // 32-bit amount.
1373         const LLT ValTy = Query.Types[0];
1374         const LLT AmountTy = Query.Types[1];
1375         return ValTy.getSizeInBits() <= 16 &&
1376                AmountTy.getSizeInBits() < 16;
1377       }, changeTo(1, S16));
1378     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1379     Shifts.clampScalar(1, S32, S32);
1380     Shifts.clampScalar(0, S16, S64);
1381     Shifts.widenScalarToNextPow2(0, 16);
1382 
1383     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1384       .minScalar(0, S16)
1385       .scalarize(0)
1386       .lower();
1387   } else {
1388     // Make sure we legalize the shift amount type first, as the general
1389     // expansion for the shifted type will produce much worse code if it hasn't
1390     // been truncated already.
1391     Shifts.clampScalar(1, S32, S32);
1392     Shifts.clampScalar(0, S32, S64);
1393     Shifts.widenScalarToNextPow2(0, 32);
1394 
1395     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1396       .minScalar(0, S32)
1397       .scalarize(0)
1398       .lower();
1399   }
1400   Shifts.scalarize(0);
1401 
1402   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1403     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1404     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1405     unsigned IdxTypeIdx = 2;
1406 
1407     getActionDefinitionsBuilder(Op)
1408       .customIf([=](const LegalityQuery &Query) {
1409           const LLT EltTy = Query.Types[EltTypeIdx];
1410           const LLT VecTy = Query.Types[VecTypeIdx];
1411           const LLT IdxTy = Query.Types[IdxTypeIdx];
1412           const unsigned EltSize = EltTy.getSizeInBits();
1413           return (EltSize == 32 || EltSize == 64) &&
1414                   VecTy.getSizeInBits() % 32 == 0 &&
1415                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1416                   IdxTy.getSizeInBits() == 32;
1417         })
1418       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1419                  bitcastToVectorElement32(VecTypeIdx))
1420       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1421       .bitcastIf(
1422         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1423         [=](const LegalityQuery &Query) {
1424           // For > 64-bit element types, try to turn this into a 64-bit
1425           // element vector since we may be able to do better indexing
1426           // if this is scalar. If not, fall back to 32.
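          // E.g. a vector of s128 elements such as <2 x s128> becomes
          // <4 x s64>, while <2 x s96> (element size not a multiple of 64)
          // becomes <6 x s32>.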
1427           const LLT EltTy = Query.Types[EltTypeIdx];
1428           const LLT VecTy = Query.Types[VecTypeIdx];
1429           const unsigned DstEltSize = EltTy.getSizeInBits();
1430           const unsigned VecSize = VecTy.getSizeInBits();
1431 
1432           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1433           return std::make_pair(
1434               VecTypeIdx,
1435               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1436         })
1437       .clampScalar(EltTypeIdx, S32, S64)
1438       .clampScalar(VecTypeIdx, S32, S64)
1439       .clampScalar(IdxTypeIdx, S32, S32)
1440       .clampMaxNumElements(VecTypeIdx, S32, 32)
1441       // TODO: Clamp elements for 64-bit vectors?
1442       // It should only be necessary with variable indexes.
1443       // As a last resort, lower to the stack
1444       .lower();
1445   }
1446 
1447   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1448     .unsupportedIf([=](const LegalityQuery &Query) {
1449         const LLT &EltTy = Query.Types[1].getElementType();
1450         return Query.Types[0] != EltTy;
1451       });
1452 
1453   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1454     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1455     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1456 
1457     // FIXME: Doesn't handle extract of illegal sizes.
1458     getActionDefinitionsBuilder(Op)
1459       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1460       // FIXME: Multiples of 16 should not be legal.
1461       .legalIf([=](const LegalityQuery &Query) {
1462           const LLT BigTy = Query.Types[BigTyIdx];
1463           const LLT LitTy = Query.Types[LitTyIdx];
1464           return (BigTy.getSizeInBits() % 32 == 0) &&
1465                  (LitTy.getSizeInBits() % 16 == 0);
1466         })
1467       .widenScalarIf(
1468         [=](const LegalityQuery &Query) {
1469           const LLT BigTy = Query.Types[BigTyIdx];
1470           return (BigTy.getScalarSizeInBits() < 16);
1471         },
1472         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1473       .widenScalarIf(
1474         [=](const LegalityQuery &Query) {
1475           const LLT LitTy = Query.Types[LitTyIdx];
1476           return (LitTy.getScalarSizeInBits() < 16);
1477         },
1478         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1479       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1480       .widenScalarToNextPow2(BigTyIdx, 32);
1481 
1482   }
1483 
1484   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1485     .legalForCartesianProduct(AllS32Vectors, {S32})
1486     .legalForCartesianProduct(AllS64Vectors, {S64})
1487     .clampNumElements(0, V16S32, V32S32)
1488     .clampNumElements(0, V2S64, V16S64)
1489     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1490 
1491   if (ST.hasScalarPackInsts()) {
1492     BuildVector
1493       // FIXME: Should probably widen s1 vectors straight to s32
1494       .minScalarOrElt(0, S16)
1495       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1496       .minScalar(1, S32);
1497 
1498     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1499       .legalFor({V2S16, S32})
1500       .lower();
1501     BuildVector.minScalarOrElt(0, S32);
1502   } else {
1503     BuildVector.customFor({V2S16, S16});
1504     BuildVector.minScalarOrElt(0, S32);
1505 
1506     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1507       .customFor({V2S16, S32})
1508       .lower();
1509   }
1510 
1511   BuildVector.legalIf(isRegisterType(0));
1512 
1513   // FIXME: Clamp maximum size
1514   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1515     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1516     .clampMaxNumElements(0, S32, 32)
1517     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1518     .clampMaxNumElements(0, S16, 64);
1519 
1520   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1521   // pre-legalize.
1522   if (ST.hasVOP3PInsts()) {
1523     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1524       .customFor({V2S16, V2S16})
1525       .lower();
1526   } else
1527     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1528 
1529   // Merge/Unmerge
1530   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1531     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1532     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1533 
1534     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1535       const LLT Ty = Query.Types[TypeIdx];
1536       if (Ty.isVector()) {
1537         const LLT &EltTy = Ty.getElementType();
1538         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1539           return true;
1540         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1541           return true;
1542       }
1543       return false;
1544     };
1545 
1546     auto &Builder = getActionDefinitionsBuilder(Op)
1547       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1548       .lowerFor({{S16, V2S16}})
1549       .lowerIf([=](const LegalityQuery &Query) {
1550           const LLT BigTy = Query.Types[BigTyIdx];
1551           return BigTy.getSizeInBits() == 32;
1552         })
1553       // Try to widen to s16 first for small types.
1554       // TODO: Only do this on targets with legal s16 shifts
1555       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1556       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1557       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1558       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1559                            elementTypeIs(1, S16)),
1560                        changeTo(1, V2S16))
1561       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1562       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1563       // valid.
1564       .clampScalar(LitTyIdx, S32, S512)
1565       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1566       // Break up vectors with weird elements into scalars
1567       .fewerElementsIf(
1568         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1569         scalarize(0))
1570       .fewerElementsIf(
1571         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1572         scalarize(1))
1573       .clampScalar(BigTyIdx, S32, MaxScalar);
1574 
1575     if (Op == G_MERGE_VALUES) {
1576       Builder.widenScalarIf(
1577         // TODO: Use 16-bit shifts if legal for 8-bit values?
1578         [=](const LegalityQuery &Query) {
1579           const LLT Ty = Query.Types[LitTyIdx];
1580           return Ty.getSizeInBits() < 32;
1581         },
1582         changeTo(LitTyIdx, S32));
1583     }
1584 
1585     Builder.widenScalarIf(
1586       [=](const LegalityQuery &Query) {
1587         const LLT Ty = Query.Types[BigTyIdx];
1588         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1589           Ty.getSizeInBits() % 16 != 0;
1590       },
1591       [=](const LegalityQuery &Query) {
1592         // Pick the next power of 2 or, above 128, a multiple of 64,
1593         // whichever is smaller.
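        // E.g. s100 widens to s128 (the next power of 2), while s300 widens
        // to s320 (alignTo<64>(301)) rather than all the way to s512.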
1594         const LLT &Ty = Query.Types[BigTyIdx];
1595         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1596         if (NewSizeInBits >= 256) {
1597           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1598           if (RoundedTo < NewSizeInBits)
1599             NewSizeInBits = RoundedTo;
1600         }
1601         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1602       })
1603       // Any vectors left are the wrong size. Scalarize them.
1604       .scalarize(0)
1605       .scalarize(1);
1606   }
1607 
1608   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1609   // RegBankSelect.
1610   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1611     .legalFor({{S32}, {S64}});
1612 
1613   if (ST.hasVOP3PInsts()) {
1614     SextInReg.lowerFor({{V2S16}})
1615       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1616       // get more vector shift opportunities, since we'll get those when
1617       // expanded.
1618       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1619   } else if (ST.has16BitInsts()) {
1620     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1621   } else {
1622     // Prefer to promote to s32 before lowering if we don't have 16-bit
1623     // shifts. This avoids a lot of intermediate truncate and extend operations.
1624     SextInReg.lowerFor({{S32}, {S64}});
1625   }
1626 
1627   SextInReg
1628     .scalarize(0)
1629     .clampScalar(0, S32, S64)
1630     .lower();
1631 
1632   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1633     .scalarize(0)
1634     .lower();
1635 
1636   // TODO: Only try to form v2s16 with legal packed instructions.
1637   getActionDefinitionsBuilder(G_FSHR)
1638     .legalFor({{S32, S32}})
1639     .lowerFor({{V2S16, V2S16}})
1640     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641     .scalarize(0)
1642     .lower();
1643 
1644   if (ST.hasVOP3PInsts()) {
1645     getActionDefinitionsBuilder(G_FSHL)
1646       .lowerFor({{V2S16, V2S16}})
1647       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1648       .scalarize(0)
1649       .lower();
1650   } else {
1651     getActionDefinitionsBuilder(G_FSHL)
1652       .scalarize(0)
1653       .lower();
1654   }
1655 
1656   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1657     .legalFor({S64});
1658 
1659   getActionDefinitionsBuilder(G_FENCE)
1660     .alwaysLegal();
1661 
1662   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1663       .scalarize(0)
1664       .minScalar(0, S32)
1665       .lower();
1666 
1667   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1668       .legalFor({{S32, S32}, {S64, S32}})
1669       .clampScalar(1, S32, S32)
1670       .clampScalar(0, S32, S64)
1671       .widenScalarToNextPow2(0)
1672       .scalarize(0);
1673 
1674   getActionDefinitionsBuilder({
1675       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1676       G_FCOPYSIGN,
1677 
1678       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1679       G_ATOMICRMW_NAND,
1680       G_ATOMICRMW_FSUB,
1681       G_READ_REGISTER,
1682       G_WRITE_REGISTER,
1683 
1684       G_SADDO, G_SSUBO,
1685 
1686        // TODO: Implement
1687       G_FMINIMUM, G_FMAXIMUM}).lower();
1688 
1689   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1690       .lower();
1691 
1692   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1693         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1694         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1695     .unsupported();
1696 
1697   getLegacyLegalizerInfo().computeTables();
1698   verify(*ST.getInstrInfo());
1699 }
1700 
1701 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1702                                          MachineInstr &MI) const {
1703   MachineIRBuilder &B = Helper.MIRBuilder;
1704   MachineRegisterInfo &MRI = *B.getMRI();
1705 
1706   switch (MI.getOpcode()) {
1707   case TargetOpcode::G_ADDRSPACE_CAST:
1708     return legalizeAddrSpaceCast(MI, MRI, B);
1709   case TargetOpcode::G_FRINT:
1710     return legalizeFrint(MI, MRI, B);
1711   case TargetOpcode::G_FCEIL:
1712     return legalizeFceil(MI, MRI, B);
1713   case TargetOpcode::G_FREM:
1714     return legalizeFrem(MI, MRI, B);
1715   case TargetOpcode::G_INTRINSIC_TRUNC:
1716     return legalizeIntrinsicTrunc(MI, MRI, B);
1717   case TargetOpcode::G_SITOFP:
1718     return legalizeITOFP(MI, MRI, B, true);
1719   case TargetOpcode::G_UITOFP:
1720     return legalizeITOFP(MI, MRI, B, false);
1721   case TargetOpcode::G_FPTOSI:
1722     return legalizeFPTOI(MI, MRI, B, true);
1723   case TargetOpcode::G_FPTOUI:
1724     return legalizeFPTOI(MI, MRI, B, false);
1725   case TargetOpcode::G_FMINNUM:
1726   case TargetOpcode::G_FMAXNUM:
1727   case TargetOpcode::G_FMINNUM_IEEE:
1728   case TargetOpcode::G_FMAXNUM_IEEE:
1729     return legalizeMinNumMaxNum(Helper, MI);
1730   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1731     return legalizeExtractVectorElt(MI, MRI, B);
1732   case TargetOpcode::G_INSERT_VECTOR_ELT:
1733     return legalizeInsertVectorElt(MI, MRI, B);
1734   case TargetOpcode::G_SHUFFLE_VECTOR:
1735     return legalizeShuffleVector(MI, MRI, B);
1736   case TargetOpcode::G_FSIN:
1737   case TargetOpcode::G_FCOS:
1738     return legalizeSinCos(MI, MRI, B);
1739   case TargetOpcode::G_GLOBAL_VALUE:
1740     return legalizeGlobalValue(MI, MRI, B);
1741   case TargetOpcode::G_LOAD:
1742   case TargetOpcode::G_SEXTLOAD:
1743   case TargetOpcode::G_ZEXTLOAD:
1744     return legalizeLoad(Helper, MI);
1745   case TargetOpcode::G_FMAD:
1746     return legalizeFMad(MI, MRI, B);
1747   case TargetOpcode::G_FDIV:
1748     return legalizeFDIV(MI, MRI, B);
1749   case TargetOpcode::G_UDIV:
1750   case TargetOpcode::G_UREM:
1751   case TargetOpcode::G_UDIVREM:
1752     return legalizeUnsignedDIV_REM(MI, MRI, B);
1753   case TargetOpcode::G_SDIV:
1754   case TargetOpcode::G_SREM:
1755   case TargetOpcode::G_SDIVREM:
1756     return legalizeSignedDIV_REM(MI, MRI, B);
1757   case TargetOpcode::G_ATOMIC_CMPXCHG:
1758     return legalizeAtomicCmpXChg(MI, MRI, B);
1759   case TargetOpcode::G_FLOG:
1760     return legalizeFlog(MI, B, numbers::ln2f);
1761   case TargetOpcode::G_FLOG10:
1762     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1763   case TargetOpcode::G_FEXP:
1764     return legalizeFExp(MI, B);
1765   case TargetOpcode::G_FPOW:
1766     return legalizeFPow(MI, B);
1767   case TargetOpcode::G_FFLOOR:
1768     return legalizeFFloor(MI, MRI, B);
1769   case TargetOpcode::G_BUILD_VECTOR:
1770     return legalizeBuildVector(MI, MRI, B);
1771   case TargetOpcode::G_CTLZ:
1772   case TargetOpcode::G_CTTZ:
1773     return legalizeCTLZ_CTTZ(MI, MRI, B);
1774   default:
1775     return false;
1776   }
1777 
1778   llvm_unreachable("expected switch to return");
1779 }
1780 
1781 Register AMDGPULegalizerInfo::getSegmentAperture(
1782   unsigned AS,
1783   MachineRegisterInfo &MRI,
1784   MachineIRBuilder &B) const {
1785   MachineFunction &MF = B.getMF();
1786   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1787   const LLT S32 = LLT::scalar(32);
1788 
1789   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1790 
1791   if (ST.hasApertureRegs()) {
1792     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1793     // getreg.
1794     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1795         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1796         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1797     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1798         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1799         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1800     unsigned Encoding =
1801         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1802         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1803         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1804 
1805     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1806 
1807     B.buildInstr(AMDGPU::S_GETREG_B32)
1808       .addDef(GetReg)
1809       .addImm(Encoding);
1810     MRI.setType(GetReg, S32);
1811 
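    // The 16-bit getreg field holds bits [31:16] of the 32-bit aperture (the
    // high dword of the segment's 64-bit flat base address); shifting the
    // result left by WidthM1 + 1 (16) reconstructs the full aperture value.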
1812     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1813     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1814   }
1815 
1816   Register QueuePtr = MRI.createGenericVirtualRegister(
1817     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1818 
1819   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1820     return Register();
1821 
1822   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1823   // private_segment_aperture_base_hi.
1824   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1825 
1826   // TODO: can we be smarter about machine pointer info?
1827   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1828   MachineMemOperand *MMO = MF.getMachineMemOperand(
1829       PtrInfo,
1830       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1831           MachineMemOperand::MOInvariant,
1832       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1833 
1834   Register LoadAddr;
1835 
1836   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1837   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1838 }
1839 
1840 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1841   MachineInstr &MI, MachineRegisterInfo &MRI,
1842   MachineIRBuilder &B) const {
1843   MachineFunction &MF = B.getMF();
1844 
1845   const LLT S32 = LLT::scalar(32);
1846   Register Dst = MI.getOperand(0).getReg();
1847   Register Src = MI.getOperand(1).getReg();
1848 
1849   LLT DstTy = MRI.getType(Dst);
1850   LLT SrcTy = MRI.getType(Src);
1851   unsigned DestAS = DstTy.getAddressSpace();
1852   unsigned SrcAS = SrcTy.getAddressSpace();
1853 
1854   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1855   // vector element.
1856   assert(!DstTy.isVector());
1857 
1858   const AMDGPUTargetMachine &TM
1859     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1860 
1861   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1862     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1863     return true;
1864   }
1865 
1866   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1867     // Truncate.
1868     B.buildExtract(Dst, Src, 0);
1869     MI.eraseFromParent();
1870     return true;
1871   }
1872 
1873   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1874     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1875     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1876 
1877     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1878     // another. Merge operands are required to be the same type, but creating an
1879     // extra ptrtoint would be kind of pointless.
1880     auto HighAddr = B.buildConstant(
1881       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1882     B.buildMerge(Dst, {Src, HighAddr});
1883     MI.eraseFromParent();
1884     return true;
1885   }
1886 
1887   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1888     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1889            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1890     unsigned NullVal = TM.getNullPointerValue(DestAS);
1891 
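    // A null flat pointer must map to the segment address space's null value
    // rather than just its truncated low 32 bits, hence the compare against
    // flat null and the select below.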
1892     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1893     auto FlatNull = B.buildConstant(SrcTy, 0);
1894 
1895     // Extract low 32-bits of the pointer.
1896     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1897 
1898     auto CmpRes =
1899         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1900     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1901 
1902     MI.eraseFromParent();
1903     return true;
1904   }
1905 
1906   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1907     return false;
1908 
1909   if (!ST.hasFlatAddressSpace())
1910     return false;
1911 
1912   auto SegmentNull =
1913       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1914   auto FlatNull =
1915       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1916 
1917   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1918   if (!ApertureReg.isValid())
1919     return false;
1920 
1921   auto CmpRes =
1922       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1923 
1924   // Coerce the type of the low half of the result so we can use merge_values.
1925   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1926 
1927   // TODO: Should we allow mismatched types but matching sizes in merges to
1928   // avoid the ptrtoint?
1929   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1930   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1931 
1932   MI.eraseFromParent();
1933   return true;
1934 }
1935 
1936 bool AMDGPULegalizerInfo::legalizeFrint(
1937   MachineInstr &MI, MachineRegisterInfo &MRI,
1938   MachineIRBuilder &B) const {
1939   Register Src = MI.getOperand(1).getReg();
1940   LLT Ty = MRI.getType(Src);
1941   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1942 
1943   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1944   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1945 
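  // Adding and then subtracting 2^52 (carrying the sign of the input) forces
  // rounding to an integer, since doubles with magnitude >= 2^52 have no
  // fraction bits. Inputs whose magnitude exceeds C2Val are already integral
  // and are passed through unchanged by the final select.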
1946   auto C1 = B.buildFConstant(Ty, C1Val);
1947   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1948 
1949   // TODO: Should this propagate fast-math-flags?
1950   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1951   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1952 
1953   auto C2 = B.buildFConstant(Ty, C2Val);
1954   auto Fabs = B.buildFAbs(Ty, Src);
1955 
1956   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1957   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1958   MI.eraseFromParent();
1959   return true;
1960 }
1961 
1962 bool AMDGPULegalizerInfo::legalizeFceil(
1963   MachineInstr &MI, MachineRegisterInfo &MRI,
1964   MachineIRBuilder &B) const {
1965 
1966   const LLT S1 = LLT::scalar(1);
1967   const LLT S64 = LLT::scalar(64);
1968 
1969   Register Src = MI.getOperand(1).getReg();
1970   assert(MRI.getType(Src) == S64);
1971 
1972   // result = trunc(src)
1973   // if (src > 0.0 && src != result)
1974   //   result += 1.0
1975 
1976   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1977 
1978   const auto Zero = B.buildFConstant(S64, 0.0);
1979   const auto One = B.buildFConstant(S64, 1.0);
1980   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1981   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1982   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1983   auto Add = B.buildSelect(S64, And, One, Zero);
1984 
1985   // TODO: Should this propagate fast-math-flags?
1986   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1987   return true;
1988 }
1989 
1990 bool AMDGPULegalizerInfo::legalizeFrem(
1991   MachineInstr &MI, MachineRegisterInfo &MRI,
1992   MachineIRBuilder &B) const {
1993     Register DstReg = MI.getOperand(0).getReg();
1994     Register Src0Reg = MI.getOperand(1).getReg();
1995     Register Src1Reg = MI.getOperand(2).getReg();
1996     auto Flags = MI.getFlags();
1997     LLT Ty = MRI.getType(DstReg);
1998 
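    // frem(x, y) is expanded as x - trunc(x / y) * y, written here as
    // fma(-trunc(x / y), y, x) so it folds into a single FMA.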
1999     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2000     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2001     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2002     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2003     MI.eraseFromParent();
2004     return true;
2005 }
2006 
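// In an IEEE-754 double, bit 63 is the sign, bits [62:52] are the biased
// exponent (bias 1023) and bits [51:0] are the fraction. Given only the high
// half of the value, ubfe with offset FractBits - 32 = 20 and width
// ExpBits = 11 extracts the exponent field; subtracting 1023 removes the bias.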
2007 static MachineInstrBuilder extractF64Exponent(Register Hi,
2008                                               MachineIRBuilder &B) {
2009   const unsigned FractBits = 52;
2010   const unsigned ExpBits = 11;
2011   LLT S32 = LLT::scalar(32);
2012 
2013   auto Const0 = B.buildConstant(S32, FractBits - 32);
2014   auto Const1 = B.buildConstant(S32, ExpBits);
2015 
2016   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2017     .addUse(Hi)
2018     .addUse(Const0.getReg(0))
2019     .addUse(Const1.getReg(0));
2020 
2021   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2022 }
2023 
2024 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2025   MachineInstr &MI, MachineRegisterInfo &MRI,
2026   MachineIRBuilder &B) const {
2027   const LLT S1 = LLT::scalar(1);
2028   const LLT S32 = LLT::scalar(32);
2029   const LLT S64 = LLT::scalar(64);
2030 
2031   Register Src = MI.getOperand(1).getReg();
2032   assert(MRI.getType(Src) == S64);
2033 
2034   // TODO: Should this use extract since the low half is unused?
2035   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2036   Register Hi = Unmerge.getReg(1);
2037 
2038   // Extract the upper half, since this is where we will find the sign and
2039   // exponent.
2040   auto Exp = extractF64Exponent(Hi, B);
2041 
2042   const unsigned FractBits = 52;
2043 
2044   // Extract the sign bit.
2045   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2046   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2047 
2048   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2049 
2050   const auto Zero32 = B.buildConstant(S32, 0);
2051 
2052   // Extend back to 64-bits.
2053   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2054 
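  // Shifting the fraction mask right by the unbiased exponent leaves set bits
  // only below the binary point; clearing those bits in the source truncates
  // it. An exponent < 0 yields just the signed zero, and an exponent > 51
  // means the value is already integral, so it is returned unchanged.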
2055   auto Shr = B.buildAShr(S64, FractMask, Exp);
2056   auto Not = B.buildNot(S64, Shr);
2057   auto Tmp0 = B.buildAnd(S64, Src, Not);
2058   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2059 
2060   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2061   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2062 
2063   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2064   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2065   MI.eraseFromParent();
2066   return true;
2067 }
2068 
2069 bool AMDGPULegalizerInfo::legalizeITOFP(
2070   MachineInstr &MI, MachineRegisterInfo &MRI,
2071   MachineIRBuilder &B, bool Signed) const {
2072 
2073   Register Dst = MI.getOperand(0).getReg();
2074   Register Src = MI.getOperand(1).getReg();
2075 
2076   const LLT S64 = LLT::scalar(64);
2077   const LLT S32 = LLT::scalar(32);
2078 
2079   assert(MRI.getType(Src) == S64);
2080 
2081   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2082   auto ThirtyTwo = B.buildConstant(S32, 32);
2083 
2084   if (MRI.getType(Dst) == S64) {
2085     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2086                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2087 
2088     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2089     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2090                      .addUse(CvtHi.getReg(0))
2091                      .addUse(ThirtyTwo.getReg(0));
2092 
2093     // TODO: Should this propagate fast-math-flags?
2094     B.buildFAdd(Dst, LdExp, CvtLo);
2095     MI.eraseFromParent();
2096     return true;
2097   }
2098 
2099   assert(MRI.getType(Dst) == S32);
2100 
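  // For the s64 -> s32 conversion, normalize the input by shifting out the
  // redundant leading sign/zero bits, convert the high 32 bits (with a sticky
  // bit OR'd in from the discarded low half so rounding stays correct), and
  // rescale the result with ldexp by 32 - shift.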
2101   auto One = B.buildConstant(S32, 1);
2102 
2103   MachineInstrBuilder ShAmt;
2104   if (Signed) {
2105     auto ThirtyOne = B.buildConstant(S32, 31);
2106     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2107     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2108     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2109     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2110                                /*HasSideEffects=*/false)
2111                   .addUse(Unmerge.getReg(1));
2112     auto LS2 = B.buildSub(S32, LS, One);
2113     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2114   } else
2115     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2116   auto Norm = B.buildShl(S64, Src, ShAmt);
2117   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2118   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2119   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2120   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2121   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2122   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2123                    /*HasSideEffects=*/false)
2124       .addUse(FVal.getReg(0))
2125       .addUse(Scale.getReg(0));
2126   MI.eraseFromParent();
2127   return true;
2128 }
2129 
2130 // TODO: Copied from DAG implementation. Verify logic and document how this
2131 // actually works.
2132 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2133                                         MachineRegisterInfo &MRI,
2134                                         MachineIRBuilder &B,
2135                                         bool Signed) const {
2136 
2137   Register Dst = MI.getOperand(0).getReg();
2138   Register Src = MI.getOperand(1).getReg();
2139 
2140   const LLT S64 = LLT::scalar(64);
2141   const LLT S32 = LLT::scalar(32);
2142 
2143   const LLT SrcLT = MRI.getType(Src);
2144   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2145 
2146   unsigned Flags = MI.getFlags();
2147 
2148   // The basic idea of converting a floating point number into a pair of 32-bit
2149   // integers is illustrated as follows:
2150   //
2151   //     tf := trunc(val);
2152   //    hif := floor(tf * 2^-32);
2153   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2154   //     hi := fptoi(hif);
2155   //     lo := fptoi(lof);
2156   //
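  // For example, with val = 5.0e9 (assuming exact arithmetic):
  //     tf  = 5000000000.0
  //    hif = floor(5000000000 * 2^-32) = 1.0
  //    lof = 5000000000 - 1 * 2^32    = 705032704.0
  // so hi = 1, lo = 705032704, and (hi << 32) | lo == 5000000000.
  //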
2157   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2158   MachineInstrBuilder Sign;
2159   if (Signed && SrcLT == S32) {
2160     // However, a 32-bit floating point number has only a 23-bit mantissa,
2161     // which is not enough to hold all the significant bits of `lof` if val is
2162     // negative. To avoid the loss of precision, we need to take the absolute
2163     // value after truncating and flip the result back based on the original
2164     // signedness.
2165     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2166     Trunc = B.buildFAbs(S32, Trunc, Flags);
2167   }
2168   MachineInstrBuilder K0, K1;
2169   if (SrcLT == S64) {
2170     K0 = B.buildFConstant(S64,
2171                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2172     K1 = B.buildFConstant(S64,
2173                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2174   } else {
2175     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2176     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2177   }
2178 
2179   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2180   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2181   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2182 
2183   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2184                                      : B.buildFPTOUI(S32, FloorMul);
2185   auto Lo = B.buildFPTOUI(S32, Fma);
2186 
2187   if (Signed && SrcLT == S32) {
2188     // Flip the result based on the signedness, which is either all 0s or 1s.
2189     Sign = B.buildMerge(S64, {Sign, Sign});
2190     // r := xor({lo, hi}, sign) - sign;
2191     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2192   } else
2193     B.buildMerge(Dst, {Lo, Hi});
2194   MI.eraseFromParent();
2195 
2196   return true;
2197 }
2198 
2199 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2200                                                MachineInstr &MI) const {
2201   MachineFunction &MF = Helper.MIRBuilder.getMF();
2202   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2203 
2204   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2205                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2206 
2207   // With ieee_mode disabled, the instructions have the correct behavior
2208   // already for G_FMINNUM/G_FMAXNUM
2209   if (!MFI->getMode().IEEE)
2210     return !IsIEEEOp;
2211 
2212   if (IsIEEEOp)
2213     return true;
2214 
2215   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2216 }
2217 
2218 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2219   MachineInstr &MI, MachineRegisterInfo &MRI,
2220   MachineIRBuilder &B) const {
2221   // TODO: Should move some of this into LegalizerHelper.
2222 
2223   // TODO: Promote dynamic indexing of s16 to s32
2224 
2225   // FIXME: Artifact combiner probably should have replaced the truncated
2226   // constant before this, so we shouldn't need
2227   // getIConstantVRegValWithLookThrough.
2228   Optional<ValueAndVReg> MaybeIdxVal =
2229       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2230   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2231     return true;
2232   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2233 
2234   Register Dst = MI.getOperand(0).getReg();
2235   Register Vec = MI.getOperand(1).getReg();
2236 
2237   LLT VecTy = MRI.getType(Vec);
2238   LLT EltTy = VecTy.getElementType();
2239   assert(EltTy == MRI.getType(Dst));
2240 
2241   if (IdxVal < VecTy.getNumElements())
2242     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2243   else
2244     B.buildUndef(Dst);
2245 
2246   MI.eraseFromParent();
2247   return true;
2248 }
2249 
2250 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2251   MachineInstr &MI, MachineRegisterInfo &MRI,
2252   MachineIRBuilder &B) const {
2253   // TODO: Should move some of this into LegalizerHelper.
2254 
2255   // TODO: Promote dynamic indexing of s16 to s32
2256 
2257   // FIXME: Artifact combiner probably should have replaced the truncated
2258   // constant before this, so we shouldn't need
2259   // getIConstantVRegValWithLookThrough.
2260   Optional<ValueAndVReg> MaybeIdxVal =
2261       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2262   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2263     return true;
2264 
2265   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2266   Register Dst = MI.getOperand(0).getReg();
2267   Register Vec = MI.getOperand(1).getReg();
2268   Register Ins = MI.getOperand(2).getReg();
2269 
2270   LLT VecTy = MRI.getType(Vec);
2271   LLT EltTy = VecTy.getElementType();
2272   assert(EltTy == MRI.getType(Ins));
2273 
2274   if (IdxVal < VecTy.getNumElements())
2275     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2276   else
2277     B.buildUndef(Dst);
2278 
2279   MI.eraseFromParent();
2280   return true;
2281 }
2282 
2283 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2284   MachineInstr &MI, MachineRegisterInfo &MRI,
2285   MachineIRBuilder &B) const {
2286   const LLT V2S16 = LLT::fixed_vector(2, 16);
2287 
2288   Register Dst = MI.getOperand(0).getReg();
2289   Register Src0 = MI.getOperand(1).getReg();
2290   LLT DstTy = MRI.getType(Dst);
2291   LLT SrcTy = MRI.getType(Src0);
2292 
2293   if (SrcTy == V2S16 && DstTy == V2S16 &&
2294       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2295     return true;
2296 
2297   MachineIRBuilder HelperBuilder(MI);
2298   GISelObserverWrapper DummyObserver;
2299   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2300   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2301 }
2302 
2303 bool AMDGPULegalizerInfo::legalizeSinCos(
2304   MachineInstr &MI, MachineRegisterInfo &MRI,
2305   MachineIRBuilder &B) const {
2306 
2307   Register DstReg = MI.getOperand(0).getReg();
2308   Register SrcReg = MI.getOperand(1).getReg();
2309   LLT Ty = MRI.getType(DstReg);
2310   unsigned Flags = MI.getFlags();
2311 
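  // The amdgcn sin/cos intrinsics take their input in units of full
  // revolutions rather than radians, so scale by 1/(2*pi) first. On
  // subtargets where the hardware only accepts a reduced input range, the
  // scaled value is additionally wrapped into [0, 1) with amdgcn.fract.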
2312   Register TrigVal;
2313   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2314   if (ST.hasTrigReducedRange()) {
2315     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2316     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2317       .addUse(MulVal.getReg(0))
2318       .setMIFlags(Flags).getReg(0);
2319   } else
2320     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2321 
2322   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2323     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2324   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2325     .addUse(TrigVal)
2326     .setMIFlags(Flags);
2327   MI.eraseFromParent();
2328   return true;
2329 }
2330 
2331 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2332                                                   MachineIRBuilder &B,
2333                                                   const GlobalValue *GV,
2334                                                   int64_t Offset,
2335                                                   unsigned GAFlags) const {
2336   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2337   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2338   // to the following code sequence:
2339   //
2340   // For constant address space:
2341   //   s_getpc_b64 s[0:1]
2342   //   s_add_u32 s0, s0, $symbol
2343   //   s_addc_u32 s1, s1, 0
2344   //
2345   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2346   //   a fixup or relocation is emitted to replace $symbol with a literal
2347   //   constant, which is a pc-relative offset from the encoding of the $symbol
2348   //   operand to the global variable.
2349   //
2350   // For global address space:
2351   //   s_getpc_b64 s[0:1]
2352   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2353   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2354   //
2355   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2356   //   fixups or relocations are emitted to replace $symbol@*@lo and
2357   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2358   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2359   //   operand to the global variable.
2360   //
2361   // What we want here is an offset from the value returned by s_getpc
2362   // (which is the address of the s_add_u32 instruction) to the global
2363   // variable, but since the encoding of $symbol starts 4 bytes after the start
2364   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2365   // small. This requires us to add 4 to the global variable offset in order to
2366   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2367   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2368   // instruction.
2369 
2370   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2371 
2372   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2373     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2374 
2375   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2376     .addDef(PCReg);
2377 
2378   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2379   if (GAFlags == SIInstrInfo::MO_NONE)
2380     MIB.addImm(0);
2381   else
2382     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2383 
2384   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2385 
2386   if (PtrTy.getSizeInBits() == 32)
2387     B.buildExtract(DstReg, PCReg, 0);
2388   return true;
2389 }
2390 
2391 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2392   MachineInstr &MI, MachineRegisterInfo &MRI,
2393   MachineIRBuilder &B) const {
2394   Register DstReg = MI.getOperand(0).getReg();
2395   LLT Ty = MRI.getType(DstReg);
2396   unsigned AS = Ty.getAddressSpace();
2397 
2398   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2399   MachineFunction &MF = B.getMF();
2400   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2401 
2402   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2403     if (!MFI->isModuleEntryFunction() &&
2404         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2405       const Function &Fn = MF.getFunction();
2406       DiagnosticInfoUnsupported BadLDSDecl(
2407         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2408         DS_Warning);
2409       Fn.getContext().diagnose(BadLDSDecl);
2410 
2411       // We currently don't have a way to correctly allocate LDS objects that
2412       // aren't directly associated with a kernel. We do force inlining of
2413       // functions that use local objects. However, if these dead functions are
2414       // not eliminated, we don't want a compile time error. Just emit a warning
2415       // and a trap, since there should be no callable path here.
2416       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2417       B.buildUndef(DstReg);
2418       MI.eraseFromParent();
2419       return true;
2420     }
2421 
2422     // TODO: We could emit code to handle the initialization somewhere.
2423     // We ignore the initializer for now and legalize it to allow selection.
2424     // The initializer will be rejected with an error during assembly emission anyway.
2425     const SITargetLowering *TLI = ST.getTargetLowering();
2426     if (!TLI->shouldUseLDSConstAddress(GV)) {
2427       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2428       return true; // Leave in place.
2429     }
2430 
2431     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2432       Type *Ty = GV->getValueType();
2433       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2434       // zero-sized type in other languages to declare dynamic shared memory
2435       // whose size is not known at compile time. Such variables are
2436       // allocated by the runtime and placed directly after the statically
2437       // allocated ones, and they all share the same offset.
2438       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2439         // Adjust alignment for that dynamic shared memory array.
2440         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2441         LLT S32 = LLT::scalar(32);
2442         auto Sz =
2443             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2444         B.buildIntToPtr(DstReg, Sz);
2445         MI.eraseFromParent();
2446         return true;
2447       }
2448     }
2449 
2450     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2451                                                    *cast<GlobalVariable>(GV)));
2452     MI.eraseFromParent();
2453     return true;
2454   }
2455 
2456   const SITargetLowering *TLI = ST.getTargetLowering();
2457 
2458   if (TLI->shouldEmitFixup(GV)) {
2459     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2460     MI.eraseFromParent();
2461     return true;
2462   }
2463 
2464   if (TLI->shouldEmitPCReloc(GV)) {
2465     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2466     MI.eraseFromParent();
2467     return true;
2468   }
2469 
2470   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2471   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2472 
2473   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2474   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2475       MachinePointerInfo::getGOT(MF),
2476       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2477           MachineMemOperand::MOInvariant,
2478       LoadTy, Align(8));
2479 
2480   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2481 
2482   if (Ty.getSizeInBits() == 32) {
2483     // Truncate if this is a 32-bit constant address.
2484     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2485     B.buildExtract(DstReg, Load, 0);
2486   } else
2487     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2488 
2489   MI.eraseFromParent();
2490   return true;
2491 }
2492 
2493 static LLT widenToNextPowerOf2(LLT Ty) {
2494   if (Ty.isVector())
2495     return Ty.changeElementCount(
2496         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2497   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2498 }
2499 
2500 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2501                                        MachineInstr &MI) const {
2502   MachineIRBuilder &B = Helper.MIRBuilder;
2503   MachineRegisterInfo &MRI = *B.getMRI();
2504   GISelChangeObserver &Observer = Helper.Observer;
2505 
2506   Register PtrReg = MI.getOperand(1).getReg();
2507   LLT PtrTy = MRI.getType(PtrReg);
2508   unsigned AddrSpace = PtrTy.getAddressSpace();
2509 
2510   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2511     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2512     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2513     Observer.changingInstr(MI);
2514     MI.getOperand(1).setReg(Cast.getReg(0));
2515     Observer.changedInstr(MI);
2516     return true;
2517   }
2518 
2519   if (MI.getOpcode() != AMDGPU::G_LOAD)
2520     return false;
2521 
2522   Register ValReg = MI.getOperand(0).getReg();
2523   LLT ValTy = MRI.getType(ValReg);
2524 
2525   MachineMemOperand *MMO = *MI.memoperands_begin();
2526   const unsigned ValSize = ValTy.getSizeInBits();
2527   const LLT MemTy = MMO->getMemoryType();
2528   const Align MemAlign = MMO->getAlign();
2529   const unsigned MemSize = MemTy.getSizeInBits();
2530   const unsigned AlignInBits = 8 * MemAlign.value();
2531 
2532   // Widen non-power-of-2 loads to the alignment if needed
2533   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2534     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2535 
2536     // This was already the correct extending load result type, so just adjust
2537     // the memory type.
2538     if (WideMemSize == ValSize) {
2539       MachineFunction &MF = B.getMF();
2540 
2541       MachineMemOperand *WideMMO =
2542           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2543       Observer.changingInstr(MI);
2544       MI.setMemRefs(MF, {WideMMO});
2545       Observer.changedInstr(MI);
2546       return true;
2547     }
2548 
2549     // Don't bother handling an edge case that should probably never be produced.
2550     if (ValSize > WideMemSize)
2551       return false;
2552 
2553     LLT WideTy = widenToNextPowerOf2(ValTy);
2554 
2555     Register WideLoad;
2556     if (!WideTy.isVector()) {
2557       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2558       B.buildTrunc(ValReg, WideLoad).getReg(0);
2559     } else {
2560       // Extract the subvector.
2561 
2562       if (isRegisterType(ValTy)) {
2563         // If this is a case where G_EXTRACT is legal, use it.
2564         // (e.g. <3 x s32> -> <4 x s32>)
2565         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2566         B.buildExtract(ValReg, WideLoad, 0);
2567       } else {
2568         // For cases where the widened type isn't a nice register value, unmerge
2569         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2570         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2571         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2572         B.setInsertPt(B.getMBB(), MI.getIterator());
2573         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2574       }
2575     }
2576 
2577     MI.eraseFromParent();
2578     return true;
2579   }
2580 
2581   return false;
2582 }
2583 
2584 bool AMDGPULegalizerInfo::legalizeFMad(
2585   MachineInstr &MI, MachineRegisterInfo &MRI,
2586   MachineIRBuilder &B) const {
2587   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2588   assert(Ty.isScalar());
2589 
2590   MachineFunction &MF = B.getMF();
2591   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2592 
2593   // TODO: Always legal with future ftz flag.
2594   // FIXME: Do we need just output?
2595   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2596     return true;
2597   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2598     return true;
2599 
2600   MachineIRBuilder HelperBuilder(MI);
2601   GISelObserverWrapper DummyObserver;
2602   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2603   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2604 }
2605 
2606 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2607   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2608   Register DstReg = MI.getOperand(0).getReg();
2609   Register PtrReg = MI.getOperand(1).getReg();
2610   Register CmpVal = MI.getOperand(2).getReg();
2611   Register NewVal = MI.getOperand(3).getReg();
2612 
2613   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2614          "this should not have been custom lowered");
2615 
2616   LLT ValTy = MRI.getType(CmpVal);
2617   LLT VecTy = LLT::fixed_vector(2, ValTy);
2618 
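  // This is the input marshalling mentioned where G_ATOMIC_CMPXCHG is marked
  // custom: the target G_AMDGPU_ATOMIC_CMPXCHG pseudo takes the new value and
  // the compare value packed together as a two-element vector.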
2619   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2620 
2621   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2622     .addDef(DstReg)
2623     .addUse(PtrReg)
2624     .addUse(PackedVal)
2625     .setMemRefs(MI.memoperands());
2626 
2627   MI.eraseFromParent();
2628   return true;
2629 }
2630 
2631 bool AMDGPULegalizerInfo::legalizeFlog(
2632   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2633   Register Dst = MI.getOperand(0).getReg();
2634   Register Src = MI.getOperand(1).getReg();
2635   LLT Ty = B.getMRI()->getType(Dst);
2636   unsigned Flags = MI.getFlags();
2637 
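  // log_b(x) = log2(x) / log2(b); the callers pass Log2BaseInverted =
  // 1 / log2(b), i.e. ln(2) for the natural log and ln(2)/ln(10) for log10.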
2638   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2639   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2640 
2641   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2642   MI.eraseFromParent();
2643   return true;
2644 }
2645 
2646 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2647                                        MachineIRBuilder &B) const {
2648   Register Dst = MI.getOperand(0).getReg();
2649   Register Src = MI.getOperand(1).getReg();
2650   unsigned Flags = MI.getFlags();
2651   LLT Ty = B.getMRI()->getType(Dst);
2652 
2653   auto K = B.buildFConstant(Ty, numbers::log2e);
2654   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2655   B.buildFExp2(Dst, Mul, Flags);
2656   MI.eraseFromParent();
2657   return true;
2658 }
2659 
2660 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2661                                        MachineIRBuilder &B) const {
2662   Register Dst = MI.getOperand(0).getReg();
2663   Register Src0 = MI.getOperand(1).getReg();
2664   Register Src1 = MI.getOperand(2).getReg();
2665   unsigned Flags = MI.getFlags();
2666   LLT Ty = B.getMRI()->getType(Dst);
2667   const LLT S16 = LLT::scalar(16);
2668   const LLT S32 = LLT::scalar(32);
2669 
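  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses the
  // fmul_legacy intrinsic, where 0 * anything (including inf/nan) is 0, which
  // keeps cases such as pow(1.0, y) == 1.0 well defined for infinite y.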
2670   if (Ty == S32) {
2671     auto Log = B.buildFLog2(S32, Src0, Flags);
2672     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2673       .addUse(Log.getReg(0))
2674       .addUse(Src1)
2675       .setMIFlags(Flags);
2676     B.buildFExp2(Dst, Mul, Flags);
2677   } else if (Ty == S16) {
2678     // There's no f16 fmul_legacy, so we need to convert for it.
2679     auto Log = B.buildFLog2(S16, Src0, Flags);
2680     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2681     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2682     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2683       .addUse(Ext0.getReg(0))
2684       .addUse(Ext1.getReg(0))
2685       .setMIFlags(Flags);
2686 
2687     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2688   } else
2689     return false;
2690 
2691   MI.eraseFromParent();
2692   return true;
2693 }
2694 
2695 // Find a source register, ignoring any possible source modifiers.
2696 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2697   Register ModSrc = OrigSrc;
2698   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2699     ModSrc = SrcFNeg->getOperand(1).getReg();
2700     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2701       ModSrc = SrcFAbs->getOperand(1).getReg();
2702   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2703     ModSrc = SrcFAbs->getOperand(1).getReg();
2704   return ModSrc;
2705 }
2706 
2707 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2708                                          MachineRegisterInfo &MRI,
2709                                          MachineIRBuilder &B) const {
2710 
2711   const LLT S1 = LLT::scalar(1);
2712   const LLT S64 = LLT::scalar(64);
2713   Register Dst = MI.getOperand(0).getReg();
2714   Register OrigSrc = MI.getOperand(1).getReg();
2715   unsigned Flags = MI.getFlags();
2716   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2717          "this should not have been custom lowered");
2718 
2719   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2720   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2721   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2722   // V_FRACT bug is:
2723   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2724   //
2725   // Convert floor(x) to (x - fract(x))
2726 
2727   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2728     .addUse(OrigSrc)
2729     .setMIFlags(Flags);
2730 
2731   // Give source modifier matching some assistance before obscuring a foldable
2732   // pattern.
2733 
2734   // TODO: We can avoid the neg on the fract? The input sign to fract
2735   // shouldn't matter?
2736   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2737 
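  // 0x3fefffffffffffff is the largest double strictly less than 1.0, i.e. the
  // clamp constant from the V_FRACT workaround formula above.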
2738   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2739 
2740   Register Min = MRI.createGenericVirtualRegister(S64);
2741 
2742   // We don't need to concern ourselves with the snan handling difference, so
2743   // use the one which will directly select.
2744   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2745   if (MFI->getMode().IEEE)
2746     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2747   else
2748     B.buildFMinNum(Min, Fract, Const, Flags);
2749 
2750   Register CorrectedFract = Min;
2751   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2752     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2753     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2754   }
2755 
2756   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2757   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2758 
2759   MI.eraseFromParent();
2760   return true;
2761 }
2762 
2763 // Turn an illegal packed v2s16 build vector into bit operations.
2764 // TODO: This should probably be a bitcast action in LegalizerHelper.
2765 bool AMDGPULegalizerInfo::legalizeBuildVector(
2766   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2767   Register Dst = MI.getOperand(0).getReg();
2768   const LLT S32 = LLT::scalar(32);
2769   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2770 
2771   Register Src0 = MI.getOperand(1).getReg();
2772   Register Src1 = MI.getOperand(2).getReg();
2773   assert(MRI.getType(Src0) == LLT::scalar(16));
2774 
2775   auto Merge = B.buildMerge(S32, {Src0, Src1});
2776   B.buildBitcast(Dst, Merge);
2777 
2778   MI.eraseFromParent();
2779   return true;
2780 }
2781 
2782 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2783 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2784 // case with a single min instruction instead of a compare+select.
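// For example, for a 32-bit G_CTLZ the ffbh result is -1 (0xffffffff) on a
// zero input, and umin(0xffffffff, 32) == 32, which is the required ctlz(0).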
2785 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2786                                             MachineRegisterInfo &MRI,
2787                                             MachineIRBuilder &B) const {
2788   Register Dst = MI.getOperand(0).getReg();
2789   Register Src = MI.getOperand(1).getReg();
2790   LLT DstTy = MRI.getType(Dst);
2791   LLT SrcTy = MRI.getType(Src);
2792 
2793   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2794                         ? AMDGPU::G_AMDGPU_FFBH_U32
2795                         : AMDGPU::G_AMDGPU_FFBL_B32;
2796   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2797   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2798 
2799   MI.eraseFromParent();
2800   return true;
2801 }
2802 
2803 // Check that this is a G_XOR x, -1
2804 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2805   if (MI.getOpcode() != TargetOpcode::G_XOR)
2806     return false;
2807   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2808   return ConstVal && *ConstVal == -1;
2809 }
2810 
2811 // Return the use branch instruction, or null if the usage is invalid.
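// The expected pattern is a single use of the condition by a G_BRCOND in the
// same block (possibly through one "xor %cond, -1"), followed either by a
// G_BR or by the fall-through to the next block.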
2812 static MachineInstr *
2813 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2814                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2815   Register CondDef = MI.getOperand(0).getReg();
2816   if (!MRI.hasOneNonDBGUse(CondDef))
2817     return nullptr;
2818 
2819   MachineBasicBlock *Parent = MI.getParent();
2820   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2821 
2822   if (isNot(MRI, *UseMI)) {
2823     Register NegatedCond = UseMI->getOperand(0).getReg();
2824     if (!MRI.hasOneNonDBGUse(NegatedCond))
2825       return nullptr;
2826 
2827     // We're deleting the def of this value, so we need to remove it.
2828     eraseInstr(*UseMI, MRI);
2829 
2830     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2831     Negated = true;
2832   }
2833 
2834   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2835     return nullptr;
2836 
2837   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2838   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2839   if (Next == Parent->end()) {
2840     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2841     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2842       return nullptr;
2843     UncondBrTarget = &*NextMBB;
2844   } else {
2845     if (Next->getOpcode() != AMDGPU::G_BR)
2846       return nullptr;
2847     Br = &*Next;
2848     UncondBrTarget = Br->getOperand(0).getMBB();
2849   }
2850 
2851   return UseMI;
2852 }
2853 
2854 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2855                                          const ArgDescriptor *Arg,
2856                                          const TargetRegisterClass *ArgRC,
2857                                          LLT ArgTy) const {
2858   MCRegister SrcReg = Arg->getRegister();
2859   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2860   assert(DstReg.isVirtual() && "Virtual register expected");
2861 
2862   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2863                                              ArgTy);
2864   if (Arg->isMasked()) {
2865     // TODO: Should we try to emit this once in the entry block?
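    // A masked argument packs several values into one register (e.g. targets
    // that pack the three workitem IDs into a single VGPR as 10-bit fields);
    // shift the field down to bit 0 and mask off the rest.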
2866     const LLT S32 = LLT::scalar(32);
2867     const unsigned Mask = Arg->getMask();
2868     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2869 
2870     Register AndMaskSrc = LiveIn;
2871 
2872     if (Shift != 0) {
2873       auto ShiftAmt = B.buildConstant(S32, Shift);
2874       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2875     }
2876 
2877     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2878   } else {
2879     B.buildCopy(DstReg, LiveIn);
2880   }
2881 
2882   return true;
2883 }
2884 
2885 bool AMDGPULegalizerInfo::loadInputValue(
2886     Register DstReg, MachineIRBuilder &B,
2887     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2888   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2889   const ArgDescriptor *Arg;
2890   const TargetRegisterClass *ArgRC;
2891   LLT ArgTy;
2892   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2893 
2894   if (!Arg) {
2895     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2896       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2897       // case the pointer argument may be missing and we use null.
2898       B.buildConstant(DstReg, 0);
2899       return true;
2900     }
2901 
2902     // It's undefined behavior if a function marked with the amdgpu-no-*
2903     // attributes uses the corresponding intrinsic.
2904     B.buildUndef(DstReg);
2905     return true;
2906   }
2907 
2908   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2909     return false; // TODO: Handle these
2910   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2911 }
2912 
2913 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2914     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2915     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2916   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2917     return false;
2918 
2919   MI.eraseFromParent();
2920   return true;
2921 }
2922 
2923 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2924                                        MachineRegisterInfo &MRI,
2925                                        MachineIRBuilder &B) const {
2926   Register Dst = MI.getOperand(0).getReg();
2927   LLT DstTy = MRI.getType(Dst);
2928   LLT S16 = LLT::scalar(16);
2929   LLT S32 = LLT::scalar(32);
2930   LLT S64 = LLT::scalar(64);
2931 
2932   if (DstTy == S16)
2933     return legalizeFDIV16(MI, MRI, B);
2934   if (DstTy == S32)
2935     return legalizeFDIV32(MI, MRI, B);
2936   if (DstTy == S64)
2937     return legalizeFDIV64(MI, MRI, B);
2938 
2939   return false;
2940 }
2941 
2942 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2943                                                         Register DstDivReg,
2944                                                         Register DstRemReg,
2945                                                         Register X,
2946                                                         Register Y) const {
2947   const LLT S1 = LLT::scalar(1);
2948   const LLT S32 = LLT::scalar(32);
2949 
2950   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2951   // algorithm used here.
2952 
2953   // Initial estimate of inv(y).
2954   auto FloatY = B.buildUITOFP(S32, Y);
2955   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2956   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2957   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2958   auto Z = B.buildFPTOUI(S32, ScaledY);
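  // Z now approximates floor(2^32 / Y) in 32-bit fixed point (the scale
  // constant 0x4f7ffffe is just below 2^32 as a float).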
2959 
2960   // One round of UNR.
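  // (unsigned Newton-Raphson: Z += umulh(Z, -Y * Z), roughly doubling the
  // number of correct bits in the reciprocal estimate).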
2961   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2962   auto NegYZ = B.buildMul(S32, NegY, Z);
2963   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2964 
2965   // Quotient/remainder estimate.
2966   auto Q = B.buildUMulH(S32, X, Z);
2967   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2968 
2969   // First quotient/remainder refinement.
2970   auto One = B.buildConstant(S32, 1);
2971   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2972   if (DstDivReg)
2973     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2974   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2975 
2976   // Second quotient/remainder refinement.
2977   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2978   if (DstDivReg)
2979     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2980 
2981   if (DstRemReg)
2982     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2983 }
2984 
2985 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2986 //
2987 // Return lo, hi of result
2988 //
2989 // %cvt.lo = G_UITOFP Val.lo
2990 // %cvt.hi = G_UITOFP Val.hi
2991 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2992 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2993 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2994 // %mul2 = G_FMUL %mul1, 2**(-32)
2995 // %trunc = G_INTRINSIC_TRUNC %mul2
2996 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2997 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2998 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2999                                                        Register Val) {
3000   const LLT S32 = LLT::scalar(32);
3001   auto Unmerge = B.buildUnmerge(S32, Val);
3002 
3003   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3004   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3005 
3006   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3007                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3008 
3009   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3010   auto Mul1 =
3011       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3012 
3013   // 2**(-32)
3014   auto Mul2 =
3015       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3016   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3017 
3018   // -(2**32)
3019   auto Mad2 = B.buildFMAD(S32, Trunc,
3020                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3021 
3022   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3023   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3024 
3025   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3026 }
3027 
3028 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3029                                                         Register DstDivReg,
3030                                                         Register DstRemReg,
3031                                                         Register Numer,
3032                                                         Register Denom) const {
3033   const LLT S32 = LLT::scalar(32);
3034   const LLT S64 = LLT::scalar(64);
3035   const LLT S1 = LLT::scalar(1);
3036   Register RcpLo, RcpHi;
3037 
3038   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3039 
3040   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3041 
3042   auto Zero64 = B.buildConstant(S64, 0);
3043   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3044 
3045   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3046   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3047 
3048   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3049   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3050   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3051 
3052   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3053   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3054   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3055   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3056 
3057   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3058   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3059   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3060   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3061   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3062 
3063   auto Zero32 = B.buildConstant(S32, 0);
3064   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3065   auto Add2_HiC =
3066       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3067   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3068   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3069 
3070   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3071   Register NumerLo = UnmergeNumer.getReg(0);
3072   Register NumerHi = UnmergeNumer.getReg(1);
3073 
3074   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3075   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3076   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3077   Register Mul3_Lo = UnmergeMul3.getReg(0);
3078   Register Mul3_Hi = UnmergeMul3.getReg(1);
3079   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3080   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3081   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3082   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3083 
3084   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3085   Register DenomLo = UnmergeDenom.getReg(0);
3086   Register DenomHi = UnmergeDenom.getReg(1);
3087 
3088   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3089   auto C1 = B.buildSExt(S32, CmpHi);
3090 
3091   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3092   auto C2 = B.buildSExt(S32, CmpLo);
3093 
3094   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3095   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3096 
3097   // TODO: Here and below portions of the code can be enclosed into if/endif.
3098   // Currently control flow is unconditional and we have 4 selects after
3099   // potential endif to substitute PHIs.
3100 
3101   // if C3 != 0 ...
3102   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3103   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3104   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3105   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3106 
3107   auto One64 = B.buildConstant(S64, 1);
3108   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3109 
3110   auto C4 =
3111       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3112   auto C5 =
3113       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3114   auto C6 = B.buildSelect(
3115       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3116 
3117   // if (C6 != 0)
3118   auto Add4 = B.buildAdd(S64, Add3, One64);
3119   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3120 
3121   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3122   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3123   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3124 
3125   // endif C6
3126   // endif C3
3127 
3128   if (DstDivReg) {
3129     auto Sel1 = B.buildSelect(
3130         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3131     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3132                   Sel1, MulHi3);
3133   }
3134 
3135   if (DstRemReg) {
3136     auto Sel2 = B.buildSelect(
3137         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3138     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3139                   Sel2, Sub1);
3140   }
3141 }
3142 
3143 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3144                                                   MachineRegisterInfo &MRI,
3145                                                   MachineIRBuilder &B) const {
3146   Register DstDivReg, DstRemReg;
3147   switch (MI.getOpcode()) {
3148   default:
3149     llvm_unreachable("Unexpected opcode!");
3150   case AMDGPU::G_UDIV: {
3151     DstDivReg = MI.getOperand(0).getReg();
3152     break;
3153   }
3154   case AMDGPU::G_UREM: {
3155     DstRemReg = MI.getOperand(0).getReg();
3156     break;
3157   }
3158   case AMDGPU::G_UDIVREM: {
3159     DstDivReg = MI.getOperand(0).getReg();
3160     DstRemReg = MI.getOperand(1).getReg();
3161     break;
3162   }
3163   }
3164 
3165   const LLT S64 = LLT::scalar(64);
3166   const LLT S32 = LLT::scalar(32);
3167   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3168   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3169   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3170   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3171 
3172   if (Ty == S32)
3173     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3174   else if (Ty == S64)
3175     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3176   else
3177     return false;
3178 
3179   MI.eraseFromParent();
3180   return true;
3181 }
3182 
3183 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3184                                                 MachineRegisterInfo &MRI,
3185                                                 MachineIRBuilder &B) const {
3186   const LLT S64 = LLT::scalar(64);
3187   const LLT S32 = LLT::scalar(32);
3188 
3189   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3190   if (Ty != S32 && Ty != S64)
3191     return false;
3192 
3193   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3194   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3195   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3196 
3197   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3198   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3199   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3200 
3201   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3202   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3203 
3204   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3205   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
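  // LHS and RHS now hold the absolute values: (x + sign) ^ sign is x when
  // sign == 0 and -x when sign == -1, e.g. (-5 + -1) ^ -1 == ~(-6) == 5.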
3206 
3207   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3208   switch (MI.getOpcode()) {
3209   default:
3210     llvm_unreachable("Unexpected opcode!");
3211   case AMDGPU::G_SDIV: {
3212     DstDivReg = MI.getOperand(0).getReg();
3213     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3214     break;
3215   }
3216   case AMDGPU::G_SREM: {
3217     DstRemReg = MI.getOperand(0).getReg();
3218     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3219     break;
3220   }
3221   case AMDGPU::G_SDIVREM: {
3222     DstDivReg = MI.getOperand(0).getReg();
3223     DstRemReg = MI.getOperand(1).getReg();
3224     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3225     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3226     break;
3227   }
3228   }
3229 
3230   if (Ty == S32)
3231     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3232   else
3233     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3234 
3235   if (DstDivReg) {
3236     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3237     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3238     B.buildSub(DstDivReg, SignXor, Sign);
3239   }
3240 
3241   if (DstRemReg) {
3242     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3243     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3244     B.buildSub(DstRemReg, SignXor, Sign);
3245   }
3246 
3247   MI.eraseFromParent();
3248   return true;
3249 }
3250 
3251 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3252                                                  MachineRegisterInfo &MRI,
3253                                                  MachineIRBuilder &B) const {
3254   Register Res = MI.getOperand(0).getReg();
3255   Register LHS = MI.getOperand(1).getReg();
3256   Register RHS = MI.getOperand(2).getReg();
3257   uint16_t Flags = MI.getFlags();
3258   LLT ResTy = MRI.getType(Res);
3259 
3260   const MachineFunction &MF = B.getMF();
3261   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3262                             MI.getFlag(MachineInstr::FmAfn);
3263 
3264   if (!AllowInaccurateRcp)
3265     return false;
3266 
3267   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3268     // 1 / x -> RCP(x)
3269     if (CLHS->isExactlyValue(1.0)) {
3270       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3271         .addUse(RHS)
3272         .setMIFlags(Flags);
3273 
3274       MI.eraseFromParent();
3275       return true;
3276     }
3277 
3278     // -1 / x -> RCP( FNEG(x) )
3279     if (CLHS->isExactlyValue(-1.0)) {
3280       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3281       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3282         .addUse(FNeg.getReg(0))
3283         .setMIFlags(Flags);
3284 
3285       MI.eraseFromParent();
3286       return true;
3287     }
3288   }
3289 
3290   // x / y -> x * (1.0 / y)
3291   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3292     .addUse(RHS)
3293     .setMIFlags(Flags);
3294   B.buildFMul(Res, LHS, RCP, Flags);
3295 
3296   MI.eraseFromParent();
3297   return true;
3298 }
3299 
3300 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3301                                                    MachineRegisterInfo &MRI,
3302                                                    MachineIRBuilder &B) const {
3303   Register Res = MI.getOperand(0).getReg();
3304   Register X = MI.getOperand(1).getReg();
3305   Register Y = MI.getOperand(2).getReg();
3306   uint16_t Flags = MI.getFlags();
3307   LLT ResTy = MRI.getType(Res);
3308 
3309   const MachineFunction &MF = B.getMF();
3310   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3311                             MI.getFlag(MachineInstr::FmAfn);
3312 
3313   if (!AllowInaccurateRcp)
3314     return false;
3315 
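  // Refine rcp(y) with two Newton-Raphson steps, r = fma(1 - y*r, r, r), then
  // correct the quotient once more: res = fma(x - y*(x*r), r, x*r).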
3316   auto NegY = B.buildFNeg(ResTy, Y);
3317   auto One = B.buildFConstant(ResTy, 1.0);
3318 
3319   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3320     .addUse(Y)
3321     .setMIFlags(Flags);
3322 
3323   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3324   R = B.buildFMA(ResTy, Tmp0, R, R);
3325 
3326   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3327   R = B.buildFMA(ResTy, Tmp1, R, R);
3328 
3329   auto Ret = B.buildFMul(ResTy, X, R);
3330   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3331 
3332   B.buildFMA(Res, Tmp2, R, Ret);
3333   MI.eraseFromParent();
3334   return true;
3335 }
3336 
3337 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3338                                          MachineRegisterInfo &MRI,
3339                                          MachineIRBuilder &B) const {
3340   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3341     return true;
3342 
3343   Register Res = MI.getOperand(0).getReg();
3344   Register LHS = MI.getOperand(1).getReg();
3345   Register RHS = MI.getOperand(2).getReg();
3346 
3347   uint16_t Flags = MI.getFlags();
3348 
3349   LLT S16 = LLT::scalar(16);
3350   LLT S32 = LLT::scalar(32);
3351 
3352   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3353   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3354 
3355   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3356     .addUse(RHSExt.getReg(0))
3357     .setMIFlags(Flags);
3358 
3359   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3360   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3361 
3362   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3363     .addUse(RDst.getReg(0))
3364     .addUse(RHS)
3365     .addUse(LHS)
3366     .setMIFlags(Flags);
3367 
3368   MI.eraseFromParent();
3369   return true;
3370 }
3371 
3372 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3373 // that enable denorms; otherwise restore the function's configured FP32 denorm mode.
3374 static void toggleSPDenormMode(bool Enable,
3375                                MachineIRBuilder &B,
3376                                const GCNSubtarget &ST,
3377                                AMDGPU::SIModeRegisterDefaults Mode) {
3378   // Set SP denorm mode to this value.
3379   unsigned SPDenormMode =
3380     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3381 
3382   if (ST.hasDenormModeInst()) {
3383     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3384     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3385 
3386     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3387     B.buildInstr(AMDGPU::S_DENORM_MODE)
3388       .addImm(NewDenormModeValue);
3389 
3390   } else {
3391     // Select FP32 bit field in mode register.
3392     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3393                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3394                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
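    // i.e. hwreg(HW_REG_MODE, 4, 2): the two FP32 denorm-control bits sit at
    // offset 4 in the MODE register.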
3395 
3396     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3397       .addImm(SPDenormMode)
3398       .addImm(SPDenormModeBitField);
3399   }
3400 }
3401 
3402 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3403                                          MachineRegisterInfo &MRI,
3404                                          MachineIRBuilder &B) const {
3405   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3406     return true;
3407 
3408   Register Res = MI.getOperand(0).getReg();
3409   Register LHS = MI.getOperand(1).getReg();
3410   Register RHS = MI.getOperand(2).getReg();
3411   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3412   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3413 
3414   uint16_t Flags = MI.getFlags();
3415 
3416   LLT S32 = LLT::scalar(32);
3417   LLT S1 = LLT::scalar(1);
3418 
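  // Roughly: scale the operands with div_scale, refine a reciprocal of the
  // scaled denominator with FMAs (with FP32 denorms forced on so the
  // intermediate steps do not flush), then div_fmas/div_fixup undo the
  // scaling and handle the special cases (zeros, infinities, nans).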
3419   auto One = B.buildFConstant(S32, 1.0f);
3420 
3421   auto DenominatorScaled =
3422     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3423       .addUse(LHS)
3424       .addUse(RHS)
3425       .addImm(0)
3426       .setMIFlags(Flags);
3427   auto NumeratorScaled =
3428     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3429       .addUse(LHS)
3430       .addUse(RHS)
3431       .addImm(1)
3432       .setMIFlags(Flags);
3433 
3434   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3435     .addUse(DenominatorScaled.getReg(0))
3436     .setMIFlags(Flags);
3437   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3438 
3439   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3440   // aren't modeled as reading it.
3441   if (!Mode.allFP32Denormals())
3442     toggleSPDenormMode(true, B, ST, Mode);
3443 
3444   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3445   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3446   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3447   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3448   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3449   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3450 
3451   if (!Mode.allFP32Denormals())
3452     toggleSPDenormMode(false, B, ST, Mode);
3453 
3454   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3455     .addUse(Fma4.getReg(0))
3456     .addUse(Fma1.getReg(0))
3457     .addUse(Fma3.getReg(0))
3458     .addUse(NumeratorScaled.getReg(1))
3459     .setMIFlags(Flags);
3460 
3461   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3462     .addUse(Fmas.getReg(0))
3463     .addUse(RHS)
3464     .addUse(LHS)
3465     .setMIFlags(Flags);
3466 
3467   MI.eraseFromParent();
3468   return true;
3469 }
3470 
3471 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3472                                          MachineRegisterInfo &MRI,
3473                                          MachineIRBuilder &B) const {
3474   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3475     return true;
3476 
3477   Register Res = MI.getOperand(0).getReg();
3478   Register LHS = MI.getOperand(1).getReg();
3479   Register RHS = MI.getOperand(2).getReg();
3480 
3481   uint16_t Flags = MI.getFlags();
3482 
3483   LLT S64 = LLT::scalar(64);
3484   LLT S1 = LLT::scalar(1);
3485 
3486   auto One = B.buildFConstant(S64, 1.0);
3487 
3488   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3489     .addUse(LHS)
3490     .addUse(RHS)
3491     .addImm(0)
3492     .setMIFlags(Flags);
3493 
3494   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3495 
3496   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3497     .addUse(DivScale0.getReg(0))
3498     .setMIFlags(Flags);
3499 
3500   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3501   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3502   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3503 
3504   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3505     .addUse(LHS)
3506     .addUse(RHS)
3507     .addImm(1)
3508     .setMIFlags(Flags);
3509 
3510   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3511   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3512   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3513 
3514   Register Scale;
3515   if (!ST.hasUsableDivScaleConditionOutput()) {
3516     // Workaround a hardware bug on SI where the condition output from div_scale
3517     // is not usable.
3518 
3519     LLT S32 = LLT::scalar(32);
3520 
3521     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3522     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3523     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3524     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3525 
3526     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3527                               Scale1Unmerge.getReg(1));
3528     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3529                               Scale0Unmerge.getReg(1));
3530     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3531   } else {
3532     Scale = DivScale1.getReg(1);
3533   }
3534 
3535   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3536     .addUse(Fma4.getReg(0))
3537     .addUse(Fma3.getReg(0))
3538     .addUse(Mul.getReg(0))
3539     .addUse(Scale)
3540     .setMIFlags(Flags);
3541 
3542   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3543     .addUse(Fmas.getReg(0))
3544     .addUse(RHS)
3545     .addUse(LHS)
3546     .setMIFlags(Flags);
3547 
3548   MI.eraseFromParent();
3549   return true;
3550 }
3551 
3552 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3553                                                  MachineRegisterInfo &MRI,
3554                                                  MachineIRBuilder &B) const {
3555   Register Res = MI.getOperand(0).getReg();
3556   Register LHS = MI.getOperand(2).getReg();
3557   Register RHS = MI.getOperand(3).getReg();
3558   uint16_t Flags = MI.getFlags();
3559 
3560   LLT S32 = LLT::scalar(32);
3561   LLT S1 = LLT::scalar(1);
3562 
3563   auto Abs = B.buildFAbs(S32, RHS, Flags);
3564   const APFloat C0Val(1.0f);
3565 
3566   auto C0 = B.buildConstant(S32, 0x6f800000);
3567   auto C1 = B.buildConstant(S32, 0x2f800000);
3568   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
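  // If |RHS| is large (above ~2^96, C0), pre-scale it by 2^-32 (C1) so the
  // reciprocal cannot become denormal and flush to zero; the same scale
  // factor is then applied to the final product to compensate.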
3569 
3570   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3571   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3572 
3573   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3574 
3575   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3576     .addUse(Mul0.getReg(0))
3577     .setMIFlags(Flags);
3578 
3579   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3580 
3581   B.buildFMul(Res, Sel, Mul1, Flags);
3582 
3583   MI.eraseFromParent();
3584   return true;
3585 }
3586 
3587 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3588 // FIXME: Why do we handle this one but not other removed instructions?
3589 //
3590 // Reciprocal square root.  The clamp prevents infinite results, clamping
3591 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3592 // +-max_float.
3593 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3594                                                     MachineRegisterInfo &MRI,
3595                                                     MachineIRBuilder &B) const {
3596   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3597     return true;
3598 
3599   Register Dst = MI.getOperand(0).getReg();
3600   Register Src = MI.getOperand(2).getReg();
3601   auto Flags = MI.getFlags();
3602 
3603   LLT Ty = MRI.getType(Dst);
3604 
3605   const fltSemantics *FltSemantics;
3606   if (Ty == LLT::scalar(32))
3607     FltSemantics = &APFloat::IEEEsingle();
3608   else if (Ty == LLT::scalar(64))
3609     FltSemantics = &APFloat::IEEEdouble();
3610   else
3611     return false;
3612 
3613   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3614     .addUse(Src)
3615     .setMIFlags(Flags);
3616 
3617   // We don't need to concern ourselves with the snan handling difference, since
3618   // the rsq has already quieted (or not) any snan; use the form which will directly select.
3619   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3620   const bool UseIEEE = MFI->getMode().IEEE;
3621 
3622   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3623   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3624                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3625 
3626   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3627 
3628   if (UseIEEE)
3629     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3630   else
3631     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3632   MI.eraseFromParent();
3633   return true;
3634 }
3635 
3636 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3637   switch (IID) {
3638   case Intrinsic::amdgcn_ds_fadd:
3639     return AMDGPU::G_ATOMICRMW_FADD;
3640   case Intrinsic::amdgcn_ds_fmin:
3641     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3642   case Intrinsic::amdgcn_ds_fmax:
3643     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3644   default:
3645     llvm_unreachable("not a DS FP intrinsic");
3646   }
3647 }
3648 
3649 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3650                                                       MachineInstr &MI,
3651                                                       Intrinsic::ID IID) const {
3652   GISelChangeObserver &Observer = Helper.Observer;
3653   Observer.changingInstr(MI);
3654 
3655   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3656 
3657   // The remaining operands were used to set fields in the MemOperand on
3658   // construction.
3659   for (int I = 6; I > 3; --I)
3660     MI.RemoveOperand(I);
3661 
3662   MI.RemoveOperand(1); // Remove the intrinsic ID.
3663   Observer.changedInstr(MI);
3664   return true;
3665 }
3666 
3667 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3668                                             MachineRegisterInfo &MRI,
3669                                             MachineIRBuilder &B) const {
3670   uint64_t Offset =
3671     ST.getTargetLowering()->getImplicitParameterOffset(
3672       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3673   LLT DstTy = MRI.getType(DstReg);
3674   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3675 
3676   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3677   if (!loadInputValue(KernargPtrReg, B,
3678                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3679     return false;
3680 
3681   // FIXME: This should be nuw
3682   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3683   return true;
3684 }
3685 
3686 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3687                                                  MachineRegisterInfo &MRI,
3688                                                  MachineIRBuilder &B) const {
3689   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3690   if (!MFI->isEntryFunction()) {
3691     return legalizePreloadedArgIntrin(MI, MRI, B,
3692                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3693   }
3694 
3695   Register DstReg = MI.getOperand(0).getReg();
3696   if (!getImplicitArgPtr(DstReg, MRI, B))
3697     return false;
3698 
3699   MI.eraseFromParent();
3700   return true;
3701 }
3702 
3703 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3704                                               MachineRegisterInfo &MRI,
3705                                               MachineIRBuilder &B,
3706                                               unsigned AddrSpace) const {
3707   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3708   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3709   Register Hi32 = Unmerge.getReg(1);
3710 
3711   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3712   MI.eraseFromParent();
3713   return true;
3714 }
3715 
3716 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3717 // offset (the offset that is included in bounds checking and swizzling, to be
3718 // split between the instruction's voffset and immoffset fields) and soffset
3719 // (the offset that is excluded from bounds checking and swizzling, to go in
3720 // the instruction's soffset field).  This function takes the first kind of
3721 // offset and figures out how to split it between voffset and immoffset.
3722 std::pair<Register, unsigned>
3723 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3724                                         Register OrigOffset) const {
3725   const unsigned MaxImm = 4095;
3726   Register BaseReg;
3727   unsigned ImmOffset;
3728   const LLT S32 = LLT::scalar(32);
3729   MachineRegisterInfo &MRI = *B.getMRI();
3730 
3731   std::tie(BaseReg, ImmOffset) =
3732       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3733 
3734   // If BaseReg is a pointer, convert it to int.
3735   if (MRI.getType(BaseReg).isPointer())
3736     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3737 
3738   // If the immediate value is too big for the immoffset field, put the value
3739   // and -4096 into the immoffset field so that the value that is copied/added
3740   // for the voffset field is a multiple of 4096, and it stands more chance
3741   // of being CSEd with the copy/add for another similar load/store.
3742   // However, do not do that rounding down to a multiple of 4096 if that is a
3743   // negative number, as it appears to be illegal to have a negative offset
3744   // in the vgpr, even if adding the immediate offset makes it positive.
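  //
  // For example, a constant offset of 5000 is split into Overflow = 4096
  // (folded into the voffset register) and ImmOffset = 904.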
3745   unsigned Overflow = ImmOffset & ~MaxImm;
3746   ImmOffset -= Overflow;
3747   if ((int32_t)Overflow < 0) {
3748     Overflow += ImmOffset;
3749     ImmOffset = 0;
3750   }
3751 
3752   if (Overflow != 0) {
3753     if (!BaseReg) {
3754       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3755     } else {
3756       auto OverflowVal = B.buildConstant(S32, Overflow);
3757       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3758     }
3759   }
3760 
3761   if (!BaseReg)
3762     BaseReg = B.buildConstant(S32, 0).getReg(0);
3763 
3764   return std::make_pair(BaseReg, ImmOffset);
3765 }
3766 
3767 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3768 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3769                                           Register VOffset, Register SOffset,
3770                                           unsigned ImmOffset, Register VIndex,
3771                                           MachineRegisterInfo &MRI) const {
3772   Optional<ValueAndVReg> MaybeVOffsetVal =
3773       getIConstantVRegValWithLookThrough(VOffset, MRI);
3774   Optional<ValueAndVReg> MaybeSOffsetVal =
3775       getIConstantVRegValWithLookThrough(SOffset, MRI);
3776   Optional<ValueAndVReg> MaybeVIndexVal =
3777       getIConstantVRegValWithLookThrough(VIndex, MRI);
3778   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3779   // update the MMO with that offset. The stride is unknown so we can only do
3780   // this if VIndex is constant 0.
3781   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3782       MaybeVIndexVal->Value == 0) {
3783     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3784                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3785     MMO->setOffset(TotalOffset);
3786   } else {
3787     // We don't have a constant combined offset to use in the MMO. Give up.
3788     MMO->setValue((Value *)nullptr);
3789   }
3790 }
3791 
3792 /// Handle register layout difference for f16 images for some subtargets.
3793 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3794                                              MachineRegisterInfo &MRI,
3795                                              Register Reg,
3796                                              bool ImageStore) const {
3797   const LLT S16 = LLT::scalar(16);
3798   const LLT S32 = LLT::scalar(32);
3799   LLT StoreVT = MRI.getType(Reg);
3800   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3801 
3802   if (ST.hasUnpackedD16VMem()) {
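    // On unpacked subtargets each 16-bit element lives in the low half of its
    // own 32-bit register, so widen <N x s16> to N anyext'd s32 values.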
3803     auto Unmerge = B.buildUnmerge(S16, Reg);
3804 
3805     SmallVector<Register, 4> WideRegs;
3806     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3807       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3808 
3809     int NumElts = StoreVT.getNumElements();
3810 
3811     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3812         .getReg(0);
3813   }
3814 
3815   if (ImageStore && ST.hasImageStoreD16Bug()) {
3816     if (StoreVT.getNumElements() == 2) {
3817       SmallVector<Register, 4> PackedRegs;
3818       Reg = B.buildBitcast(S32, Reg).getReg(0);
3819       PackedRegs.push_back(Reg);
3820       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3821       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3822           .getReg(0);
3823     }
3824 
3825     if (StoreVT.getNumElements() == 3) {
3826       SmallVector<Register, 4> PackedRegs;
3827       auto Unmerge = B.buildUnmerge(S16, Reg);
3828       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3829         PackedRegs.push_back(Unmerge.getReg(I));
3830       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3831       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3832       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3833     }
3834 
3835     if (StoreVT.getNumElements() == 4) {
3836       SmallVector<Register, 4> PackedRegs;
3837       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3838       auto Unmerge = B.buildUnmerge(S32, Reg);
3839       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3840         PackedRegs.push_back(Unmerge.getReg(I));
3841       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3842       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3843           .getReg(0);
3844     }
3845 
3846     llvm_unreachable("invalid data type");
3847   }
3848 
3849   return Reg;
3850 }
3851 
3852 Register AMDGPULegalizerInfo::fixStoreSourceType(
3853   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3854   MachineRegisterInfo *MRI = B.getMRI();
3855   LLT Ty = MRI->getType(VData);
3856 
3857   const LLT S16 = LLT::scalar(16);
3858 
3859   // Fixup illegal register types for i8 stores.
3860   if (Ty == LLT::scalar(8) || Ty == S16) {
3861     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3862     return AnyExt;
3863   }
3864 
3865   if (Ty.isVector()) {
3866     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3867       if (IsFormat)
3868         return handleD16VData(B, *MRI, VData);
3869     }
3870   }
3871 
3872   return VData;
3873 }
3874 
3875 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3876                                               MachineRegisterInfo &MRI,
3877                                               MachineIRBuilder &B,
3878                                               bool IsTyped,
3879                                               bool IsFormat) const {
3880   Register VData = MI.getOperand(1).getReg();
3881   LLT Ty = MRI.getType(VData);
3882   LLT EltTy = Ty.getScalarType();
3883   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3884   const LLT S32 = LLT::scalar(32);
3885 
3886   VData = fixStoreSourceType(B, VData, IsFormat);
3887   Register RSrc = MI.getOperand(2).getReg();
3888 
3889   MachineMemOperand *MMO = *MI.memoperands_begin();
3890   const int MemSize = MMO->getSize();
3891 
3892   unsigned ImmOffset;
3893 
3894   // The typed intrinsics add an immediate after the registers.
3895   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3896 
3897   // The struct intrinsic variants add one additional operand over raw.
3898   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3899   Register VIndex;
3900   int OpOffset = 0;
3901   if (HasVIndex) {
3902     VIndex = MI.getOperand(3).getReg();
3903     OpOffset = 1;
3904   } else {
3905     VIndex = B.buildConstant(S32, 0).getReg(0);
3906   }
3907 
3908   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3909   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3910 
3911   unsigned Format = 0;
3912   if (IsTyped) {
3913     Format = MI.getOperand(5 + OpOffset).getImm();
3914     ++OpOffset;
3915   }
3916 
3917   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3918 
3919   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3920   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3921 
3922   unsigned Opc;
3923   if (IsTyped) {
3924     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3925                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3926   } else if (IsFormat) {
3927     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3928                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3929   } else {
3930     switch (MemSize) {
3931     case 1:
3932       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3933       break;
3934     case 2:
3935       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3936       break;
3937     default:
3938       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3939       break;
3940     }
3941   }
3942 
3943   auto MIB = B.buildInstr(Opc)
3944     .addUse(VData)              // vdata
3945     .addUse(RSrc)               // rsrc
3946     .addUse(VIndex)             // vindex
3947     .addUse(VOffset)            // voffset
3948     .addUse(SOffset)            // soffset
3949     .addImm(ImmOffset);         // offset(imm)
3950 
3951   if (IsTyped)
3952     MIB.addImm(Format);
3953 
3954   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3955      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3956      .addMemOperand(MMO);
3957 
3958   MI.eraseFromParent();
3959   return true;
3960 }
3961 
3962 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3963                                              MachineRegisterInfo &MRI,
3964                                              MachineIRBuilder &B,
3965                                              bool IsFormat,
3966                                              bool IsTyped) const {
3967   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3968   MachineMemOperand *MMO = *MI.memoperands_begin();
3969   const LLT MemTy = MMO->getMemoryType();
3970   const LLT S32 = LLT::scalar(32);
3971 
3972   Register Dst = MI.getOperand(0).getReg();
3973   Register RSrc = MI.getOperand(2).getReg();
3974 
3975   // The typed intrinsics add an immediate after the registers.
3976   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3977 
3978   // The struct intrinsic variants add one additional operand over raw.
3979   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3980   Register VIndex;
3981   int OpOffset = 0;
3982   if (HasVIndex) {
3983     VIndex = MI.getOperand(3).getReg();
3984     OpOffset = 1;
3985   } else {
3986     VIndex = B.buildConstant(S32, 0).getReg(0);
3987   }
3988 
3989   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3990   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3991 
3992   unsigned Format = 0;
3993   if (IsTyped) {
3994     Format = MI.getOperand(5 + OpOffset).getImm();
3995     ++OpOffset;
3996   }
3997 
3998   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3999   unsigned ImmOffset;
4000 
4001   LLT Ty = MRI.getType(Dst);
4002   LLT EltTy = Ty.getScalarType();
4003   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4004   const bool Unpacked = ST.hasUnpackedD16VMem();
4005 
4006   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4007   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4008 
4009   unsigned Opc;
4010 
4011   if (IsTyped) {
4012     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4013                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4014   } else if (IsFormat) {
4015     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4016                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4017   } else {
4018     switch (MemTy.getSizeInBits()) {
4019     case 8:
4020       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4021       break;
4022     case 16:
4023       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4024       break;
4025     default:
4026       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4027       break;
4028     }
4029   }
4030 
4031   Register LoadDstReg;
4032 
4033   bool IsExtLoad =
4034       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4035   LLT UnpackedTy = Ty.changeElementSize(32);
4036 
4037   if (IsExtLoad)
4038     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4039   else if (Unpacked && IsD16 && Ty.isVector())
4040     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4041   else
4042     LoadDstReg = Dst;
4043 
4044   auto MIB = B.buildInstr(Opc)
4045     .addDef(LoadDstReg)         // vdata
4046     .addUse(RSrc)               // rsrc
4047     .addUse(VIndex)             // vindex
4048     .addUse(VOffset)            // voffset
4049     .addUse(SOffset)            // soffset
4050     .addImm(ImmOffset);         // offset(imm)
4051 
4052   if (IsTyped)
4053     MIB.addImm(Format);
4054 
4055   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4056      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4057      .addMemOperand(MMO);
4058 
4059   if (LoadDstReg != Dst) {
4060     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4061 
4062     // The result was widened for the extending load; truncate back to the original type.
4063     if (IsExtLoad)
4064       B.buildTrunc(Dst, LoadDstReg);
4065     else {
4066       // Repack to original 16-bit vector result
4067       // FIXME: G_TRUNC should work, but legalization currently fails
4068       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4069       SmallVector<Register, 4> Repack;
4070       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4071         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4072       B.buildMerge(Dst, Repack);
4073     }
4074   }
4075 
4076   MI.eraseFromParent();
4077   return true;
4078 }
4079 
4080 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4081                                                MachineIRBuilder &B,
4082                                                bool IsInc) const {
4083   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4084                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4085   B.buildInstr(Opc)
4086     .addDef(MI.getOperand(0).getReg())
4087     .addUse(MI.getOperand(2).getReg())
4088     .addUse(MI.getOperand(3).getReg())
4089     .cloneMemRefs(MI);
4090   MI.eraseFromParent();
4091   return true;
4092 }
4093 
4094 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4095   switch (IntrID) {
4096   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4097   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4098     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4099   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4100   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4101     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4102   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4103   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4104     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4105   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4106   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4107     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4108   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4109   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4110     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4111   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4112   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4113     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4114   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4115   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4116     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4117   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4118   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4119     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4120   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4121   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4122     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4123   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4124   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4125     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4126   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4127   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4128     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4129   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4130   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4131     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4132   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4133   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4134     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4135   case Intrinsic::amdgcn_buffer_atomic_fadd:
4136   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4137   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4138     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4139   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4140   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4141     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4142   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4143   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4144     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4145   default:
4146     llvm_unreachable("unhandled atomic opcode");
4147   }
4148 }
4149 
4150 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4151                                                MachineIRBuilder &B,
4152                                                Intrinsic::ID IID) const {
4153   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4154                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4155   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4156 
4157   Register Dst;
4158 
4159   int OpOffset = 0;
4160   if (HasReturn) {
4161     // A few FP atomics do not support return values.
4162     Dst = MI.getOperand(0).getReg();
4163   } else {
4164     OpOffset = -1;
4165   }
4166 
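  // From here on OpOffset compensates for the missing result def (the
  // no-return FP atomic forms) and, below, for the extra cmpswap and vindex
  // operands.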
4167   Register VData = MI.getOperand(2 + OpOffset).getReg();
4168   Register CmpVal;
4169 
4170   if (IsCmpSwap) {
4171     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4172     ++OpOffset;
4173   }
4174 
4175   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
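  // Operand count of the struct (vindex) form: intrinsic ID, vdata, [cmp,]
  // rsrc, vindex, voffset, soffset, aux, plus the result def when present.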
4176   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4177 
4178   // The struct intrinsic variants add one additional operand over raw.
4179   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4180   Register VIndex;
4181   if (HasVIndex) {
4182     VIndex = MI.getOperand(4 + OpOffset).getReg();
4183     ++OpOffset;
4184   } else {
4185     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4186   }
4187 
4188   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4189   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4190   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4191 
4192   MachineMemOperand *MMO = *MI.memoperands_begin();
4193 
4194   unsigned ImmOffset;
4195   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4196   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4197 
4198   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4199 
4200   if (HasReturn)
4201     MIB.addDef(Dst);
4202 
4203   MIB.addUse(VData); // vdata
4204 
4205   if (IsCmpSwap)
4206     MIB.addReg(CmpVal);
4207 
4208   MIB.addUse(RSrc)               // rsrc
4209      .addUse(VIndex)             // vindex
4210      .addUse(VOffset)            // voffset
4211      .addUse(SOffset)            // soffset
4212      .addImm(ImmOffset)          // offset(imm)
4213      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4214      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4215      .addMemOperand(MMO);
4216 
4217   MI.eraseFromParent();
4218   return true;
4219 }
4220 
4221 /// Pack the s16 typed address operands of \p MI into dword sized
4222 /// <2 x s16> registers, appending the results to \p PackedAddrs.
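/// For example, a pair of 16-bit coordinates (s, t) becomes a single
/// <2 x s16> build_vector; an odd trailing coordinate or gradient is padded
/// with an undef high half.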
4223 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4224                                       SmallVectorImpl<Register> &PackedAddrs,
4225                                       unsigned ArgOffset,
4226                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4227                                       bool IsA16, bool IsG16) {
4228   const LLT S16 = LLT::scalar(16);
4229   const LLT V2S16 = LLT::fixed_vector(2, 16);
4230   auto EndIdx = Intr->VAddrEnd;
4231 
4232   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4233     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4234     if (!SrcOp.isReg())
4235       continue; // _L to _LZ may have eliminated this.
4236 
4237     Register AddrReg = SrcOp.getReg();
4238 
4239     if ((I < Intr->GradientStart) ||
4240         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4241         (I >= Intr->CoordStart && !IsA16)) {
4242       // Handle any gradient or coordinate operands that should not be packed
4243       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4244       PackedAddrs.push_back(AddrReg);
4245     } else {
4246       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4247       // derivatives dx/dh and dx/dv are packed with undef.
4248       if (((I + 1) >= EndIdx) ||
4249           ((Intr->NumGradients / 2) % 2 == 1 &&
4250            (I == static_cast<unsigned>(Intr->GradientStart +
4251                                        (Intr->NumGradients / 2) - 1) ||
4252             I == static_cast<unsigned>(Intr->GradientStart +
4253                                        Intr->NumGradients - 1))) ||
4254           // Check for _L to _LZ optimization
4255           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4256         PackedAddrs.push_back(
4257             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4258                 .getReg(0));
4259       } else {
4260         PackedAddrs.push_back(
4261             B.buildBuildVector(
4262                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4263                 .getReg(0));
4264         ++I;
4265       }
4266     }
4267   }
4268 }
4269 
4270 /// Convert from separate vaddr components to a single vector address register,
4271 /// and replace the remaining operands with $noreg.
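/// For example, five s32 vaddr operands are collected into one <5 x s32>
/// build_vector placed in the first vaddr slot, and the remaining four slots
/// are cleared to $noreg.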
4272 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4273                                      int DimIdx, int NumVAddrs) {
4274   const LLT S32 = LLT::scalar(32);
4275 
4276   SmallVector<Register, 8> AddrRegs;
4277   for (int I = 0; I != NumVAddrs; ++I) {
4278     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4279     if (SrcOp.isReg()) {
4280       AddrRegs.push_back(SrcOp.getReg());
4281       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4282     }
4283   }
4284 
4285   int NumAddrRegs = AddrRegs.size();
4286   if (NumAddrRegs != 1) {
4287     // Above 8 elements round up to next power of 2 (i.e. 16).
4288     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4289       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4290       auto Undef = B.buildUndef(S32);
4291       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4292       NumAddrRegs = RoundedNumRegs;
4293     }
4294 
4295     auto VAddr =
4296         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4297     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4298   }
4299 
4300   for (int I = 1; I != NumVAddrs; ++I) {
4301     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4302     if (SrcOp.isReg())
4303       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4304   }
4305 }
4306 
4307 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4308 ///
4309 /// Depending on the subtarget, load/store with 16-bit element data need to be
4310 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4311 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4312 /// registers.
4313 ///
4314 /// We don't want to directly select image instructions just yet, but we also
4315 /// want to expose all register repacking to the legalizer/combiners. We also
4316 /// don't want a selected instruction entering RegBankSelect. In order to avoid
4317 /// defining a multitude of intermediate image instructions, directly hack on
4318 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4319 /// padding the now-unnecessary arguments with $noreg.
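///
/// For example (an illustrative sketch, not a verbatim test): an image sample
/// returning <4 x s16> on an unpacked d16 subtarget is rewritten into a
/// G_AMDGPU_INTRIN_IMAGE_LOAD producing <4 x s32>, whose components are then
/// truncated and rebuilt into the original <4 x s16> value after the
/// instruction.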
4320 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4321     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4322     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4323 
4324   const unsigned NumDefs = MI.getNumExplicitDefs();
4325   const unsigned ArgOffset = NumDefs + 1;
4326   bool IsTFE = NumDefs == 2;
4327   // We are only processing the operands of d16 image operations on subtargets
4328   // that use the unpacked register layout, or need to repack the TFE result.
4329 
4330   // TODO: Do we need to guard against already legalized intrinsics?
4331   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4332       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4333 
4334   MachineRegisterInfo *MRI = B.getMRI();
4335   const LLT S32 = LLT::scalar(32);
4336   const LLT S16 = LLT::scalar(16);
4337   const LLT V2S16 = LLT::fixed_vector(2, 16);
4338 
4339   unsigned DMask = 0;
4340 
4341   // Check for 16 bit addresses and pack if true.
4342   LLT GradTy =
4343       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4344   LLT AddrTy =
4345       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4346   const bool IsG16 = GradTy == S16;
4347   const bool IsA16 = AddrTy == S16;
4348 
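  // DMaskLanes is the number of components the instruction actually reads or
  // writes: gather4 always produces 4, otherwise it is the population count of
  // the dmask.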
4349   int DMaskLanes = 0;
4350   if (!BaseOpcode->Atomic) {
4351     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4352     if (BaseOpcode->Gather4) {
4353       DMaskLanes = 4;
4354     } else if (DMask != 0) {
4355       DMaskLanes = countPopulation(DMask);
4356     } else if (!IsTFE && !BaseOpcode->Store) {
4357       // If dmask is 0, this is a no-op load. This can be eliminated.
4358       B.buildUndef(MI.getOperand(0));
4359       MI.eraseFromParent();
4360       return true;
4361     }
4362   }
4363 
4364   Observer.changingInstr(MI);
4365   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4366 
4367   unsigned NewOpcode = NumDefs == 0 ?
4368     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4369 
4370   // Track that we legalized this
4371   MI.setDesc(B.getTII().get(NewOpcode));
4372 
4373   // We expect an error flag since TFE is on and dmask is 0. Force dmask to be
4374   // at least 1, otherwise the instruction will fail.
4375   if (IsTFE && DMask == 0) {
4376     DMask = 0x1;
4377     DMaskLanes = 1;
4378     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4379   }
4380 
4381   if (BaseOpcode->Atomic) {
4382     Register VData0 = MI.getOperand(2).getReg();
4383     LLT Ty = MRI->getType(VData0);
4384 
4385     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4386     if (Ty.isVector())
4387       return false;
4388 
4389     if (BaseOpcode->AtomicX2) {
4390       Register VData1 = MI.getOperand(3).getReg();
4391       // The two values are packed in one register.
4392       LLT PackedTy = LLT::fixed_vector(2, Ty);
4393       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4394       MI.getOperand(2).setReg(Concat.getReg(0));
4395       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4396     }
4397   }
4398 
4399   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4400 
4401   // Optimize _L to _LZ when _L is zero
4402   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4403           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4404     const ConstantFP *ConstantLod;
4405 
4406     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4407                  m_GFCst(ConstantLod))) {
4408       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4409         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4410         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4411             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4412                                                       Intr->Dim);
4413 
4414         // The starting indexes should remain in the same place.
4415         --CorrectedNumVAddrs;
4416 
4417         MI.getOperand(MI.getNumExplicitDefs())
4418             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4419         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4420         Intr = NewImageDimIntr;
4421       }
4422     }
4423   }
4424 
4425   // Optimize _mip away, when 'lod' is zero
4426   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4427     int64_t ConstantLod;
4428     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4429                  m_ICst(ConstantLod))) {
4430       if (ConstantLod == 0) {
4431         // TODO: Change the intrinsic opcode and remove the operand instead of
4432         // replacing it with 0, as is done for the _L to _LZ handling above.
4433         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4434         --CorrectedNumVAddrs;
4435       }
4436     }
4437   }
4438 
4439   // Rewrite the addressing register layout before doing anything else.
4440   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4441     // 16 bit gradients are supported, but are tied to the A16 control
4442     // so both gradients and addresses must be 16 bit
4443     return false;
4444   }
4445 
4446   if (IsA16 && !ST.hasA16()) {
4447     // A16 not supported
4448     return false;
4449   }
4450 
4451   if (IsA16 || IsG16) {
4452     if (Intr->NumVAddrs > 1) {
4453       SmallVector<Register, 4> PackedRegs;
4454 
4455       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4456                                 IsG16);
4457 
4458       // See also below in the non-a16 branch
4459       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4460                           PackedRegs.size() <= ST.getNSAMaxSize();
4461 
4462       if (!UseNSA && PackedRegs.size() > 1) {
4463         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4464         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4465         PackedRegs[0] = Concat.getReg(0);
4466         PackedRegs.resize(1);
4467       }
4468 
4469       const unsigned NumPacked = PackedRegs.size();
4470       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4471         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4472         if (!SrcOp.isReg()) {
4473           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4474           continue;
4475         }
4476 
4477         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4478 
4479         if (I - Intr->VAddrStart < NumPacked)
4480           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4481         else
4482           SrcOp.setReg(AMDGPU::NoRegister);
4483       }
4484     }
4485   } else {
4486     // If the register allocator cannot place the address registers contiguously
4487     // without introducing moves, then using the non-sequential address encoding
4488     // is always preferable, since it saves VALU instructions and is usually a
4489     // wash in terms of code size or even better.
4490     //
4491     // However, we currently have no way of hinting to the register allocator
4492     // that MIMG addresses should be placed contiguously when it is possible to
4493     // do so, so force non-NSA for the common 2-address case as a heuristic.
4494     //
4495     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4496     // allocation when possible.
4497     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4498                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4499 
4500     if (!UseNSA && Intr->NumVAddrs > 1)
4501       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4502                                Intr->NumVAddrs);
4503   }
4504 
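  // Record whether addresses (bit 0) and gradients (bit 1) are 16 bit in a
  // trailing immediate so later passes don't have to rediscover it.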
4505   int Flags = 0;
4506   if (IsA16)
4507     Flags |= 1;
4508   if (IsG16)
4509     Flags |= 2;
4510   MI.addOperand(MachineOperand::CreateImm(Flags));
4511 
4512   if (BaseOpcode->Store) { // No TFE for stores?
4513     // TODO: Handle dmask trim
4514     Register VData = MI.getOperand(1).getReg();
4515     LLT Ty = MRI->getType(VData);
4516     if (!Ty.isVector() || Ty.getElementType() != S16)
4517       return true;
4518 
4519     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4520     if (RepackedReg != VData) {
4521       MI.getOperand(1).setReg(RepackedReg);
4522     }
4523 
4524     return true;
4525   }
4526 
4527   Register DstReg = MI.getOperand(0).getReg();
4528   LLT Ty = MRI->getType(DstReg);
4529   const LLT EltTy = Ty.getScalarType();
4530   const bool IsD16 = Ty.getScalarType() == S16;
4531   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4532 
4533   // Confirm that the return type is large enough for the dmask specified
4534   if (NumElts < DMaskLanes)
4535     return false;
4536 
4537   if (NumElts > 4 || DMaskLanes > 4)
4538     return false;
4539 
4540   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4541   const LLT AdjustedTy =
4542       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4543 
4544   // The raw dword aligned data component of the load. The only legal cases
4545   // where this matters should be when using the packed D16 format, for
4546   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4547   LLT RoundedTy;
4548 
4549   // S32 vector to cover all data, plus the TFE result element.
4550   LLT TFETy;
4551 
4552   // Register type to use for each loaded component. Will be S32 or V2S16.
4553   LLT RegTy;
4554 
4555   if (IsD16 && ST.hasUnpackedD16VMem()) {
4556     RoundedTy =
4557         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4558     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4559     RegTy = S32;
4560   } else {
4561     unsigned EltSize = EltTy.getSizeInBits();
4562     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4563     unsigned RoundedSize = 32 * RoundedElts;
4564     RoundedTy = LLT::scalarOrVector(
4565         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4566     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4567     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4568   }
4569 
4570   // The return type does not need adjustment.
4571   // TODO: Should we change s16 case to s32 or <2 x s16>?
4572   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4573     return true;
4574 
4575   Register Dst1Reg;
4576 
4577   // Insert after the instruction.
4578   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4579 
4580   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4581   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4582   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4583   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4584 
4585   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4586 
4587   MI.getOperand(0).setReg(NewResultReg);
4588 
4589   // In the IR, TFE is supposed to be used with a 2 element struct return
4590   // type. The instruction really returns these two values in one contiguous
4591   // register, with one additional dword beyond the loaded data. Rewrite the
4592   // return type to use a single register result.
4593 
4594   if (IsTFE) {
4595     Dst1Reg = MI.getOperand(1).getReg();
4596     if (MRI->getType(Dst1Reg) != S32)
4597       return false;
4598 
4599     // TODO: Make sure the TFE operand bit is set.
4600     MI.RemoveOperand(1);
4601 
4602     // Handle the easy case that requires no repack instructions.
4603     if (Ty == S32) {
4604       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4605       return true;
4606     }
4607   }
4608 
4609   // Now figure out how to copy the new result register back into the old
4610   // result.
4611   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4612 
4613   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4614 
4615   if (ResultNumRegs == 1) {
4616     assert(!IsTFE);
4617     ResultRegs[0] = NewResultReg;
4618   } else {
4619     // We have to repack into a new vector of some kind.
4620     for (int I = 0; I != NumDataRegs; ++I)
4621       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4622     B.buildUnmerge(ResultRegs, NewResultReg);
4623 
4624     // Drop the final TFE element to get the data part. The TFE result is
4625     // directly written to the right place already.
4626     if (IsTFE)
4627       ResultRegs.resize(NumDataRegs);
4628   }
4629 
4630   // For an s16 scalar result, we form an s32 result with a truncate regardless
4631   // of packed vs. unpacked.
4632   if (IsD16 && !Ty.isVector()) {
4633     B.buildTrunc(DstReg, ResultRegs[0]);
4634     return true;
4635   }
4636 
4637   // Avoid a build/concat_vector of 1 entry.
4638   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4639     B.buildBitcast(DstReg, ResultRegs[0]);
4640     return true;
4641   }
4642 
4643   assert(Ty.isVector());
4644 
4645   if (IsD16) {
4646     // For packed D16 results with TFE enabled, all the data components are
4647     // S32. Cast back to the expected type.
4648     //
4649     // TODO: We don't really need to load into s32 elements. We would only need
4650     // cast for the TFE result if a multiple of v2s16 was used.
4651     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4652       for (Register &Reg : ResultRegs)
4653         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4654     } else if (ST.hasUnpackedD16VMem()) {
4655       for (Register &Reg : ResultRegs)
4656         Reg = B.buildTrunc(S16, Reg).getReg(0);
4657     }
4658   }
4659 
4660   auto padWithUndef = [&](LLT Ty, int NumElts) {
4661     if (NumElts == 0)
4662       return;
4663     Register Undef = B.buildUndef(Ty).getReg(0);
4664     for (int I = 0; I != NumElts; ++I)
4665       ResultRegs.push_back(Undef);
4666   };
4667 
4668   // Pad out any elements eliminated due to the dmask.
4669   LLT ResTy = MRI->getType(ResultRegs[0]);
4670   if (!ResTy.isVector()) {
4671     padWithUndef(ResTy, NumElts - ResultRegs.size());
4672     B.buildBuildVector(DstReg, ResultRegs);
4673     return true;
4674   }
4675 
4676   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4677   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4678 
4679   // Deal with the one annoying legal case.
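  // A <3 x s16> result is padded out to <6 x s16> (three v2s16 registers) and
  // the unused tail is discarded with an unmerge.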
4680   const LLT V3S16 = LLT::fixed_vector(3, 16);
4681   if (Ty == V3S16) {
4682     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4683     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4684     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4685     return true;
4686   }
4687 
4688   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4689   B.buildConcatVectors(DstReg, ResultRegs);
4690   return true;
4691 }
4692 
4693 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4694   LegalizerHelper &Helper, MachineInstr &MI) const {
4695   MachineIRBuilder &B = Helper.MIRBuilder;
4696   GISelChangeObserver &Observer = Helper.Observer;
4697 
4698   Register Dst = MI.getOperand(0).getReg();
4699   LLT Ty = B.getMRI()->getType(Dst);
4700   unsigned Size = Ty.getSizeInBits();
4701   MachineFunction &MF = B.getMF();
4702 
4703   Observer.changingInstr(MI);
4704 
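  // Normalize the result type: types that should be handled as plain integers
  // are bitcast here, and non-power-of-2 sizes (e.g. 96 bits) are widened
  // further below.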
4705   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4706     Ty = getBitcastRegisterType(Ty);
4707     Helper.bitcastDst(MI, Ty, 0);
4708     Dst = MI.getOperand(0).getReg();
4709     B.setInsertPt(B.getMBB(), MI);
4710   }
4711 
4712   // FIXME: We don't really need this intermediate instruction. The intrinsic
4713   // should be fixed to have a memory operand. Since it's readnone, we're not
4714   // allowed to add one.
4715   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4716   MI.RemoveOperand(1); // Remove intrinsic ID
4717 
4718   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4719   // TODO: Should this use datalayout alignment?
4720   const unsigned MemSize = (Size + 7) / 8;
4721   const Align MemAlign(4);
4722   MachineMemOperand *MMO = MF.getMachineMemOperand(
4723       MachinePointerInfo(),
4724       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4725           MachineMemOperand::MOInvariant,
4726       MemSize, MemAlign);
4727   MI.addMemOperand(MF, MMO);
4728 
4729   // There are no 96-bit result scalar loads, but widening to 128-bit should
4730   // always be legal. We may need to restore this to a 96-bit result if it turns
4731   // out this needs to be converted to a vector load during RegBankSelect.
4732   if (!isPowerOf2_32(Size)) {
4733     if (Ty.isVector())
4734       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4735     else
4736       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4737   }
4738 
4739   Observer.changedInstr(MI);
4740   return true;
4741 }
4742 
4743 // TODO: Move to selection
4744 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4745                                                 MachineRegisterInfo &MRI,
4746                                                 MachineIRBuilder &B) const {
4747   if (!ST.isTrapHandlerEnabled() ||
4748       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4749     return legalizeTrapEndpgm(MI, MRI, B);
4750 
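  // HSA ABI v2 and v3 expect the queue pointer in SGPR0_SGPR1; from v4 on the
  // trap handler can look up the doorbell ID itself when the subtarget
  // supports it.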
4751   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4752     switch (*HsaAbiVer) {
4753     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4754     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4755       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4756     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4757       return ST.supportsGetDoorbellID() ?
4758           legalizeTrapHsa(MI, MRI, B) :
4759           legalizeTrapHsaQueuePtr(MI, MRI, B);
4760     }
4761   }
4762 
4763   llvm_unreachable("Unknown trap handler");
4764 }
4765 
4766 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4767     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4768   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4769   MI.eraseFromParent();
4770   return true;
4771 }
4772 
4773 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4774     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4775   // Pass queue pointer to trap handler as input, and insert trap instruction
4776   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4777   Register LiveIn =
4778     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4779   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4780     return false;
4781 
4782   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4783   B.buildCopy(SGPR01, LiveIn);
4784   B.buildInstr(AMDGPU::S_TRAP)
4785       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4786       .addReg(SGPR01, RegState::Implicit);
4787 
4788   MI.eraseFromParent();
4789   return true;
4790 }
4791 
4792 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4793     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4794   B.buildInstr(AMDGPU::S_TRAP)
4795       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4796   MI.eraseFromParent();
4797   return true;
4798 }
4799 
4800 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4801     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4802   // If this is a non-HSA path or the trap handler is disabled, report a
4803   // warning instead of emitting a trap.
4804   if (!ST.isTrapHandlerEnabled() ||
4805       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4806     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4807                                      "debugtrap handler not supported",
4808                                      MI.getDebugLoc(), DS_Warning);
4809     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4810     Ctx.diagnose(NoTrap);
4811   } else {
4812     // Insert debug-trap instruction
4813     B.buildInstr(AMDGPU::S_TRAP)
4814         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4815   }
4816 
4817   MI.eraseFromParent();
4818   return true;
4819 }
4820 
4821 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4822                                                MachineIRBuilder &B) const {
4823   MachineRegisterInfo &MRI = *B.getMRI();
4824   const LLT S16 = LLT::scalar(16);
4825   const LLT S32 = LLT::scalar(32);
4826 
4827   Register DstReg = MI.getOperand(0).getReg();
4828   Register NodePtr = MI.getOperand(2).getReg();
4829   Register RayExtent = MI.getOperand(3).getReg();
4830   Register RayOrigin = MI.getOperand(4).getReg();
4831   Register RayDir = MI.getOperand(5).getReg();
4832   Register RayInvDir = MI.getOperand(6).getReg();
4833   Register TDescr = MI.getOperand(7).getReg();
4834 
4835   if (!ST.hasGFX10_AEncoding()) {
4836     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4837                                         "intrinsic not supported on subtarget",
4838                                         MI.getDebugLoc());
4839     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4840     return false;
4841   }
4842 
4843   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4844   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
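  // vaddr dwords: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
  // ray_dir/ray_inv_dir (3 dwords when packed as 16-bit, otherwise 3 + 3).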
4845   const unsigned NumVDataDwords = 4;
4846   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4847   const bool UseNSA =
4848       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4849   const unsigned BaseOpcodes[2][2] = {
4850       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4851       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4852        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4853   int Opcode;
4854   if (UseNSA) {
4855     Opcode =
4856         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4857                               NumVDataDwords, NumVAddrDwords);
4858   } else {
4859     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4860                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4861                                    PowerOf2Ceil(NumVAddrDwords));
4862   }
4863   assert(Opcode != -1);
4864 
4865   SmallVector<Register, 12> Ops;
4866   if (Is64) {
4867     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4868     Ops.push_back(Unmerge.getReg(0));
4869     Ops.push_back(Unmerge.getReg(1));
4870   } else {
4871     Ops.push_back(NodePtr);
4872   }
4873   Ops.push_back(RayExtent);
4874 
4875   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4876     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4877     Ops.push_back(Unmerge.getReg(0));
4878     Ops.push_back(Unmerge.getReg(1));
4879     Ops.push_back(Unmerge.getReg(2));
4880   };
4881 
4882   packLanes(RayOrigin);
4883   if (IsA16) {
4884     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4885     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4886     Register R1 = MRI.createGenericVirtualRegister(S32);
4887     Register R2 = MRI.createGenericVirtualRegister(S32);
4888     Register R3 = MRI.createGenericVirtualRegister(S32);
4889     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4890     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4891     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4892     Ops.push_back(R1);
4893     Ops.push_back(R2);
4894     Ops.push_back(R3);
4895   } else {
4896     packLanes(RayDir);
4897     packLanes(RayInvDir);
4898   }
4899 
4900   if (!UseNSA) {
4901     // Build a single vector containing all the operands so far prepared.
4902     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4903     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4904     Ops.clear();
4905     Ops.push_back(MergedOps);
4906   }
4907 
4908   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4909     .addDef(DstReg)
4910     .addImm(Opcode);
4911 
4912   for (Register R : Ops) {
4913     MIB.addUse(R);
4914   }
4915 
4916   MIB.addUse(TDescr)
4917      .addImm(IsA16 ? 1 : 0)
4918      .cloneMemRefs(MI);
4919 
4920   MI.eraseFromParent();
4921   return true;
4922 }
4923 
4924 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4925                                             MachineInstr &MI) const {
4926   MachineIRBuilder &B = Helper.MIRBuilder;
4927   MachineRegisterInfo &MRI = *B.getMRI();
4928 
4929   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4930   auto IntrID = MI.getIntrinsicID();
4931   switch (IntrID) {
4932   case Intrinsic::amdgcn_if:
4933   case Intrinsic::amdgcn_else: {
4934     MachineInstr *Br = nullptr;
4935     MachineBasicBlock *UncondBrTarget = nullptr;
4936     bool Negated = false;
4937     if (MachineInstr *BrCond =
4938             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4939       const SIRegisterInfo *TRI
4940         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4941 
4942       Register Def = MI.getOperand(1).getReg();
4943       Register Use = MI.getOperand(3).getReg();
4944 
4945       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4946 
4947       if (Negated)
4948         std::swap(CondBrTarget, UncondBrTarget);
4949 
4950       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4951       if (IntrID == Intrinsic::amdgcn_if) {
4952         B.buildInstr(AMDGPU::SI_IF)
4953           .addDef(Def)
4954           .addUse(Use)
4955           .addMBB(UncondBrTarget);
4956       } else {
4957         B.buildInstr(AMDGPU::SI_ELSE)
4958             .addDef(Def)
4959             .addUse(Use)
4960             .addMBB(UncondBrTarget);
4961       }
4962 
4963       if (Br) {
4964         Br->getOperand(0).setMBB(CondBrTarget);
4965       } else {
4966         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4967         // since we're swapping branch targets it needs to be reinserted.
4968         // FIXME: IRTranslator should probably not do this
4969         B.buildBr(*CondBrTarget);
4970       }
4971 
4972       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4973       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4974       MI.eraseFromParent();
4975       BrCond->eraseFromParent();
4976       return true;
4977     }
4978 
4979     return false;
4980   }
4981   case Intrinsic::amdgcn_loop: {
4982     MachineInstr *Br = nullptr;
4983     MachineBasicBlock *UncondBrTarget = nullptr;
4984     bool Negated = false;
4985     if (MachineInstr *BrCond =
4986             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4987       const SIRegisterInfo *TRI
4988         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4989 
4990       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4991       Register Reg = MI.getOperand(2).getReg();
4992 
4993       if (Negated)
4994         std::swap(CondBrTarget, UncondBrTarget);
4995 
4996       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4997       B.buildInstr(AMDGPU::SI_LOOP)
4998         .addUse(Reg)
4999         .addMBB(UncondBrTarget);
5000 
5001       if (Br)
5002         Br->getOperand(0).setMBB(CondBrTarget);
5003       else
5004         B.buildBr(*CondBrTarget);
5005 
5006       MI.eraseFromParent();
5007       BrCond->eraseFromParent();
5008       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5009       return true;
5010     }
5011 
5012     return false;
5013   }
5014   case Intrinsic::amdgcn_kernarg_segment_ptr:
5015     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5016       // This only makes sense to call in a kernel, so just lower to null.
5017       B.buildConstant(MI.getOperand(0).getReg(), 0);
5018       MI.eraseFromParent();
5019       return true;
5020     }
5021 
5022     return legalizePreloadedArgIntrin(
5023       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5024   case Intrinsic::amdgcn_implicitarg_ptr:
5025     return legalizeImplicitArgPtr(MI, MRI, B);
5026   case Intrinsic::amdgcn_workitem_id_x:
5027     return legalizePreloadedArgIntrin(MI, MRI, B,
5028                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5029   case Intrinsic::amdgcn_workitem_id_y:
5030     return legalizePreloadedArgIntrin(MI, MRI, B,
5031                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5032   case Intrinsic::amdgcn_workitem_id_z:
5033     return legalizePreloadedArgIntrin(MI, MRI, B,
5034                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5035   case Intrinsic::amdgcn_workgroup_id_x:
5036     return legalizePreloadedArgIntrin(MI, MRI, B,
5037                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5038   case Intrinsic::amdgcn_workgroup_id_y:
5039     return legalizePreloadedArgIntrin(MI, MRI, B,
5040                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5041   case Intrinsic::amdgcn_workgroup_id_z:
5042     return legalizePreloadedArgIntrin(MI, MRI, B,
5043                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5044   case Intrinsic::amdgcn_dispatch_ptr:
5045     return legalizePreloadedArgIntrin(MI, MRI, B,
5046                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5047   case Intrinsic::amdgcn_queue_ptr:
5048     return legalizePreloadedArgIntrin(MI, MRI, B,
5049                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5050   case Intrinsic::amdgcn_implicit_buffer_ptr:
5051     return legalizePreloadedArgIntrin(
5052       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5053   case Intrinsic::amdgcn_dispatch_id:
5054     return legalizePreloadedArgIntrin(MI, MRI, B,
5055                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5056   case Intrinsic::amdgcn_fdiv_fast:
5057     return legalizeFDIVFastIntrin(MI, MRI, B);
5058   case Intrinsic::amdgcn_is_shared:
5059     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5060   case Intrinsic::amdgcn_is_private:
5061     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5062   case Intrinsic::amdgcn_wavefrontsize: {
5063     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5064     MI.eraseFromParent();
5065     return true;
5066   }
5067   case Intrinsic::amdgcn_s_buffer_load:
5068     return legalizeSBufferLoad(Helper, MI);
5069   case Intrinsic::amdgcn_raw_buffer_store:
5070   case Intrinsic::amdgcn_struct_buffer_store:
5071     return legalizeBufferStore(MI, MRI, B, false, false);
5072   case Intrinsic::amdgcn_raw_buffer_store_format:
5073   case Intrinsic::amdgcn_struct_buffer_store_format:
5074     return legalizeBufferStore(MI, MRI, B, false, true);
5075   case Intrinsic::amdgcn_raw_tbuffer_store:
5076   case Intrinsic::amdgcn_struct_tbuffer_store:
5077     return legalizeBufferStore(MI, MRI, B, true, true);
5078   case Intrinsic::amdgcn_raw_buffer_load:
5079   case Intrinsic::amdgcn_struct_buffer_load:
5080     return legalizeBufferLoad(MI, MRI, B, false, false);
5081   case Intrinsic::amdgcn_raw_buffer_load_format:
5082   case Intrinsic::amdgcn_struct_buffer_load_format:
5083     return legalizeBufferLoad(MI, MRI, B, true, false);
5084   case Intrinsic::amdgcn_raw_tbuffer_load:
5085   case Intrinsic::amdgcn_struct_tbuffer_load:
5086     return legalizeBufferLoad(MI, MRI, B, true, true);
5087   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5088   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5089   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5090   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5091   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5092   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5093   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5094   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5095   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5096   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5097   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5098   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5099   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5100   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5101   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5102   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5103   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5104   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5105   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5106   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5107   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5108   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5109   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5110   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5111   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5112   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5113   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5114   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5115   case Intrinsic::amdgcn_buffer_atomic_fadd:
5116   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5117   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5118   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5119   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5120     return legalizeBufferAtomic(MI, B, IntrID);
5121   case Intrinsic::amdgcn_atomic_inc:
5122     return legalizeAtomicIncDec(MI, B, true);
5123   case Intrinsic::amdgcn_atomic_dec:
5124     return legalizeAtomicIncDec(MI, B, false);
5125   case Intrinsic::trap:
5126     return legalizeTrapIntrinsic(MI, MRI, B);
5127   case Intrinsic::debugtrap:
5128     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5129   case Intrinsic::amdgcn_rsq_clamp:
5130     return legalizeRsqClampIntrinsic(MI, MRI, B);
5131   case Intrinsic::amdgcn_ds_fadd:
5132   case Intrinsic::amdgcn_ds_fmin:
5133   case Intrinsic::amdgcn_ds_fmax:
5134     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5135   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5136     return legalizeBVHIntrinsic(MI, B);
5137   default: {
5138     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5139             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5140       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5141     return true;
5142   }
5143   }
5144 
5145   return true;
5146 }
5147