1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements up to the next power of two.
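// For illustration: <3 x s16> rounds up to <4 x s16> and <5 x s32> rounds up to
// <8 x s32>, since Log2_32_Ceil(3) == 2 (1 << 2 == 4) and Log2_32_Ceil(5) == 3.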
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits up to the next power of two.
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
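// For illustration: an s24 input rounds up to s32, and an s65 input would round
// all the way up to s128.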
61 
62 /// \returns true if this is an odd-sized vector which should be widened by adding
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
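// For illustration: <4 x s32> (128 bits) is cut into 2 pieces and becomes
// <2 x s32>, while <3 x s64> (192 bits) is cut into 3 pieces and collapses to a
// plain s64 via scalarOrVector. This mutation is only applied to vectors wider
// than 64 bits (see the G_AND/G_OR/G_XOR rules below).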
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
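// For illustration: <3 x s16> is 48 bits, which needs two 32-bit words
// (NextMul32 == 2, i.e. 64 bits), so NewNumElts == (64 + 15) / 16 == 4 and the
// result is <4 x s16>. Note that NextMul32 holds the number of 32-bit words,
// not the rounded bit count.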
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
137   LLT CoercedTy;
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
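// For illustration: <8 x s8> (64 bits) is coerced to <2 x s32> and <6 x s16>
// (96 bits) to <3 x s32>, in addition to the small cases noted above.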
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
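// For illustration: s32, s96, <2 x s16> and <4 x s32> all count as register
// types, while s48 (not a multiple of 32 bits) and <3 x s16> (48 bits) do not.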
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
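// For illustration: under these rules a <3 x s32> global load (a 96-bit access)
// is only legal when the subtarget has dwordx3 load/stores, and an extending
// load into s64 is rejected because extloads are only accepted into 32-bit
// registers.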
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
334 // work around this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will workaround this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
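// For illustration: with the workaround enabled (the default), a <6 x s16>
// load or store (96 bits, 16-bit elements) returns true here and is later
// bitcast to <3 x s32> by the load/store rules below, while <4 x s32> returns
// false and is handled directly.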
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note this is the case where the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
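// For illustration: a 96-bit load on a subtarget without dwordx3 load/stores,
// known to be 128-bit aligned, is widened to a 128-bit access provided the
// target also reports the rounded access as fast; with only 32-bit alignment
// the load is left alone.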
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
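  // A rough sketch of how the G_PHI rules above compose: an s8 phi is widened
  // to s16 by clampScalar, a <3 x s16> phi gains one element and becomes
  // <4 x s16> via moreElementsIf/oneMoreElement, and a <17 x s32> phi, which is
  // not in the explicitly legal set, is broken down by clampMaxNumElements.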
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .clampScalar(0, S16, S32)
536       .clampMaxNumElements(0, S16, 2)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32);
539 
540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541       .legalFor({S32, S16, V2S16}) // Clamp modifier
542       .minScalarOrElt(0, S16)
543       .clampMaxNumElements(0, S16, 2)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 32)
546       .lower();
547   } else if (ST.has16BitInsts()) {
548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549       .legalFor({S32, S16})
550       .clampScalar(0, S16, S32)
551       .scalarize(0)
552       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553 
554     // Technically the saturating operations require clamp bit support, but this
555     // was introduced at the same time as 16-bit operations.
556     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557       .legalFor({S32, S16}) // Clamp modifier
558       .minScalar(0, S16)
559       .scalarize(0)
560       .widenScalarToNextPow2(0, 16)
561       .lower();
562 
563     // We're just lowering this, but it helps get a better result to try to
564     // coerce to the desired type first.
565     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566       .minScalar(0, S16)
567       .scalarize(0)
568       .lower();
569   } else {
570     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571       .legalFor({S32})
572       .clampScalar(0, S32, S32)
573       .scalarize(0);
574 
575     if (ST.hasIntClamp()) {
576       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577         .legalFor({S32}) // Clamp modifier.
578         .scalarize(0)
579         .minScalarOrElt(0, S32)
580         .lower();
581     } else {
582       // Clamp bit support was added in VI, along with 16-bit operations.
583       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584         .minScalar(0, S32)
585         .scalarize(0)
586         .lower();
587     }
588 
589     // FIXME: DAG expansion gets better results. The widening uses the smaller
590     // range values and goes for the min/max lowering directly.
591     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592       .minScalar(0, S32)
593       .scalarize(0)
594       .lower();
595   }
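  // Net effect of the three branches above (a sketch): on gfx9+ an s16 or
  // v2s16 G_ADD is legal and selected directly; with only 16-bit instructions
  // a v2s16 add is scalarized into two s16 adds; and on older subtargets an
  // s16 add is widened to s32 by clampScalar before selection.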
596 
597   getActionDefinitionsBuilder(
598       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599       .customFor({S32, S64})
600       .clampScalar(0, S32, S64)
601       .widenScalarToNextPow2(0, 32)
602       .scalarize(0);
603 
604   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605                    .legalFor({S32})
606                    .maxScalarOrElt(0, S32);
607 
608   if (ST.hasVOP3PInsts()) {
609     Mulh
610       .clampMaxNumElements(0, S8, 2)
611       .lowerFor({V2S8});
612   }
613 
614   Mulh
615     .scalarize(0)
616     .lower();
617 
618   // Report legal for any types we can handle anywhere. For the cases only legal
619   // on the SALU, RegBankSelect will be able to re-legalize.
620   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622     .clampScalar(0, S32, S64)
623     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625     .widenScalarToNextPow2(0)
626     .scalarize(0);
627 
628   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630     .legalFor({{S32, S1}, {S32, S32}})
631     .minScalar(0, S32)
632     // TODO: .scalarize(0)
633     .lower();
634 
635   getActionDefinitionsBuilder(G_BITCAST)
636     // Don't worry about the size constraint.
637     .legalIf(all(isRegisterType(0), isRegisterType(1)))
638     .lower();
639 
640 
641   getActionDefinitionsBuilder(G_CONSTANT)
642     .legalFor({S1, S32, S64, S16, GlobalPtr,
643                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644     .legalIf(isPointer(0))
645     .clampScalar(0, S32, S64)
646     .widenScalarToNextPow2(0);
647 
648   getActionDefinitionsBuilder(G_FCONSTANT)
649     .legalFor({S32, S64, S16})
650     .clampScalar(0, S16, S64);
651 
652   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653       .legalIf(isRegisterType(0))
654       // s1 and s16 are special cases because they have legal operations on
655       // them, but don't really occupy registers in the normal way.
656       .legalFor({S1, S16})
657       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658       .clampScalarOrElt(0, S32, MaxScalar)
659       .widenScalarToNextPow2(0, 32)
660       .clampMaxNumElements(0, S32, 16);
661 
662   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663 
664   // If the amount is divergent, we have to do a wave reduction to get the
665   // maximum value, so this is expanded during RegBankSelect.
666   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667     .legalFor({{PrivatePtr, S32}});
668 
669   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670     .customIf(typeIsNot(0, PrivatePtr));
671 
672   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673 
674   auto &FPOpActions = getActionDefinitionsBuilder(
675     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676     .legalFor({S32, S64});
677   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678     .customFor({S32, S64});
679   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680     .customFor({S32, S64});
681 
682   if (ST.has16BitInsts()) {
683     if (ST.hasVOP3PInsts())
684       FPOpActions.legalFor({S16, V2S16});
685     else
686       FPOpActions.legalFor({S16});
687 
688     TrigActions.customFor({S16});
689     FDIVActions.customFor({S16});
690   }
691 
692   auto &MinNumMaxNum = getActionDefinitionsBuilder({
693       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694 
695   if (ST.hasVOP3PInsts()) {
696     MinNumMaxNum.customFor(FPTypesPK16)
697       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698       .clampMaxNumElements(0, S16, 2)
699       .clampScalar(0, S16, S64)
700       .scalarize(0);
701   } else if (ST.has16BitInsts()) {
702     MinNumMaxNum.customFor(FPTypes16)
703       .clampScalar(0, S16, S64)
704       .scalarize(0);
705   } else {
706     MinNumMaxNum.customFor(FPTypesBase)
707       .clampScalar(0, S32, S64)
708       .scalarize(0);
709   }
710 
711   if (ST.hasVOP3PInsts())
712     FPOpActions.clampMaxNumElements(0, S16, 2);
713 
714   FPOpActions
715     .scalarize(0)
716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718   TrigActions
719     .scalarize(0)
720     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721 
722   FDIVActions
723     .scalarize(0)
724     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725 
726   getActionDefinitionsBuilder({G_FNEG, G_FABS})
727     .legalFor(FPTypesPK16)
728     .clampMaxNumElements(0, S16, 2)
729     .scalarize(0)
730     .clampScalar(0, S16, S64);
731 
732   if (ST.has16BitInsts()) {
733     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734       .legalFor({S32, S64, S16})
735       .scalarize(0)
736       .clampScalar(0, S16, S64);
737   } else {
738     getActionDefinitionsBuilder(G_FSQRT)
739       .legalFor({S32, S64})
740       .scalarize(0)
741       .clampScalar(0, S32, S64);
742 
743     if (ST.hasFractBug()) {
744       getActionDefinitionsBuilder(G_FFLOOR)
745         .customFor({S64})
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     } else {
750       getActionDefinitionsBuilder(G_FFLOOR)
751         .legalFor({S32, S64})
752         .scalarize(0)
753         .clampScalar(0, S32, S64);
754     }
755   }
756 
757   getActionDefinitionsBuilder(G_FPTRUNC)
758     .legalFor({{S32, S64}, {S16, S32}})
759     .scalarize(0)
760     .lower();
761 
762   getActionDefinitionsBuilder(G_FPEXT)
763     .legalFor({{S64, S32}, {S32, S16}})
764     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765     .scalarize(0);
766 
767   getActionDefinitionsBuilder(G_FSUB)
768       // Use actual fsub instruction
769       .legalFor({S32})
770       // Must use fadd + fneg
771       .lowerFor({S64, S16, V2S16})
772       .scalarize(0)
773       .clampScalar(0, S32, S64);
774 
775   // Whether this is legal depends on the floating point mode for the function.
776   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778     FMad.customFor({S32, S16});
779   else if (ST.hasMadMacF32Insts())
780     FMad.customFor({S32});
781   else if (ST.hasMadF16())
782     FMad.customFor({S16});
783   FMad.scalarize(0)
784       .lower();
785 
786   auto &FRem = getActionDefinitionsBuilder(G_FREM);
787   if (ST.has16BitInsts()) {
788     FRem.customFor({S16, S32, S64});
789   } else {
790     FRem.minScalar(0, S32)
791         .customFor({S32, S64});
792   }
793   FRem.scalarize(0);
794 
795   // TODO: Do we need to clamp maximum bitwidth?
796   getActionDefinitionsBuilder(G_TRUNC)
797     .legalIf(isScalar(0))
798     .legalFor({{V2S16, V2S32}})
799     .clampMaxNumElements(0, S16, 2)
800     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801     // situations (like an invalid implicit use), we don't want to infinite loop
802     // in the legalizer.
803     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804     .alwaysLegal();
805 
806   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808                {S32, S1}, {S64, S1}, {S16, S1}})
809     .scalarize(0)
810     .clampScalar(0, S32, S64)
811     .widenScalarToNextPow2(1, 32);
812 
813   // TODO: Split s1->s64 during regbankselect for VALU.
814   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816     .lowerFor({{S32, S64}})
817     .lowerIf(typeIs(1, S1))
818     .customFor({{S64, S64}});
819   if (ST.has16BitInsts())
820     IToFP.legalFor({{S16, S16}});
821   IToFP.clampScalar(1, S32, S64)
822        .minScalar(0, S32)
823        .scalarize(0)
824        .widenScalarToNextPow2(1);
825 
826   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
827     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
828     .customFor({{S64, S32}, {S64, S64}})
829     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
830   if (ST.has16BitInsts())
831     FPToI.legalFor({{S16, S16}});
832   else
833     FPToI.minScalar(1, S32);
834 
835   FPToI.minScalar(0, S32)
836        .widenScalarToNextPow2(0, 32)
837        .scalarize(0)
838        .lower();
839 
840   // Lower roundeven into G_FRINT
841   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
842     .scalarize(0)
843     .lower();
844 
845   if (ST.has16BitInsts()) {
846     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847       .legalFor({S16, S32, S64})
848       .clampScalar(0, S16, S64)
849       .scalarize(0);
850   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
851     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852       .legalFor({S32, S64})
853       .clampScalar(0, S32, S64)
854       .scalarize(0);
855   } else {
856     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
857       .legalFor({S32})
858       .customFor({S64})
859       .clampScalar(0, S32, S64)
860       .scalarize(0);
861   }
862 
863   getActionDefinitionsBuilder(G_PTR_ADD)
864     .legalIf(all(isPointer(0), sameSize(0, 1)))
865     .scalarize(0)
866     .scalarSameSizeAs(1, 0);
867 
868   getActionDefinitionsBuilder(G_PTRMASK)
869     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
870     .scalarSameSizeAs(1, 0)
871     .scalarize(0);
872 
873   auto &CmpBuilder =
874     getActionDefinitionsBuilder(G_ICMP)
875     // The compare output type differs based on the register bank of the output,
876     // so make both s1 and s32 legal.
877     //
878     // Scalar compares producing output in scc will be promoted to s32, as that
879     // is the allocatable register type that will be needed for the copy from
880     // scc. This will be promoted during RegBankSelect, and we assume something
881     // before that won't try to use s32 result types.
882     //
883     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
884     // bank.
885     .legalForCartesianProduct(
886       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
887     .legalForCartesianProduct(
888       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
889   if (ST.has16BitInsts()) {
890     CmpBuilder.legalFor({{S1, S16}});
891   }
892 
893   CmpBuilder
894     .widenScalarToNextPow2(1)
895     .clampScalar(1, S32, S64)
896     .scalarize(0)
897     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
898 
899   getActionDefinitionsBuilder(G_FCMP)
900     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
901     .widenScalarToNextPow2(1)
902     .clampScalar(1, S32, S64)
903     .scalarize(0);
904 
905   // FIXME: fpow has a selection pattern that should move to custom lowering.
906   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
907   if (ST.has16BitInsts())
908     Exp2Ops.legalFor({S32, S16});
909   else
910     Exp2Ops.legalFor({S32});
911   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
912   Exp2Ops.scalarize(0);
913 
914   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
915   if (ST.has16BitInsts())
916     ExpOps.customFor({{S32}, {S16}});
917   else
918     ExpOps.customFor({S32});
919   ExpOps.clampScalar(0, MinScalarFPTy, S32)
920         .scalarize(0);
921 
922   getActionDefinitionsBuilder(G_FPOWI)
923     .clampScalar(0, MinScalarFPTy, S32)
924     .lower();
925 
926   // The 64-bit versions produce 32-bit results, but only on the SALU.
927   getActionDefinitionsBuilder(G_CTPOP)
928     .legalFor({{S32, S32}, {S32, S64}})
929     .clampScalar(0, S32, S32)
930     .clampScalar(1, S32, S64)
931     .scalarize(0)
932     .widenScalarToNextPow2(0, 32)
933     .widenScalarToNextPow2(1, 32);
934 
935   // The hardware instructions return a different result on 0 than the generic
936   // instructions expect. The hardware produces -1, but these produce the
937   // bitwidth.
938   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
939     .scalarize(0)
940     .clampScalar(0, S32, S32)
941     .clampScalar(1, S32, S64)
942     .widenScalarToNextPow2(0, 32)
943     .widenScalarToNextPow2(1, 32)
944     .lower();
945 
946   // The 64-bit versions produce 32-bit results, but only on the SALU.
947   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
948     .legalFor({{S32, S32}, {S32, S64}})
949     .clampScalar(0, S32, S32)
950     .clampScalar(1, S32, S64)
951     .scalarize(0)
952     .widenScalarToNextPow2(0, 32)
953     .widenScalarToNextPow2(1, 32);
954 
955   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
956   // RegBankSelect.
957   getActionDefinitionsBuilder(G_BITREVERSE)
958     .legalFor({S32, S64})
959     .clampScalar(0, S32, S64)
960     .scalarize(0)
961     .widenScalarToNextPow2(0);
962 
963   if (ST.has16BitInsts()) {
964     getActionDefinitionsBuilder(G_BSWAP)
965       .legalFor({S16, S32, V2S16})
966       .clampMaxNumElements(0, S16, 2)
967       // FIXME: Fixing non-power-of-2 before clamp is workaround for
968       // narrowScalar limitation.
969       .widenScalarToNextPow2(0)
970       .clampScalar(0, S16, S32)
971       .scalarize(0);
972 
973     if (ST.hasVOP3PInsts()) {
974       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
975         .legalFor({S32, S16, V2S16})
976         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
977         .clampMaxNumElements(0, S16, 2)
978         .minScalar(0, S16)
979         .widenScalarToNextPow2(0)
980         .scalarize(0)
981         .lower();
982     } else {
983       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
984         .legalFor({S32, S16})
985         .widenScalarToNextPow2(0)
986         .minScalar(0, S16)
987         .scalarize(0)
988         .lower();
989     }
990   } else {
991     // TODO: Should have same legality without v_perm_b32
992     getActionDefinitionsBuilder(G_BSWAP)
993       .legalFor({S32})
994       .lowerIf(scalarNarrowerThan(0, 32))
995       // FIXME: Fixing non-power-of-2 before clamp is workaround for
996       // narrowScalar limitation.
997       .widenScalarToNextPow2(0)
998       .maxScalar(0, S32)
999       .scalarize(0)
1000       .lower();
1001 
1002     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1003       .legalFor({S32})
1004       .minScalar(0, S32)
1005       .widenScalarToNextPow2(0)
1006       .scalarize(0)
1007       .lower();
1008   }
1009 
1010   getActionDefinitionsBuilder(G_INTTOPTR)
1011     // List the common cases
1012     .legalForCartesianProduct(AddrSpaces64, {S64})
1013     .legalForCartesianProduct(AddrSpaces32, {S32})
1014     .scalarize(0)
1015     // Accept any address space as long as the size matches
1016     .legalIf(sameSize(0, 1))
1017     .widenScalarIf(smallerThan(1, 0),
1018       [](const LegalityQuery &Query) {
1019         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1020       })
1021     .narrowScalarIf(largerThan(1, 0),
1022       [](const LegalityQuery &Query) {
1023         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1024       });
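  // For illustration: G_INTTOPTR from s32 to a 64-bit flat pointer first widens
  // the source to s64 (smallerThan(1, 0) fires), an s128 source is narrowed to
  // s64, and a source already matching the pointer size is legal as-is.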
1025 
1026   getActionDefinitionsBuilder(G_PTRTOINT)
1027     // List the common cases
1028     .legalForCartesianProduct(AddrSpaces64, {S64})
1029     .legalForCartesianProduct(AddrSpaces32, {S32})
1030     .scalarize(0)
1031     // Accept any address space as long as the size matches
1032     .legalIf(sameSize(0, 1))
1033     .widenScalarIf(smallerThan(0, 1),
1034       [](const LegalityQuery &Query) {
1035         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036       })
1037     .narrowScalarIf(
1038       largerThan(0, 1),
1039       [](const LegalityQuery &Query) {
1040         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1041       });
1042 
1043   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1044     .scalarize(0)
1045     .custom();
1046 
1047   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1048                                     bool IsLoad) -> bool {
1049     const LLT DstTy = Query.Types[0];
1050 
1051     // Split vector extloads.
1052     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1053     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1054 
1055     if (MemSize < DstTy.getSizeInBits())
1056       MemSize = std::max(MemSize, AlignBits);
1057 
1058     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1059       return true;
1060 
1061     const LLT PtrTy = Query.Types[1];
1062     unsigned AS = PtrTy.getAddressSpace();
1063     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1064       return true;
1065 
1066     // Catch weird sized loads that don't evenly divide into the access sizes
1067     // TODO: May be able to widen depending on alignment etc.
1068     unsigned NumRegs = (MemSize + 31) / 32;
1069     if (NumRegs == 3) {
1070       if (!ST.hasDwordx3LoadStores())
1071         return true;
1072     } else {
1073       // If the alignment allows, these should have been widened.
1074       if (!isPowerOf2_32(NumRegs))
1075         return true;
1076     }
1077 
1078     if (AlignBits < MemSize) {
1079       const SITargetLowering *TLI = ST.getTargetLowering();
1080       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1081                                                       Align(AlignBits / 8));
1082     }
1083 
1084     return false;
1085   };
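  // For illustration: a 96-bit access must be split when the subtarget lacks
  // dwordx3 load/stores (NumRegs == 3), a 256-bit constant-address load fits
  // the 512-bit limit and stays whole if sufficiently aligned, and an
  // under-aligned access defers to allowsMisalignedMemoryAccessesImpl.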
1086 
1087   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1088   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1089   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1090 
1091   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1092   // LDS
1093   // TODO: Unsupported flat for SI.
1094 
1095   for (unsigned Op : {G_LOAD, G_STORE}) {
1096     const bool IsStore = Op == G_STORE;
1097 
1098     auto &Actions = getActionDefinitionsBuilder(Op);
1099     // Explicitly list some common cases.
1100     // TODO: Does this help compile time at all?
1101     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1102                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1103                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1104                                       {S64, GlobalPtr, S64, GlobalAlign32},
1105                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1106                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1107                                       {S32, GlobalPtr, S8, GlobalAlign8},
1108                                       {S32, GlobalPtr, S16, GlobalAlign16},
1109 
1110                                       {S32, LocalPtr, S32, 32},
1111                                       {S64, LocalPtr, S64, 32},
1112                                       {V2S32, LocalPtr, V2S32, 32},
1113                                       {S32, LocalPtr, S8, 8},
1114                                       {S32, LocalPtr, S16, 16},
1115                                       {V2S16, LocalPtr, S32, 32},
1116 
1117                                       {S32, PrivatePtr, S32, 32},
1118                                       {S32, PrivatePtr, S8, 8},
1119                                       {S32, PrivatePtr, S16, 16},
1120                                       {V2S16, PrivatePtr, S32, 32},
1121 
1122                                       {S32, ConstantPtr, S32, GlobalAlign32},
1123                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1124                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1125                                       {S64, ConstantPtr, S64, GlobalAlign32},
1126                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1127     Actions.legalIf(
1128       [=](const LegalityQuery &Query) -> bool {
1129         return isLoadStoreLegal(ST, Query);
1130       });
1131 
1132     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1133     // 64-bits.
1134     //
1135     // TODO: Should generalize bitcast action into coerce, which will also cover
1136     // inserting addrspacecasts.
1137     Actions.customIf(typeIs(1, Constant32Ptr));
1138 
1139     // Turn any illegal element vectors into something easier to deal
1140     // with. These will ultimately produce 32-bit scalar shifts to extract the
1141     // parts anyway.
1142     //
1143     // For odd 16-bit element vectors, prefer to split those into pieces with
1144     // 16-bit vector parts.
1145     Actions.bitcastIf(
1146       [=](const LegalityQuery &Query) -> bool {
1147         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1148                                           Query.MMODescrs[0].MemoryTy);
1149       }, bitcastToRegisterType(0));
1150 
1151     if (!IsStore) {
1152       // Widen suitably aligned loads by loading extra bytes. The standard
1153       // legalization actions can't properly express widening memory operands.
1154       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1155         return shouldWidenLoad(ST, Query, G_LOAD);
1156       });
1157     }
1158 
1159     // FIXME: load/store narrowing should be moved to lower action
1160     Actions
1161         .narrowScalarIf(
1162             [=](const LegalityQuery &Query) -> bool {
1163               return !Query.Types[0].isVector() &&
1164                      needToSplitMemOp(Query, Op == G_LOAD);
1165             },
1166             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1167               const LLT DstTy = Query.Types[0];
1168               const LLT PtrTy = Query.Types[1];
1169 
1170               const unsigned DstSize = DstTy.getSizeInBits();
1171               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1172 
1173               // Split extloads.
1174               if (DstSize > MemSize)
1175                 return std::make_pair(0, LLT::scalar(MemSize));
1176 
1177               if (!isPowerOf2_32(DstSize)) {
1178                 // We're probably decomposing an odd sized store. Try to split
1179                 // to the widest type. TODO: Account for alignment. As-is it
1180                 // should be OK, since the new parts will be further legalized.
1181                 unsigned FloorSize = PowerOf2Floor(DstSize);
1182                 return std::make_pair(0, LLT::scalar(FloorSize));
1183               }
1184 
1185               if (DstSize > 32 && (DstSize % 32 != 0)) {
1186                 // FIXME: Need a way to specify non-extload of larger size if
1187                 // suitably aligned.
1188                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1189               }
1190 
1191               unsigned MaxSize = maxSizeForAddrSpace(ST,
1192                                                      PtrTy.getAddressSpace(),
1193                                                      Op == G_LOAD);
1194               if (MemSize > MaxSize)
1195                 return std::make_pair(0, LLT::scalar(MaxSize));
1196 
1197               unsigned Align = Query.MMODescrs[0].AlignInBits;
1198               return std::make_pair(0, LLT::scalar(Align));
1199             })
1200         .fewerElementsIf(
1201             [=](const LegalityQuery &Query) -> bool {
1202               return Query.Types[0].isVector() &&
1203                      needToSplitMemOp(Query, Op == G_LOAD);
1204             },
1205             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1206               const LLT DstTy = Query.Types[0];
1207               const LLT PtrTy = Query.Types[1];
1208 
1209               LLT EltTy = DstTy.getElementType();
1210               unsigned MaxSize = maxSizeForAddrSpace(ST,
1211                                                      PtrTy.getAddressSpace(),
1212                                                      Op == G_LOAD);
1213 
1214               // FIXME: Handle widened to power of 2 results better. This ends
1215               // up scalarizing.
1216               // FIXME: 3 element stores scalarized on SI
1217 
1218               // Split if it's too large for the address space.
1219               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1220               if (MemSize > MaxSize) {
1221                 unsigned NumElts = DstTy.getNumElements();
1222                 unsigned EltSize = EltTy.getSizeInBits();
1223 
1224                 if (MaxSize % EltSize == 0) {
1225                   return std::make_pair(
1226                       0, LLT::scalarOrVector(
1227                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1228                 }
1229 
1230                 unsigned NumPieces = MemSize / MaxSize;
1231 
1232                 // FIXME: Refine when odd breakdowns handled
1233                 // The scalars will need to be re-legalized.
1234                 if (NumPieces == 1 || NumPieces >= NumElts ||
1235                     NumElts % NumPieces != 0)
1236                   return std::make_pair(0, EltTy);
1237 
1238                 return std::make_pair(
1239                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1240               }
1241 
1242               // FIXME: We could probably handle weird extending loads better.
1243               if (DstTy.getSizeInBits() > MemSize)
1244                 return std::make_pair(0, EltTy);
1245 
1246               unsigned EltSize = EltTy.getSizeInBits();
1247               unsigned DstSize = DstTy.getSizeInBits();
1248               if (!isPowerOf2_32(DstSize)) {
1249                 // We're probably decomposing an odd sized store. Try to split
1250                 // to the widest type. TODO: Account for alignment. As-is it
1251                 // should be OK, since the new parts will be further legalized.
1252                 unsigned FloorSize = PowerOf2Floor(DstSize);
1253                 return std::make_pair(
1254                     0, LLT::scalarOrVector(
1255                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1256               }
1257 
1258               // Need to split because of alignment.
1259               unsigned Align = Query.MMODescrs[0].AlignInBits;
1260               if (EltSize > Align &&
1261                   (EltSize / Align < DstTy.getNumElements())) {
1262                 return std::make_pair(
1263                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1264               }
1265 
1266               // May need relegalization for the scalars.
1267               return std::make_pair(0, EltTy);
1268             })
1269     .lowerIfMemSizeNotPow2()
1270     .minScalar(0, S32)
1271     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1272     .widenScalarToNextPow2(0)
1273     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1274     .lower();
1275   }
1276 
1277   // FIXME: Unaligned accesses not lowered.
1278   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1279                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1280                                                   {S32, GlobalPtr, S16, 2 * 8},
1281                                                   {S32, LocalPtr, S8, 8},
1282                                                   {S32, LocalPtr, S16, 16},
1283                                                   {S32, PrivatePtr, S8, 8},
1284                                                   {S32, PrivatePtr, S16, 16},
1285                                                   {S32, ConstantPtr, S8, 8},
1286                                                   {S32, ConstantPtr, S16, 2 * 8}})
1287                        .legalIf(
1288                          [=](const LegalityQuery &Query) -> bool {
1289                            return isLoadStoreLegal(ST, Query);
1290                          });
1291 
1292   if (ST.hasFlatAddressSpace()) {
1293     ExtLoads.legalForTypesWithMemDesc(
1294         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1295   }
1296 
1297   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1298   // 64-bits.
1299   //
1300   // TODO: Should generalize bitcast action into coerce, which will also cover
1301   // inserting addrspacecasts.
1302   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1303 
1304   ExtLoads.clampScalar(0, S32, S32)
1305           .widenScalarToNextPow2(0)
1306           .lower();
1307 
1308   auto &Atomics = getActionDefinitionsBuilder(
1309     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1310      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1311      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1312      G_ATOMICRMW_UMIN})
1313     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1314                {S64, GlobalPtr}, {S64, LocalPtr},
1315                {S32, RegionPtr}, {S64, RegionPtr}});
1316   if (ST.hasFlatAddressSpace()) {
1317     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1318   }
1319 
1320   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1321   if (ST.hasLDSFPAtomics()) {
1322     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1323     if (ST.hasGFX90AInsts())
1324       Atomic.legalFor({{S64, LocalPtr}});
1325   }
1326   if (ST.hasAtomicFaddInsts())
1327     Atomic.legalFor({{S32, GlobalPtr}});
1328 
1329   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1330   // demarshalling
1331   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1332     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1333                 {S32, FlatPtr}, {S64, FlatPtr}})
1334     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1335                {S32, RegionPtr}, {S64, RegionPtr}});
1336   // TODO: Pointer types, any 32-bit or 64-bit vector
1337 
1338   // Condition should be s32 for scalar, s1 for vector.
1339   getActionDefinitionsBuilder(G_SELECT)
1340       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1341                                  LocalPtr, FlatPtr, PrivatePtr,
1342                                  LLT::fixed_vector(2, LocalPtr),
1343                                  LLT::fixed_vector(2, PrivatePtr)},
1344                                 {S1, S32})
1345       .clampScalar(0, S16, S64)
1346       .scalarize(1)
1347       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1348       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1349       .clampMaxNumElements(0, S32, 2)
1350       .clampMaxNumElements(0, LocalPtr, 2)
1351       .clampMaxNumElements(0, PrivatePtr, 2)
1352       .scalarize(0)
1353       .widenScalarToNextPow2(0)
1354       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1355 
1356   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1357   // be more flexible with the shift amount type.
1358   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1359     .legalFor({{S32, S32}, {S64, S32}});
1360   if (ST.has16BitInsts()) {
1361     if (ST.hasVOP3PInsts()) {
1362       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1363             .clampMaxNumElements(0, S16, 2);
1364     } else
1365       Shifts.legalFor({{S16, S16}});
1366 
1367     // TODO: Support 16-bit shift amounts for all types
1368     Shifts.widenScalarIf(
1369       [=](const LegalityQuery &Query) {
1370         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1371         // 32-bit amount.
1372         const LLT ValTy = Query.Types[0];
1373         const LLT AmountTy = Query.Types[1];
1374         return ValTy.getSizeInBits() <= 16 &&
1375                AmountTy.getSizeInBits() < 16;
1376       }, changeTo(1, S16));
1377     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1378     Shifts.clampScalar(1, S32, S32);
1379     Shifts.clampScalar(0, S16, S64);
1380     Shifts.widenScalarToNextPow2(0, 16);
1381 
1382     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1383       .minScalar(0, S16)
1384       .scalarize(0)
1385       .lower();
1386   } else {
1387     // Make sure we legalize the shift amount type first, as the general
1388     // expansion for the shifted type will produce much worse code if it hasn't
1389     // been truncated already.
1390     Shifts.clampScalar(1, S32, S32);
1391     Shifts.clampScalar(0, S32, S64);
1392     Shifts.widenScalarToNextPow2(0, 32);
1393 
1394     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1395       .minScalar(0, S32)
1396       .scalarize(0)
1397       .lower();
1398   }
1399   Shifts.scalarize(0);
1400 
1401   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1402     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1403     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1404     unsigned IdxTypeIdx = 2;
1405 
1406     getActionDefinitionsBuilder(Op)
1407       .customIf([=](const LegalityQuery &Query) {
1408           const LLT EltTy = Query.Types[EltTypeIdx];
1409           const LLT VecTy = Query.Types[VecTypeIdx];
1410           const LLT IdxTy = Query.Types[IdxTypeIdx];
1411           const unsigned EltSize = EltTy.getSizeInBits();
1412           return (EltSize == 32 || EltSize == 64) &&
1413                   VecTy.getSizeInBits() % 32 == 0 &&
1414                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1415                   IdxTy.getSizeInBits() == 32;
1416         })
1417       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1418                  bitcastToVectorElement32(VecTypeIdx))
1419       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1420       .bitcastIf(
1421         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1422         [=](const LegalityQuery &Query) {
1423           // For > 64-bit element types, try to turn this into a 64-bit
1424           // element vector since we may be able to do better indexing
1425           // if this is scalar. If not, fall back to 32.
1426           const LLT EltTy = Query.Types[EltTypeIdx];
1427           const LLT VecTy = Query.Types[VecTypeIdx];
1428           const unsigned DstEltSize = EltTy.getSizeInBits();
1429           const unsigned VecSize = VecTy.getSizeInBits();
1430 
1431           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1432           return std::make_pair(
1433               VecTypeIdx,
1434               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1435         })
1436       .clampScalar(EltTypeIdx, S32, S64)
1437       .clampScalar(VecTypeIdx, S32, S64)
1438       .clampScalar(IdxTypeIdx, S32, S32)
1439       .clampMaxNumElements(VecTypeIdx, S32, 32)
1440       // TODO: Clamp elements for 64-bit vectors?
1441       // It should only be necessary with variable indexes.
1442       // As a last resort, lower to the stack
1443       .lower();
1444   }
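  // For illustration, the first bitcastIf above rewrites sub-32-bit element
  // vectors in terms of 32-bit elements, roughly:
  //   %e:_(s16) = G_EXTRACT_VECTOR_ELT %v:_(<8 x s16>), %idx:_(s32)
  // is handled by bitcasting %v to <4 x s32>, extracting the containing s32
  // element, and shifting out the desired 16-bit half.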
1445 
1446   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1447     .unsupportedIf([=](const LegalityQuery &Query) {
1448         const LLT &EltTy = Query.Types[1].getElementType();
1449         return Query.Types[0] != EltTy;
1450       });
1451 
1452   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1453     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1454     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1455 
1456     // FIXME: Doesn't handle extract of illegal sizes.
1457     getActionDefinitionsBuilder(Op)
1458       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1459       // FIXME: Multiples of 16 should not be legal.
1460       .legalIf([=](const LegalityQuery &Query) {
1461           const LLT BigTy = Query.Types[BigTyIdx];
1462           const LLT LitTy = Query.Types[LitTyIdx];
1463           return (BigTy.getSizeInBits() % 32 == 0) &&
1464                  (LitTy.getSizeInBits() % 16 == 0);
1465         })
1466       .widenScalarIf(
1467         [=](const LegalityQuery &Query) {
1468           const LLT BigTy = Query.Types[BigTyIdx];
1469           return (BigTy.getScalarSizeInBits() < 16);
1470         },
1471         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1472       .widenScalarIf(
1473         [=](const LegalityQuery &Query) {
1474           const LLT LitTy = Query.Types[LitTyIdx];
1475           return (LitTy.getScalarSizeInBits() < 16);
1476         },
1477         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1478       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1479       .widenScalarToNextPow2(BigTyIdx, 32);
1480 
1481   }
1482 
1483   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1484     .legalForCartesianProduct(AllS32Vectors, {S32})
1485     .legalForCartesianProduct(AllS64Vectors, {S64})
1486     .clampNumElements(0, V16S32, V32S32)
1487     .clampNumElements(0, V2S64, V16S64)
1488     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1489 
1490   if (ST.hasScalarPackInsts()) {
1491     BuildVector
1492       // FIXME: Should probably widen s1 vectors straight to s32
1493       .minScalarOrElt(0, S16)
1494       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1495       .minScalar(1, S32);
1496 
1497     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1498       .legalFor({V2S16, S32})
1499       .lower();
1500     BuildVector.minScalarOrElt(0, S32);
1501   } else {
1502     BuildVector.customFor({V2S16, S16});
1503     BuildVector.minScalarOrElt(0, S32);
1504 
1505     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1506       .customFor({V2S16, S32})
1507       .lower();
1508   }
1509 
1510   BuildVector.legalIf(isRegisterType(0));
1511 
1512   // FIXME: Clamp maximum size
1513   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1514     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1515     .clampMaxNumElements(0, S32, 32)
1516     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1517     .clampMaxNumElements(0, S16, 64);
1518 
1519   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1520   // pre-legalize.
1521   if (ST.hasVOP3PInsts()) {
1522     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1523       .customFor({V2S16, V2S16})
1524       .lower();
1525   } else
1526     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1527 
1528   // Merge/Unmerge
1529   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1530     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1531     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1532 
1533     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1534       const LLT Ty = Query.Types[TypeIdx];
1535       if (Ty.isVector()) {
1536         const LLT &EltTy = Ty.getElementType();
1537         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1538           return true;
1539         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1540           return true;
1541       }
1542       return false;
1543     };
1544 
1545     auto &Builder = getActionDefinitionsBuilder(Op)
1546       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1547       .lowerFor({{S16, V2S16}})
1548       .lowerIf([=](const LegalityQuery &Query) {
1549           const LLT BigTy = Query.Types[BigTyIdx];
1550           return BigTy.getSizeInBits() == 32;
1551         })
1552       // Try to widen to s16 first for small types.
1553       // TODO: Only do this on targets with legal s16 shifts
1554       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1555       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1556       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1557       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1558                            elementTypeIs(1, S16)),
1559                        changeTo(1, V2S16))
1560       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1561       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1562       // valid.
1563       .clampScalar(LitTyIdx, S32, S512)
1564       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1565       // Break up vectors with weird elements into scalars
1566       .fewerElementsIf(
1567         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1568         scalarize(0))
1569       .fewerElementsIf(
1570         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1571         scalarize(1))
1572       .clampScalar(BigTyIdx, S32, MaxScalar);
1573 
1574     if (Op == G_MERGE_VALUES) {
1575       Builder.widenScalarIf(
1576         // TODO: Use 16-bit shifts if legal for 8-bit values?
1577         [=](const LegalityQuery &Query) {
1578           const LLT Ty = Query.Types[LitTyIdx];
1579           return Ty.getSizeInBits() < 32;
1580         },
1581         changeTo(LitTyIdx, S32));
1582     }
1583 
1584     Builder.widenScalarIf(
1585       [=](const LegalityQuery &Query) {
1586         const LLT Ty = Query.Types[BigTyIdx];
1587         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1588           Ty.getSizeInBits() % 16 != 0;
1589       },
1590       [=](const LegalityQuery &Query) {
1591         // Pick the next power of 2, or a multiple of 64 over 128.
1592         // Whichever is smaller.
1593         const LLT &Ty = Query.Types[BigTyIdx];
1594         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1595         if (NewSizeInBits >= 256) {
1596           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1597           if (RoundedTo < NewSizeInBits)
1598             NewSizeInBits = RoundedTo;
1599         }
1600         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1601       })
1602       // Any vectors left are the wrong size. Scalarize them.
1603       .scalarize(0)
1604       .scalarize(1);
1605   }
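  // For illustration, the final widenScalarIf above (on the big type) widens
  // e.g. s88 to s128 (next power of 2), while s300 widens to s320 (next
  // multiple of 64, which is smaller than the next power of 2, 512).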
1606 
1607   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1608   // RegBankSelect.
1609   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1610     .legalFor({{S32}, {S64}});
1611 
1612   if (ST.hasVOP3PInsts()) {
1613     SextInReg.lowerFor({{V2S16}})
1614       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1615       // get more vector shift opportunities, since we'll get those when
1616       // expanded.
1617       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1618   } else if (ST.has16BitInsts()) {
1619     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1620   } else {
1621     // Prefer to promote to s32 before lowering if we don't have 16-bit
1622     // shifts. This avoids a lot of intermediate truncate and extend operations.
1623     SextInReg.lowerFor({{S32}, {S64}});
1624   }
1625 
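  // Anything not handled above falls back to the generic lowering, which
  // expands G_SEXT_INREG as a shift pair, e.g. sext_inreg %x(s32), 8 becomes
  // roughly (ashr (shl %x, 24), 24).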
1626   SextInReg
1627     .scalarize(0)
1628     .clampScalar(0, S32, S64)
1629     .lower();
1630 
1631   // TODO: Only try to form v2s16 with legal packed instructions.
1632   getActionDefinitionsBuilder(G_FSHR)
1633     .legalFor({{S32, S32}})
1634     .lowerFor({{V2S16, V2S16}})
1635     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1636     .scalarize(0)
1637     .lower();
1638 
1639   if (ST.hasVOP3PInsts()) {
1640     getActionDefinitionsBuilder(G_FSHL)
1641       .lowerFor({{V2S16, V2S16}})
1642       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1643       .scalarize(0)
1644       .lower();
1645   } else {
1646     getActionDefinitionsBuilder(G_FSHL)
1647       .scalarize(0)
1648       .lower();
1649   }
1650 
1651   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1652     .legalFor({S64});
1653 
1654   getActionDefinitionsBuilder(G_FENCE)
1655     .alwaysLegal();
1656 
1657   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1658       .scalarize(0)
1659       .minScalar(0, S32)
1660       .lower();
1661 
1662   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1663       .legalFor({{S32, S32}, {S64, S32}})
1664       .clampScalar(1, S32, S32)
1665       .clampScalar(0, S32, S64)
1666       .widenScalarToNextPow2(0)
1667       .scalarize(0);
1668 
1669   getActionDefinitionsBuilder({
1670       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1671       G_FCOPYSIGN,
1672 
1673       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1674       G_ATOMICRMW_NAND,
1675       G_ATOMICRMW_FSUB,
1676       G_READ_REGISTER,
1677       G_WRITE_REGISTER,
1678 
1679       G_SADDO, G_SSUBO,
1680 
1681        // TODO: Implement
1682       G_FMINIMUM, G_FMAXIMUM}).lower();
1683 
1684   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1685         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1686         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1687     .unsupported();
1688 
1689   getLegacyLegalizerInfo().computeTables();
1690   verify(*ST.getInstrInfo());
1691 }
1692 
1693 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1694                                          MachineInstr &MI) const {
1695   MachineIRBuilder &B = Helper.MIRBuilder;
1696   MachineRegisterInfo &MRI = *B.getMRI();
1697 
1698   switch (MI.getOpcode()) {
1699   case TargetOpcode::G_ADDRSPACE_CAST:
1700     return legalizeAddrSpaceCast(MI, MRI, B);
1701   case TargetOpcode::G_FRINT:
1702     return legalizeFrint(MI, MRI, B);
1703   case TargetOpcode::G_FCEIL:
1704     return legalizeFceil(MI, MRI, B);
1705   case TargetOpcode::G_FREM:
1706     return legalizeFrem(MI, MRI, B);
1707   case TargetOpcode::G_INTRINSIC_TRUNC:
1708     return legalizeIntrinsicTrunc(MI, MRI, B);
1709   case TargetOpcode::G_SITOFP:
1710     return legalizeITOFP(MI, MRI, B, true);
1711   case TargetOpcode::G_UITOFP:
1712     return legalizeITOFP(MI, MRI, B, false);
1713   case TargetOpcode::G_FPTOSI:
1714     return legalizeFPTOI(MI, MRI, B, true);
1715   case TargetOpcode::G_FPTOUI:
1716     return legalizeFPTOI(MI, MRI, B, false);
1717   case TargetOpcode::G_FMINNUM:
1718   case TargetOpcode::G_FMAXNUM:
1719   case TargetOpcode::G_FMINNUM_IEEE:
1720   case TargetOpcode::G_FMAXNUM_IEEE:
1721     return legalizeMinNumMaxNum(Helper, MI);
1722   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1723     return legalizeExtractVectorElt(MI, MRI, B);
1724   case TargetOpcode::G_INSERT_VECTOR_ELT:
1725     return legalizeInsertVectorElt(MI, MRI, B);
1726   case TargetOpcode::G_SHUFFLE_VECTOR:
1727     return legalizeShuffleVector(MI, MRI, B);
1728   case TargetOpcode::G_FSIN:
1729   case TargetOpcode::G_FCOS:
1730     return legalizeSinCos(MI, MRI, B);
1731   case TargetOpcode::G_GLOBAL_VALUE:
1732     return legalizeGlobalValue(MI, MRI, B);
1733   case TargetOpcode::G_LOAD:
1734   case TargetOpcode::G_SEXTLOAD:
1735   case TargetOpcode::G_ZEXTLOAD:
1736     return legalizeLoad(Helper, MI);
1737   case TargetOpcode::G_FMAD:
1738     return legalizeFMad(MI, MRI, B);
1739   case TargetOpcode::G_FDIV:
1740     return legalizeFDIV(MI, MRI, B);
1741   case TargetOpcode::G_UDIV:
1742   case TargetOpcode::G_UREM:
1743   case TargetOpcode::G_UDIVREM:
1744     return legalizeUnsignedDIV_REM(MI, MRI, B);
1745   case TargetOpcode::G_SDIV:
1746   case TargetOpcode::G_SREM:
1747   case TargetOpcode::G_SDIVREM:
1748     return legalizeSignedDIV_REM(MI, MRI, B);
1749   case TargetOpcode::G_ATOMIC_CMPXCHG:
1750     return legalizeAtomicCmpXChg(MI, MRI, B);
1751   case TargetOpcode::G_FLOG:
1752     return legalizeFlog(MI, B, numbers::ln2f);
1753   case TargetOpcode::G_FLOG10:
1754     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1755   case TargetOpcode::G_FEXP:
1756     return legalizeFExp(MI, B);
1757   case TargetOpcode::G_FPOW:
1758     return legalizeFPow(MI, B);
1759   case TargetOpcode::G_FFLOOR:
1760     return legalizeFFloor(MI, MRI, B);
1761   case TargetOpcode::G_BUILD_VECTOR:
1762     return legalizeBuildVector(MI, MRI, B);
1763   default:
1764     return false;
1765   }
1766 
1767   llvm_unreachable("expected switch to return");
1768 }
1769 
1770 Register AMDGPULegalizerInfo::getSegmentAperture(
1771   unsigned AS,
1772   MachineRegisterInfo &MRI,
1773   MachineIRBuilder &B) const {
1774   MachineFunction &MF = B.getMF();
1775   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1776   const LLT S32 = LLT::scalar(32);
1777 
1778   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1779 
1780   if (ST.hasApertureRegs()) {
1781     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1782     // getreg.
1783     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1784         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1785         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1786     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1787         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1788         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1789     unsigned Encoding =
1790         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1791         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1792         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1793 
1794     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1795 
1796     B.buildInstr(AMDGPU::S_GETREG_B32)
1797       .addDef(GetReg)
1798       .addImm(Encoding);
1799     MRI.setType(GetReg, S32);
1800 
1801     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1802     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1803   }
1804 
1805   Register QueuePtr = MRI.createGenericVirtualRegister(
1806     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1807 
1808   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1809     return Register();
1810 
1811   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1812   // private_segment_aperture_base_hi.
1813   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1814 
1815   // TODO: can we be smarter about machine pointer info?
1816   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1817   MachineMemOperand *MMO = MF.getMachineMemOperand(
1818       PtrInfo,
1819       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1820           MachineMemOperand::MOInvariant,
1821       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1822 
1823   Register LoadAddr;
1824 
1825   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1826   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1827 }
1828 
1829 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1830   MachineInstr &MI, MachineRegisterInfo &MRI,
1831   MachineIRBuilder &B) const {
1832   MachineFunction &MF = B.getMF();
1833 
1834   const LLT S32 = LLT::scalar(32);
1835   Register Dst = MI.getOperand(0).getReg();
1836   Register Src = MI.getOperand(1).getReg();
1837 
1838   LLT DstTy = MRI.getType(Dst);
1839   LLT SrcTy = MRI.getType(Src);
1840   unsigned DestAS = DstTy.getAddressSpace();
1841   unsigned SrcAS = SrcTy.getAddressSpace();
1842 
1843   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1844   // vector element.
1845   assert(!DstTy.isVector());
1846 
1847   const AMDGPUTargetMachine &TM
1848     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1849 
1850   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1851     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1852     return true;
1853   }
1854 
1855   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1856     // Truncate.
1857     B.buildExtract(Dst, Src, 0);
1858     MI.eraseFromParent();
1859     return true;
1860   }
1861 
1862   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1863     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1864     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1865 
1866     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1867     // another. Merge operands are required to be the same type, but creating an
1868     // extra ptrtoint would be kind of pointless.
1869     auto HighAddr = B.buildConstant(
1870       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1871     B.buildMerge(Dst, {Src, HighAddr});
1872     MI.eraseFromParent();
1873     return true;
1874   }
1875 
1876   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1877     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1878            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1879     unsigned NullVal = TM.getNullPointerValue(DestAS);
1880 
1881     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1882     auto FlatNull = B.buildConstant(SrcTy, 0);
1883 
1884     // Extract low 32-bits of the pointer.
1885     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1886 
1887     auto CmpRes =
1888         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1889     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1890 
1891     MI.eraseFromParent();
1892     return true;
1893   }
1894 
1895   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1896     return false;
1897 
1898   if (!ST.hasFlatAddressSpace())
1899     return false;
1900 
1901   auto SegmentNull =
1902       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1903   auto FlatNull =
1904       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1905 
1906   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1907   if (!ApertureReg.isValid())
1908     return false;
1909 
1910   auto CmpRes =
1911       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1912 
1913   // Coerce the type of the low half of the result so we can use merge_values.
1914   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1915 
1916   // TODO: Should we allow mismatched types but matching sizes in merges to
1917   // avoid the ptrtoint?
1918   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1919   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1920 
1921   MI.eraseFromParent();
1922   return true;
1923 }
1924 
1925 bool AMDGPULegalizerInfo::legalizeFrint(
1926   MachineInstr &MI, MachineRegisterInfo &MRI,
1927   MachineIRBuilder &B) const {
1928   Register Src = MI.getOperand(1).getReg();
1929   LLT Ty = MRI.getType(Src);
1930   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1931 
1932   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1933   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1934 
1935   auto C1 = B.buildFConstant(Ty, C1Val);
1936   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1937 
1938   // TODO: Should this propagate fast-math-flags?
1939   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1940   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
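  // A quick sanity check of the add/sub trick (round-to-nearest-even): for
  // src = 2.3, 2.3 + 2^52 rounds to 2^52 + 2, and subtracting 2^52 gives
  // 2.0 = rint(2.3). Magnitudes above C2 (just below 2^52) are already
  // integral, so the select below returns the source unchanged.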
1941 
1942   auto C2 = B.buildFConstant(Ty, C2Val);
1943   auto Fabs = B.buildFAbs(Ty, Src);
1944 
1945   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1946   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1947   MI.eraseFromParent();
1948   return true;
1949 }
1950 
1951 bool AMDGPULegalizerInfo::legalizeFceil(
1952   MachineInstr &MI, MachineRegisterInfo &MRI,
1953   MachineIRBuilder &B) const {
1954 
1955   const LLT S1 = LLT::scalar(1);
1956   const LLT S64 = LLT::scalar(64);
1957 
1958   Register Src = MI.getOperand(1).getReg();
1959   assert(MRI.getType(Src) == S64);
1960 
1961   // result = trunc(src)
1962   // if (src > 0.0 && src != result)
1963   //   result += 1.0
1964 
1965   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1966 
1967   const auto Zero = B.buildFConstant(S64, 0.0);
1968   const auto One = B.buildFConstant(S64, 1.0);
1969   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1970   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1971   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1972   auto Add = B.buildSelect(S64, And, One, Zero);
1973 
1974   // TODO: Should this propagate fast-math-flags?
1975   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
       MI.eraseFromParent();
1976   return true;
1977 }
1978 
1979 bool AMDGPULegalizerInfo::legalizeFrem(
1980   MachineInstr &MI, MachineRegisterInfo &MRI,
1981   MachineIRBuilder &B) const {
1982     Register DstReg = MI.getOperand(0).getReg();
1983     Register Src0Reg = MI.getOperand(1).getReg();
1984     Register Src1Reg = MI.getOperand(2).getReg();
1985     auto Flags = MI.getFlags();
1986     LLT Ty = MRI.getType(DstReg);
1987 
1988     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1989     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1990     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1991     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1992     MI.eraseFromParent();
1993     return true;
1994 }
1995 
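// E.g. for x = 1.0 (bits 0x3FF0000000000000) the high dword is 0x3FF00000;
// ubfe(hi, 20, 11) extracts the biased exponent 0x3FF = 1023, and subtracting
// the bias 1023 yields the unbiased exponent 0.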
1996 static MachineInstrBuilder extractF64Exponent(Register Hi,
1997                                               MachineIRBuilder &B) {
1998   const unsigned FractBits = 52;
1999   const unsigned ExpBits = 11;
2000   LLT S32 = LLT::scalar(32);
2001 
2002   auto Const0 = B.buildConstant(S32, FractBits - 32);
2003   auto Const1 = B.buildConstant(S32, ExpBits);
2004 
2005   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2006     .addUse(Hi)
2007     .addUse(Const0.getReg(0))
2008     .addUse(Const1.getReg(0));
2009 
2010   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2011 }
2012 
2013 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2014   MachineInstr &MI, MachineRegisterInfo &MRI,
2015   MachineIRBuilder &B) const {
2016   const LLT S1 = LLT::scalar(1);
2017   const LLT S32 = LLT::scalar(32);
2018   const LLT S64 = LLT::scalar(64);
2019 
2020   Register Src = MI.getOperand(1).getReg();
2021   assert(MRI.getType(Src) == S64);
2022 
2023   // TODO: Should this use extract since the low half is unused?
2024   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2025   Register Hi = Unmerge.getReg(1);
2026 
2027   // Extract the upper half, since this is where we will find the sign and
2028   // exponent.
2029   auto Exp = extractF64Exponent(Hi, B);
2030 
2031   const unsigned FractBits = 52;
2032 
2033   // Extract the sign bit.
2034   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2035   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2036 
2037   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2038 
2039   const auto Zero32 = B.buildConstant(S32, 0);
2040 
2041   // Extend back to 64-bits.
2042   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2043 
2044   auto Shr = B.buildAShr(S64, FractMask, Exp);
2045   auto Not = B.buildNot(S64, Shr);
2046   auto Tmp0 = B.buildAnd(S64, Src, Not);
2047   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2048 
2049   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2050   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2051 
2052   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2053   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2054   MI.eraseFromParent();
2055   return true;
2056 }
2057 
2058 bool AMDGPULegalizerInfo::legalizeITOFP(
2059   MachineInstr &MI, MachineRegisterInfo &MRI,
2060   MachineIRBuilder &B, bool Signed) const {
2061 
2062   Register Dst = MI.getOperand(0).getReg();
2063   Register Src = MI.getOperand(1).getReg();
2064 
2065   const LLT S64 = LLT::scalar(64);
2066   const LLT S32 = LLT::scalar(32);
2067 
2068   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2069 
2070   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2071 
2072   auto CvtHi = Signed ?
2073     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2074     B.buildUITOFP(S64, Unmerge.getReg(1));
2075 
2076   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2077 
2078   auto ThirtyTwo = B.buildConstant(S32, 32);
2079   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2080     .addUse(CvtHi.getReg(0))
2081     .addUse(ThirtyTwo.getReg(0));
2082 
2083   // TODO: Should this propagate fast-math-flags?
2084   B.buildFAdd(Dst, LdExp, CvtLo);
2085   MI.eraseFromParent();
2086   return true;
2087 }
2088 
2089 // TODO: Copied from DAG implementation. Verify logic and document how this
2090 // actually works.
2091 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2092                                         MachineRegisterInfo &MRI,
2093                                         MachineIRBuilder &B,
2094                                         bool Signed) const {
2095 
2096   Register Dst = MI.getOperand(0).getReg();
2097   Register Src = MI.getOperand(1).getReg();
2098 
2099   const LLT S64 = LLT::scalar(64);
2100   const LLT S32 = LLT::scalar(32);
2101 
2102   const LLT SrcLT = MRI.getType(Src);
2103   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2104 
2105   unsigned Flags = MI.getFlags();
2106 
2107   // The basic idea of converting a floating point number into a pair of 32-bit
2108   // integers is illustrated as follows:
2109   //
2110   //     tf := trunc(val);
2111   //    hif := floor(tf * 2^-32);
2112   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2113   //     hi := fptoi(hif);
2114   //     lo := fptoi(lof);
2115   //
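  // Worked example (f64 source): for val = 2^32 + 5, tf = 4294967301.0,
  // hif = floor(tf * 2^-32) = 1.0, lof = tf - 1.0 * 2^32 = 5.0, so
  // hi = 1 and lo = 5, giving the 64-bit result (1 << 32) + 5.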
2116   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2117   MachineInstrBuilder Sign;
2118   if (Signed && SrcLT == S32) {
2119     // However, a 32-bit floating point number has only 23 bits mantissa and
2120     // it's not enough to hold all the significant bits of `lof` if val is
2121     // negative. To avoid the loss of precision, we need to take the absolute
2122     // value after truncating and flip the result back based on the original
2123     // signedness.
2124     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2125     Trunc = B.buildFAbs(S32, Trunc, Flags);
2126   }
2127   MachineInstrBuilder K0, K1;
2128   if (SrcLT == S64) {
2129     K0 = B.buildFConstant(S64,
2130                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2131     K1 = B.buildFConstant(S64,
2132                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2133   } else {
2134     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2135     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2136   }
2137 
2138   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2139   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2140   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2141 
2142   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2143                                      : B.buildFPTOUI(S32, FloorMul);
2144   auto Lo = B.buildFPTOUI(S32, Fma);
2145 
2146   if (Signed && SrcLT == S32) {
2147     // Flip the result based on the signedness, which is either all 0s or 1s.
2148     Sign = B.buildMerge(S64, {Sign, Sign});
2149     // r := xor({lo, hi}, sign) - sign;
2150     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2151   } else
2152     B.buildMerge(Dst, {Lo, Hi});
2153   MI.eraseFromParent();
2154 
2155   return true;
2156 }
2157 
2158 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2159                                                MachineInstr &MI) const {
2160   MachineFunction &MF = Helper.MIRBuilder.getMF();
2161   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2162 
2163   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2164                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2165 
2166   // With ieee_mode disabled, the instructions have the correct behavior
2167   // already for G_FMINNUM/G_FMAXNUM
2168   if (!MFI->getMode().IEEE)
2169     return !IsIEEEOp;
2170 
2171   if (IsIEEEOp)
2172     return true;
2173 
2174   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2175 }
2176 
2177 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2178   MachineInstr &MI, MachineRegisterInfo &MRI,
2179   MachineIRBuilder &B) const {
2180   // TODO: Should move some of this into LegalizerHelper.
2181 
2182   // TODO: Promote dynamic indexing of s16 to s32
2183 
2184   // FIXME: Artifact combiner probably should have replaced the truncated
2185   // constant before this, so we shouldn't need
2186   // getConstantVRegValWithLookThrough.
2187   Optional<ValueAndVReg> MaybeIdxVal =
2188       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2189   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2190     return true;
2191   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2192 
2193   Register Dst = MI.getOperand(0).getReg();
2194   Register Vec = MI.getOperand(1).getReg();
2195 
2196   LLT VecTy = MRI.getType(Vec);
2197   LLT EltTy = VecTy.getElementType();
2198   assert(EltTy == MRI.getType(Dst));
2199 
2200   if (IdxVal < VecTy.getNumElements())
2201     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2202   else
2203     B.buildUndef(Dst);
2204 
2205   MI.eraseFromParent();
2206   return true;
2207 }
2208 
2209 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2210   MachineInstr &MI, MachineRegisterInfo &MRI,
2211   MachineIRBuilder &B) const {
2212   // TODO: Should move some of this into LegalizerHelper.
2213 
2214   // TODO: Promote dynamic indexing of s16 to s32
2215 
2216   // FIXME: Artifact combiner probably should have replaced the truncated
2217   // constant before this, so we shouldn't need
2218   // getConstantVRegValWithLookThrough.
2219   Optional<ValueAndVReg> MaybeIdxVal =
2220       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2221   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2222     return true;
2223 
2224   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2225   Register Dst = MI.getOperand(0).getReg();
2226   Register Vec = MI.getOperand(1).getReg();
2227   Register Ins = MI.getOperand(2).getReg();
2228 
2229   LLT VecTy = MRI.getType(Vec);
2230   LLT EltTy = VecTy.getElementType();
2231   assert(EltTy == MRI.getType(Ins));
2232 
2233   if (IdxVal < VecTy.getNumElements())
2234     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2235   else
2236     B.buildUndef(Dst);
2237 
2238   MI.eraseFromParent();
2239   return true;
2240 }
2241 
2242 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2243   MachineInstr &MI, MachineRegisterInfo &MRI,
2244   MachineIRBuilder &B) const {
2245   const LLT V2S16 = LLT::fixed_vector(2, 16);
2246 
2247   Register Dst = MI.getOperand(0).getReg();
2248   Register Src0 = MI.getOperand(1).getReg();
2249   LLT DstTy = MRI.getType(Dst);
2250   LLT SrcTy = MRI.getType(Src0);
2251 
2252   if (SrcTy == V2S16 && DstTy == V2S16 &&
2253       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2254     return true;
2255 
2256   MachineIRBuilder HelperBuilder(MI);
2257   GISelObserverWrapper DummyObserver;
2258   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2259   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2260 }
2261 
2262 bool AMDGPULegalizerInfo::legalizeSinCos(
2263   MachineInstr &MI, MachineRegisterInfo &MRI,
2264   MachineIRBuilder &B) const {
2265 
2266   Register DstReg = MI.getOperand(0).getReg();
2267   Register SrcReg = MI.getOperand(1).getReg();
2268   LLT Ty = MRI.getType(DstReg);
2269   unsigned Flags = MI.getFlags();
2270 
2271   Register TrigVal;
2272   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2273   if (ST.hasTrigReducedRange()) {
2274     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2275     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2276       .addUse(MulVal.getReg(0))
2277       .setMIFlags(Flags).getReg(0);
2278   } else
2279     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2280 
2281   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2282     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2283   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2284     .addUse(TrigVal)
2285     .setMIFlags(Flags);
2286   MI.eraseFromParent();
2287   return true;
2288 }
2289 
2290 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2291                                                   MachineIRBuilder &B,
2292                                                   const GlobalValue *GV,
2293                                                   int64_t Offset,
2294                                                   unsigned GAFlags) const {
2295   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2296   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2297   // to the following code sequence:
2298   //
2299   // For constant address space:
2300   //   s_getpc_b64 s[0:1]
2301   //   s_add_u32 s0, s0, $symbol
2302   //   s_addc_u32 s1, s1, 0
2303   //
2304   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2305   //   a fixup or relocation is emitted to replace $symbol with a literal
2306   //   constant, which is a pc-relative offset from the encoding of the $symbol
2307   //   operand to the global variable.
2308   //
2309   // For global address space:
2310   //   s_getpc_b64 s[0:1]
2311   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2312   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2313   //
2314   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2315   //   fixups or relocations are emitted to replace $symbol@*@lo and
2316   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2317   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2318   //   operand to the global variable.
2319   //
2320   // What we want here is an offset from the value returned by s_getpc
2321   // (which is the address of the s_add_u32 instruction) to the global
2322   // variable, but since the encoding of $symbol starts 4 bytes after the start
2323   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2324   // small. This requires us to add 4 to the global variable offset in order to
2325   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2326   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2327   // instruction.
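  // For example, if s_getpc_b64 returns address A (the start of the 8-byte
  // s_add_u32), the $symbol literal of s_add_u32 is encoded at A+4 and the
  // literal of s_addc_u32 at A+12, which is where the +4 and +12 addends
  // below come from.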
2328 
2329   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2330 
2331   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2332     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2333 
2334   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2335     .addDef(PCReg);
2336 
2337   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2338   if (GAFlags == SIInstrInfo::MO_NONE)
2339     MIB.addImm(0);
2340   else
2341     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2342 
2343   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2344 
2345   if (PtrTy.getSizeInBits() == 32)
2346     B.buildExtract(DstReg, PCReg, 0);
2347   return true;
2348  }
2349 
2350 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2351   MachineInstr &MI, MachineRegisterInfo &MRI,
2352   MachineIRBuilder &B) const {
2353   Register DstReg = MI.getOperand(0).getReg();
2354   LLT Ty = MRI.getType(DstReg);
2355   unsigned AS = Ty.getAddressSpace();
2356 
2357   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2358   MachineFunction &MF = B.getMF();
2359   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2360 
2361   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2362     if (!MFI->isModuleEntryFunction() &&
2363         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2364       const Function &Fn = MF.getFunction();
2365       DiagnosticInfoUnsupported BadLDSDecl(
2366         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2367         DS_Warning);
2368       Fn.getContext().diagnose(BadLDSDecl);
2369 
2370       // We currently don't have a way to correctly allocate LDS objects that
2371       // aren't directly associated with a kernel. We do force inlining of
2372       // functions that use local objects. However, if these dead functions are
2373       // not eliminated, we don't want a compile time error. Just emit a warning
2374       // and a trap, since there should be no callable path here.
2375       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2376       B.buildUndef(DstReg);
2377       MI.eraseFromParent();
2378       return true;
2379     }
2380 
2381     // TODO: We could emit code to handle the initialization somewhere.
2382     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2383       const SITargetLowering *TLI = ST.getTargetLowering();
2384       if (!TLI->shouldUseLDSConstAddress(GV)) {
2385         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2386         return true; // Leave in place;
2387       }
2388 
2389       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2390         Type *Ty = GV->getValueType();
2391         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2392         // zero-sized type in other languages to declare the dynamic shared
2393         // memory whose size is not known at compile time. They will be
2394         // allocated by the runtime and placed directly after the static
2395         // allocated ones. They all share the same offset.
2396         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2397           // Adjust alignment for that dynamic shared memory array.
2398           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2399           LLT S32 = LLT::scalar(32);
2400           auto Sz =
2401               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2402           B.buildIntToPtr(DstReg, Sz);
2403           MI.eraseFromParent();
2404           return true;
2405         }
2406       }
2407 
2408       B.buildConstant(
2409           DstReg,
2410           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2411       MI.eraseFromParent();
2412       return true;
2413     }
2414 
2415     const Function &Fn = MF.getFunction();
2416     DiagnosticInfoUnsupported BadInit(
2417       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2418     Fn.getContext().diagnose(BadInit);
2419     return true;
2420   }
2421 
2422   const SITargetLowering *TLI = ST.getTargetLowering();
2423 
2424   if (TLI->shouldEmitFixup(GV)) {
2425     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2426     MI.eraseFromParent();
2427     return true;
2428   }
2429 
2430   if (TLI->shouldEmitPCReloc(GV)) {
2431     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2432     MI.eraseFromParent();
2433     return true;
2434   }
2435 
2436   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2437   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2438 
2439   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2440   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2441       MachinePointerInfo::getGOT(MF),
2442       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2443           MachineMemOperand::MOInvariant,
2444       LoadTy, Align(8));
2445 
2446   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2447 
2448   if (Ty.getSizeInBits() == 32) {
2449     // Truncate if this is a 32-bit constant address.
2450     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2451     B.buildExtract(DstReg, Load, 0);
2452   } else
2453     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2454 
2455   MI.eraseFromParent();
2456   return true;
2457 }
2458 
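// E.g. <3 x s32> widens to <4 x s32> and s48 widens to s64; power-of-2 sized
// types are returned unchanged.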
2459 static LLT widenToNextPowerOf2(LLT Ty) {
2460   if (Ty.isVector())
2461     return Ty.changeElementCount(
2462         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2463   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2464 }
2465 
2466 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2467                                        MachineInstr &MI) const {
2468   MachineIRBuilder &B = Helper.MIRBuilder;
2469   MachineRegisterInfo &MRI = *B.getMRI();
2470   GISelChangeObserver &Observer = Helper.Observer;
2471 
2472   Register PtrReg = MI.getOperand(1).getReg();
2473   LLT PtrTy = MRI.getType(PtrReg);
2474   unsigned AddrSpace = PtrTy.getAddressSpace();
2475 
2476   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2477     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2478     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2479     Observer.changingInstr(MI);
2480     MI.getOperand(1).setReg(Cast.getReg(0));
2481     Observer.changedInstr(MI);
2482     return true;
2483   }
2484 
2485   if (MI.getOpcode() != AMDGPU::G_LOAD)
2486     return false;
2487 
2488   Register ValReg = MI.getOperand(0).getReg();
2489   LLT ValTy = MRI.getType(ValReg);
2490 
2491   MachineMemOperand *MMO = *MI.memoperands_begin();
2492   const unsigned ValSize = ValTy.getSizeInBits();
2493   const LLT MemTy = MMO->getMemoryType();
2494   const Align MemAlign = MMO->getAlign();
2495   const unsigned MemSize = MemTy.getSizeInBits();
2496   const unsigned AlignInBits = 8 * MemAlign.value();
2497 
2498   // Widen non-power-of-2 loads to the alignment if needed
2499   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2500     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2501 
2502     // This was already the correct extending load result type, so just adjust
2503     // the memory type.
2504     if (WideMemSize == ValSize) {
2505       MachineFunction &MF = B.getMF();
2506 
2507       MachineMemOperand *WideMMO =
2508           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2509       Observer.changingInstr(MI);
2510       MI.setMemRefs(MF, {WideMMO});
2511       Observer.changedInstr(MI);
2512       return true;
2513     }
2514 
2515     // Don't bother handling edge case that should probably never be produced.
2516     if (ValSize > WideMemSize)
2517       return false;
2518 
2519     LLT WideTy = widenToNextPowerOf2(ValTy);
2520 
2521     Register WideLoad;
2522     if (!WideTy.isVector()) {
2523       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2524       B.buildTrunc(ValReg, WideLoad).getReg(0);
2525     } else {
2526       // Extract the subvector.
2527 
2528       if (isRegisterType(ValTy)) {
2529         // If this a case where G_EXTRACT is legal, use it.
2530         // (e.g. <3 x s32> -> <4 x s32>)
2531         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2532         B.buildExtract(ValReg, WideLoad, 0);
2533       } else {
2534         // For cases where the widened type isn't a nice register value, unmerge
2535         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2536         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2537         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2538         B.setInsertPt(B.getMBB(), MI.getIterator());
2539         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2540       }
2541     }
2542 
2543     MI.eraseFromParent();
2544     return true;
2545   }
2546 
2547   return false;
2548 }
2549 
2550 bool AMDGPULegalizerInfo::legalizeFMad(
2551   MachineInstr &MI, MachineRegisterInfo &MRI,
2552   MachineIRBuilder &B) const {
2553   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2554   assert(Ty.isScalar());
2555 
2556   MachineFunction &MF = B.getMF();
2557   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2558 
2559   // TODO: Always legal with future ftz flag.
2560   // FIXME: Do we need just output?
2561   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2562     return true;
2563   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2564     return true;
2565 
2566   MachineIRBuilder HelperBuilder(MI);
2567   GISelObserverWrapper DummyObserver;
2568   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2569   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2570 }
2571 
2572 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2573   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2574   Register DstReg = MI.getOperand(0).getReg();
2575   Register PtrReg = MI.getOperand(1).getReg();
2576   Register CmpVal = MI.getOperand(2).getReg();
2577   Register NewVal = MI.getOperand(3).getReg();
2578 
2579   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2580          "this should not have been custom lowered");
2581 
2582   LLT ValTy = MRI.getType(CmpVal);
2583   LLT VecTy = LLT::fixed_vector(2, ValTy);
2584 
2585   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2586 
2587   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2588     .addDef(DstReg)
2589     .addUse(PtrReg)
2590     .addUse(PackedVal)
2591     .setMemRefs(MI.memoperands());
2592 
2593   MI.eraseFromParent();
2594   return true;
2595 }
2596 
2597 bool AMDGPULegalizerInfo::legalizeFlog(
2598   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2599   Register Dst = MI.getOperand(0).getReg();
2600   Register Src = MI.getOperand(1).getReg();
2601   LLT Ty = B.getMRI()->getType(Dst);
2602   unsigned Flags = MI.getFlags();
2603 
2604   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2605   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2606 
2607   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2608   MI.eraseFromParent();
2609   return true;
2610 }
2611 
2612 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2613                                        MachineIRBuilder &B) const {
2614   Register Dst = MI.getOperand(0).getReg();
2615   Register Src = MI.getOperand(1).getReg();
2616   unsigned Flags = MI.getFlags();
2617   LLT Ty = B.getMRI()->getType(Dst);
2618 
2619   auto K = B.buildFConstant(Ty, numbers::log2e);
2620   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2621   B.buildFExp2(Dst, Mul, Flags);
2622   MI.eraseFromParent();
2623   return true;
2624 }
2625 
2626 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2627                                        MachineIRBuilder &B) const {
2628   Register Dst = MI.getOperand(0).getReg();
2629   Register Src0 = MI.getOperand(1).getReg();
2630   Register Src1 = MI.getOperand(2).getReg();
2631   unsigned Flags = MI.getFlags();
2632   LLT Ty = B.getMRI()->getType(Dst);
2633   const LLT S16 = LLT::scalar(16);
2634   const LLT S32 = LLT::scalar(32);
2635 
2636   if (Ty == S32) {
2637     auto Log = B.buildFLog2(S32, Src0, Flags);
2638     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2639       .addUse(Log.getReg(0))
2640       .addUse(Src1)
2641       .setMIFlags(Flags);
2642     B.buildFExp2(Dst, Mul, Flags);
2643   } else if (Ty == S16) {
2644     // There's no f16 fmul_legacy, so we need to convert for it.
2645     auto Log = B.buildFLog2(S16, Src0, Flags);
2646     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2647     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2648     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2649       .addUse(Ext0.getReg(0))
2650       .addUse(Ext1.getReg(0))
2651       .setMIFlags(Flags);
2652 
2653     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2654   } else
2655     return false;
2656 
2657   MI.eraseFromParent();
2658   return true;
2659 }
2660 
2661 // Find a source register, ignoring any possible source modifiers.
2662 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2663   Register ModSrc = OrigSrc;
2664   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2665     ModSrc = SrcFNeg->getOperand(1).getReg();
2666     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2667       ModSrc = SrcFAbs->getOperand(1).getReg();
2668   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2669     ModSrc = SrcFAbs->getOperand(1).getReg();
2670   return ModSrc;
2671 }
2672 
2673 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2674                                          MachineRegisterInfo &MRI,
2675                                          MachineIRBuilder &B) const {
2676 
2677   const LLT S1 = LLT::scalar(1);
2678   const LLT S64 = LLT::scalar(64);
2679   Register Dst = MI.getOperand(0).getReg();
2680   Register OrigSrc = MI.getOperand(1).getReg();
2681   unsigned Flags = MI.getFlags();
2682   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2683          "this should not have been custom lowered");
2684 
2685   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2686   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2687   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2688   // V_FRACT bug is:
2689   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2690   //
2691   // Convert floor(x) to (x - fract(x))
2692 
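  // A quick sanity check of the identity: floor(2.75) = 2.75 - fract(2.75)
  // = 2.75 - 0.75 = 2.0. The constant below, 0x3fefffffffffffff, is the
  // largest double less than 1.0, used to clamp the buggy V_FRACT result as
  // described above.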
2693   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2694     .addUse(OrigSrc)
2695     .setMIFlags(Flags);
2696 
2697   // Give source modifier matching some assistance before obscuring a foldable
2698   // pattern.
2699 
2700   // TODO: We can avoid the neg on the fract? The input sign to fract
2701   // shouldn't matter?
2702   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2703 
2704   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2705 
2706   Register Min = MRI.createGenericVirtualRegister(S64);
2707 
2708   // We don't need to concern ourselves with the snan handling difference, so
2709   // use the one which will directly select.
2710   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2711   if (MFI->getMode().IEEE)
2712     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2713   else
2714     B.buildFMinNum(Min, Fract, Const, Flags);
2715 
2716   Register CorrectedFract = Min;
2717   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2718     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2719     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2720   }
2721 
2722   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2723   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2724 
2725   MI.eraseFromParent();
2726   return true;
2727 }
2728 
2729 // Turn an illegal packed v2s16 build vector into bit operations.
2730 // TODO: This should probably be a bitcast action in LegalizerHelper.
2731 bool AMDGPULegalizerInfo::legalizeBuildVector(
2732   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2733   Register Dst = MI.getOperand(0).getReg();
2734   const LLT S32 = LLT::scalar(32);
2735   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2736 
2737   Register Src0 = MI.getOperand(1).getReg();
2738   Register Src1 = MI.getOperand(2).getReg();
2739   assert(MRI.getType(Src0) == LLT::scalar(16));
2740 
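  // The result is built with bit operations rather than a real vector op,
  // roughly:
  //   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
  // becomes
  //   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
  //   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)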
2741   auto Merge = B.buildMerge(S32, {Src0, Src1});
2742   B.buildBitcast(Dst, Merge);
2743 
2744   MI.eraseFromParent();
2745   return true;
2746 }
2747 
2748 // Check that this is a G_XOR x, -1
2749 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2750   if (MI.getOpcode() != TargetOpcode::G_XOR)
2751     return false;
2752   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2753   return ConstVal && *ConstVal == -1;
2754 }
2755 
2756 // Return the use branch instruction, or null if the usage is invalid.
2757 static MachineInstr *
2758 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2759                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2760   Register CondDef = MI.getOperand(0).getReg();
2761   if (!MRI.hasOneNonDBGUse(CondDef))
2762     return nullptr;
2763 
2764   MachineBasicBlock *Parent = MI.getParent();
2765   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2766 
2767   if (isNot(MRI, *UseMI)) {
2768     Register NegatedCond = UseMI->getOperand(0).getReg();
2769     if (!MRI.hasOneNonDBGUse(NegatedCond))
2770       return nullptr;
2771 
2772     // We're deleting the def of this value, so we need to remove it.
2773     UseMI->eraseFromParent();
2774 
2775     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2776     Negated = true;
2777   }
2778 
2779   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2780     return nullptr;
2781 
2782   // Make sure the cond br is followed by a G_BR, or is the last instruction.
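  // The shape being matched here is roughly:
  //   %cond:s1, ... = G_INTRINSIC ...
  //   [%neg:s1 = G_XOR %cond, -1]        ; optional, handled above
  //   G_BRCOND %cond (or %neg), %bb.target
  //   G_BR %bb.other                     ; or fall through to the next block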
2783   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2784   if (Next == Parent->end()) {
2785     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2786     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2787       return nullptr;
2788     UncondBrTarget = &*NextMBB;
2789   } else {
2790     if (Next->getOpcode() != AMDGPU::G_BR)
2791       return nullptr;
2792     Br = &*Next;
2793     UncondBrTarget = Br->getOperand(0).getMBB();
2794   }
2795 
2796   return UseMI;
2797 }
2798 
2799 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2800                                          const ArgDescriptor *Arg,
2801                                          const TargetRegisterClass *ArgRC,
2802                                          LLT ArgTy) const {
2803   MCRegister SrcReg = Arg->getRegister();
2804   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2805   assert(DstReg.isVirtual() && "Virtual register expected");
2806 
2807   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2808                                              ArgTy);
2809   if (Arg->isMasked()) {
2810     // TODO: Should we try to emit this once in the entry block?
2811     const LLT S32 = LLT::scalar(32);
2812     const unsigned Mask = Arg->getMask();
2813     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
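    // For example, an argument described by mask 0xffc00 gives Shift = 10, so
    // the value materialized below is (LiveIn >> 10) & 0x3ff.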
2814 
2815     Register AndMaskSrc = LiveIn;
2816 
2817     if (Shift != 0) {
2818       auto ShiftAmt = B.buildConstant(S32, Shift);
2819       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2820     }
2821 
2822     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2823   } else {
2824     B.buildCopy(DstReg, LiveIn);
2825   }
2826 
2827   return true;
2828 }
2829 
2830 bool AMDGPULegalizerInfo::loadInputValue(
2831     Register DstReg, MachineIRBuilder &B,
2832     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2833   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2834   const ArgDescriptor *Arg;
2835   const TargetRegisterClass *ArgRC;
2836   LLT ArgTy;
2837   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2838 
2839   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2840     return false; // TODO: Handle these
2841   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2842 }
2843 
2844 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2845     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2846     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2847   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2848     return false;
2849 
2850   MI.eraseFromParent();
2851   return true;
2852 }
2853 
2854 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2855                                        MachineRegisterInfo &MRI,
2856                                        MachineIRBuilder &B) const {
2857   Register Dst = MI.getOperand(0).getReg();
2858   LLT DstTy = MRI.getType(Dst);
2859   LLT S16 = LLT::scalar(16);
2860   LLT S32 = LLT::scalar(32);
2861   LLT S64 = LLT::scalar(64);
2862 
2863   if (DstTy == S16)
2864     return legalizeFDIV16(MI, MRI, B);
2865   if (DstTy == S32)
2866     return legalizeFDIV32(MI, MRI, B);
2867   if (DstTy == S64)
2868     return legalizeFDIV64(MI, MRI, B);
2869 
2870   return false;
2871 }
2872 
2873 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2874                                                         Register DstDivReg,
2875                                                         Register DstRemReg,
2876                                                         Register X,
2877                                                         Register Y) const {
2878   const LLT S1 = LLT::scalar(1);
2879   const LLT S32 = LLT::scalar(32);
2880 
2881   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2882   // algorithm used here.
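  // Roughly: compute z ~= 2^32 / y from the f32 reciprocal, sharpen it with
  // one Newton-Raphson style step carried out in integer arithmetic (the
  // umulh of -y*z below), then form q = umulh(x, z) and r = x - q*y. The
  // estimate can be slightly low, so the two compare/select rounds afterwards
  // bump q and reduce r until r < y.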
2883 
2884   // Initial estimate of inv(y).
2885   auto FloatY = B.buildUITOFP(S32, Y);
2886   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2887   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2888   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2889   auto Z = B.buildFPTOUI(S32, ScaledY);
2890 
2891   // One round of UNR.
2892   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2893   auto NegYZ = B.buildMul(S32, NegY, Z);
2894   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2895 
2896   // Quotient/remainder estimate.
2897   auto Q = B.buildUMulH(S32, X, Z);
2898   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2899 
2900   // First quotient/remainder refinement.
2901   auto One = B.buildConstant(S32, 1);
2902   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2903   if (DstDivReg)
2904     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2905   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2906 
2907   // Second quotient/remainder refinement.
2908   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2909   if (DstDivReg)
2910     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2911 
2912   if (DstRemReg)
2913     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2914 }
2915 
2916 // Build an integer reciprocal sequence around V_RCP_IFLAG_F32.
2917 //
2918 // Return lo, hi of result
2919 //
2920 // %cvt.lo = G_UITOFP Val.lo
2921 // %cvt.hi = G_UITOFP Val.hi
2922 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2923 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2924 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2925 // %mul2 = G_FMUL %mul1, 2**(-32)
2926 // %trunc = G_INTRINSIC_TRUNC %mul2
2927 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2928 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
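//
// For reference, the f32 constants used below: 0x4f800000 is 2**32,
// 0x2f800000 is 2**(-32), 0xcf800000 is -(2**32), and 0x5f7ffffc sits just
// below 2**64, presumably so the scaled reciprocal estimate rounds down
// rather than overshooting.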
2929 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2930                                                        Register Val) {
2931   const LLT S32 = LLT::scalar(32);
2932   auto Unmerge = B.buildUnmerge(S32, Val);
2933 
2934   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2935   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2936 
2937   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2938                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2939 
2940   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2941   auto Mul1 =
2942       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2943 
2944   // 2**(-32)
2945   auto Mul2 =
2946       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2947   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2948 
2949   // -(2**32)
2950   auto Mad2 = B.buildFMAD(S32, Trunc,
2951                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2952 
2953   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2954   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2955 
2956   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2957 }
2958 
2959 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2960                                                         Register DstDivReg,
2961                                                         Register DstRemReg,
2962                                                         Register Numer,
2963                                                         Register Denom) const {
2964   const LLT S32 = LLT::scalar(32);
2965   const LLT S64 = LLT::scalar(64);
2966   const LLT S1 = LLT::scalar(1);
2967   Register RcpLo, RcpHi;
2968 
2969   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2970 
2971   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2972 
2973   auto Zero64 = B.buildConstant(S64, 0);
2974   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2975 
2976   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2977   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2978 
2979   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2980   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2981   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2982 
2983   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2984   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2985   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2986   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2987 
2988   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2989   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2990   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2991   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2992   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2993 
2994   auto Zero32 = B.buildConstant(S32, 0);
2995   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2996   auto Add2_HiC =
2997       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2998   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2999   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3000 
3001   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3002   Register NumerLo = UnmergeNumer.getReg(0);
3003   Register NumerHi = UnmergeNumer.getReg(1);
3004 
3005   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3006   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3007   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3008   Register Mul3_Lo = UnmergeMul3.getReg(0);
3009   Register Mul3_Hi = UnmergeMul3.getReg(1);
3010   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3011   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3012   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3013   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3014 
3015   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3016   Register DenomLo = UnmergeDenom.getReg(0);
3017   Register DenomHi = UnmergeDenom.getReg(1);
3018 
3019   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3020   auto C1 = B.buildSExt(S32, CmpHi);
3021 
3022   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3023   auto C2 = B.buildSExt(S32, CmpLo);
3024 
3025   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3026   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3027 
3028   // TODO: Here and below, portions of the code could be enclosed in if/endif
3029   // blocks. Currently the control flow is unconditional, and we have 4 selects
3030   // after the potential endif to substitute for PHIs.
3031 
3032   // if C3 != 0 ...
3033   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3034   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3035   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3036   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3037 
3038   auto One64 = B.buildConstant(S64, 1);
3039   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3040 
3041   auto C4 =
3042       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3043   auto C5 =
3044       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3045   auto C6 = B.buildSelect(
3046       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3047 
3048   // if (C6 != 0)
3049   auto Add4 = B.buildAdd(S64, Add3, One64);
3050   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3051 
3052   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3053   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3054   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3055 
3056   // endif C6
3057   // endif C3
3058 
3059   if (DstDivReg) {
3060     auto Sel1 = B.buildSelect(
3061         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3062     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3063                   Sel1, MulHi3);
3064   }
3065 
3066   if (DstRemReg) {
3067     auto Sel2 = B.buildSelect(
3068         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3069     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3070                   Sel2, Sub1);
3071   }
3072 }
3073 
3074 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3075                                                   MachineRegisterInfo &MRI,
3076                                                   MachineIRBuilder &B) const {
3077   Register DstDivReg, DstRemReg;
3078   switch (MI.getOpcode()) {
3079   default:
3080     llvm_unreachable("Unexpected opcode!");
3081   case AMDGPU::G_UDIV: {
3082     DstDivReg = MI.getOperand(0).getReg();
3083     break;
3084   }
3085   case AMDGPU::G_UREM: {
3086     DstRemReg = MI.getOperand(0).getReg();
3087     break;
3088   }
3089   case AMDGPU::G_UDIVREM: {
3090     DstDivReg = MI.getOperand(0).getReg();
3091     DstRemReg = MI.getOperand(1).getReg();
3092     break;
3093   }
3094   }
3095 
3096   const LLT S64 = LLT::scalar(64);
3097   const LLT S32 = LLT::scalar(32);
3098   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3099   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3100   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3101   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3102 
3103   if (Ty == S32)
3104     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3105   else if (Ty == S64)
3106     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3107   else
3108     return false;
3109 
3110   MI.eraseFromParent();
3111   return true;
3112 }
3113 
3114 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3115                                                 MachineRegisterInfo &MRI,
3116                                                 MachineIRBuilder &B) const {
3117   const LLT S64 = LLT::scalar(64);
3118   const LLT S32 = LLT::scalar(32);
3119 
3120   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3121   if (Ty != S32 && Ty != S64)
3122     return false;
3123 
3124   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3125   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3126   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3127 
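  // Take absolute values via the usual two's-complement trick: with
  // sign = x >> (bits - 1) (all ones for negative x, zero otherwise),
  // (x + sign) ^ sign == |x|. E.g. x = -7: sign = -1, (-7 + -1) ^ -1 = 7.
  // The unsigned result is fixed up afterwards: the quotient gets the sign
  // LHSign ^ RHSign, and the remainder takes the sign of the dividend.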
3128   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3129   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3130   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3131 
3132   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3133   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3134 
3135   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3136   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3137 
3138   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3139   switch (MI.getOpcode()) {
3140   default:
3141     llvm_unreachable("Unexpected opcode!");
3142   case AMDGPU::G_SDIV: {
3143     DstDivReg = MI.getOperand(0).getReg();
3144     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3145     break;
3146   }
3147   case AMDGPU::G_SREM: {
3148     DstRemReg = MI.getOperand(0).getReg();
3149     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3150     break;
3151   }
3152   case AMDGPU::G_SDIVREM: {
3153     DstDivReg = MI.getOperand(0).getReg();
3154     DstRemReg = MI.getOperand(1).getReg();
3155     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3156     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3157     break;
3158   }
3159   }
3160 
3161   if (Ty == S32)
3162     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3163   else
3164     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3165 
3166   if (DstDivReg) {
3167     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3168     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3169     B.buildSub(DstDivReg, SignXor, Sign);
3170   }
3171 
3172   if (DstRemReg) {
3173     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3174     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3175     B.buildSub(DstRemReg, SignXor, Sign);
3176   }
3177 
3178   MI.eraseFromParent();
3179   return true;
3180 }
3181 
3182 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3183                                                  MachineRegisterInfo &MRI,
3184                                                  MachineIRBuilder &B) const {
3185   Register Res = MI.getOperand(0).getReg();
3186   Register LHS = MI.getOperand(1).getReg();
3187   Register RHS = MI.getOperand(2).getReg();
3188   uint16_t Flags = MI.getFlags();
3189   LLT ResTy = MRI.getType(Res);
3190 
3191   const MachineFunction &MF = B.getMF();
3192   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3193                             MI.getFlag(MachineInstr::FmAfn);
3194 
3195   if (!AllowInaccurateRcp)
3196     return false;
3197 
3198   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3199     // 1 / x -> RCP(x)
3200     if (CLHS->isExactlyValue(1.0)) {
3201       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3202         .addUse(RHS)
3203         .setMIFlags(Flags);
3204 
3205       MI.eraseFromParent();
3206       return true;
3207     }
3208 
3209     // -1 / x -> RCP( FNEG(x) )
3210     if (CLHS->isExactlyValue(-1.0)) {
3211       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3212       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3213         .addUse(FNeg.getReg(0))
3214         .setMIFlags(Flags);
3215 
3216       MI.eraseFromParent();
3217       return true;
3218     }
3219   }
3220 
3221   // x / y -> x * (1.0 / y)
3222   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3223     .addUse(RHS)
3224     .setMIFlags(Flags);
3225   B.buildFMul(Res, LHS, RCP, Flags);
3226 
3227   MI.eraseFromParent();
3228   return true;
3229 }
3230 
3231 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3232                                                    MachineRegisterInfo &MRI,
3233                                                    MachineIRBuilder &B) const {
3234   Register Res = MI.getOperand(0).getReg();
3235   Register X = MI.getOperand(1).getReg();
3236   Register Y = MI.getOperand(2).getReg();
3237   uint16_t Flags = MI.getFlags();
3238   LLT ResTy = MRI.getType(Res);
3239 
3240   const MachineFunction &MF = B.getMF();
3241   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3242                             MI.getFlag(MachineInstr::FmAfn);
3243 
3244   if (!AllowInaccurateRcp)
3245     return false;
3246 
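  // Refine an initial rcp(y) estimate with two Newton-Raphson steps written
  // as FMAs: e = fma(-y, r, 1) = 1 - y*r, then r' = fma(e, r, r) = r*(2 - y*r),
  // which roughly doubles the number of correct bits per step. The final pair
  // of instructions forms q = x*r and corrects it once with the residual
  // t = fma(-y, q, x).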
3247   auto NegY = B.buildFNeg(ResTy, Y);
3248   auto One = B.buildFConstant(ResTy, 1.0);
3249 
3250   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3251     .addUse(Y)
3252     .setMIFlags(Flags);
3253 
3254   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3255   R = B.buildFMA(ResTy, Tmp0, R, R);
3256 
3257   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3258   R = B.buildFMA(ResTy, Tmp1, R, R);
3259 
3260   auto Ret = B.buildFMul(ResTy, X, R);
3261   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3262 
3263   B.buildFMA(Res, Tmp2, R, Ret);
3264   MI.eraseFromParent();
3265   return true;
3266 }
3267 
3268 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3269                                          MachineRegisterInfo &MRI,
3270                                          MachineIRBuilder &B) const {
3271   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3272     return true;
3273 
3274   Register Res = MI.getOperand(0).getReg();
3275   Register LHS = MI.getOperand(1).getReg();
3276   Register RHS = MI.getOperand(2).getReg();
3277 
3278   uint16_t Flags = MI.getFlags();
3279 
3280   LLT S16 = LLT::scalar(16);
3281   LLT S32 = LLT::scalar(32);
3282 
3283   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3284   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3285 
3286   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3287     .addUse(RHSExt.getReg(0))
3288     .setMIFlags(Flags);
3289 
3290   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3291   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3292 
3293   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3294     .addUse(RDst.getReg(0))
3295     .addUse(RHS)
3296     .addUse(LHS)
3297     .setMIFlags(Flags);
3298 
3299   MI.eraseFromParent();
3300   return true;
3301 }
3302 
3303 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3304 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3305 static void toggleSPDenormMode(bool Enable,
3306                                MachineIRBuilder &B,
3307                                const GCNSubtarget &ST,
3308                                AMDGPU::SIModeRegisterDefaults Mode) {
3309   // Set SP denorm mode to this value.
3310   unsigned SPDenormMode =
3311     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3312 
3313   if (ST.hasDenormModeInst()) {
3314     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3315     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3316 
3317     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3318     B.buildInstr(AMDGPU::S_DENORM_MODE)
3319       .addImm(NewDenormModeValue);
3320 
3321   } else {
3322     // Select FP32 bit field in mode register.
3323     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3324                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3325                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
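    // This encodes, roughly, hwreg(HW_REG_MODE, 4, 2): offset 4, width 2
    // (the width field stores width - 1), i.e. the two FP_DENORM bits that
    // control single-precision denormals.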
3326 
3327     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3328       .addImm(SPDenormMode)
3329       .addImm(SPDenormModeBitField);
3330   }
3331 }
3332 
3333 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3334                                          MachineRegisterInfo &MRI,
3335                                          MachineIRBuilder &B) const {
3336   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3337     return true;
3338 
3339   Register Res = MI.getOperand(0).getReg();
3340   Register LHS = MI.getOperand(1).getReg();
3341   Register RHS = MI.getOperand(2).getReg();
3342   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3343   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3344 
3345   uint16_t Flags = MI.getFlags();
3346 
3347   LLT S32 = LLT::scalar(32);
3348   LLT S1 = LLT::scalar(1);
3349 
3350   auto One = B.buildFConstant(S32, 1.0f);
3351 
3352   auto DenominatorScaled =
3353     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3354       .addUse(LHS)
3355       .addUse(RHS)
3356       .addImm(0)
3357       .setMIFlags(Flags);
3358   auto NumeratorScaled =
3359     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3360       .addUse(LHS)
3361       .addUse(RHS)
3362       .addImm(1)
3363       .setMIFlags(Flags);
3364 
3365   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3366     .addUse(DenominatorScaled.getReg(0))
3367     .setMIFlags(Flags);
3368   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
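  // The sequence below is the usual scaled Newton-Raphson division, with d and
  // n being the div_scale results: Fma0 is the reciprocal error 1 - d*r, Fma1
  // the refined reciprocal, Mul the first quotient estimate n*r, Fma2/Fma3 a
  // residual-based refinement of that quotient, and Fma4 the final residual
  // that div_fmas uses for the last rounding step.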
3369 
3370   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3371   // aren't modeled as reading it.
3372   if (!Mode.allFP32Denormals())
3373     toggleSPDenormMode(true, B, ST, Mode);
3374 
3375   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3376   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3377   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3378   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3379   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3380   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3381 
3382   if (!Mode.allFP32Denormals())
3383     toggleSPDenormMode(false, B, ST, Mode);
3384 
3385   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3386     .addUse(Fma4.getReg(0))
3387     .addUse(Fma1.getReg(0))
3388     .addUse(Fma3.getReg(0))
3389     .addUse(NumeratorScaled.getReg(1))
3390     .setMIFlags(Flags);
3391 
3392   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3393     .addUse(Fmas.getReg(0))
3394     .addUse(RHS)
3395     .addUse(LHS)
3396     .setMIFlags(Flags);
3397 
3398   MI.eraseFromParent();
3399   return true;
3400 }
3401 
3402 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3403                                          MachineRegisterInfo &MRI,
3404                                          MachineIRBuilder &B) const {
3405   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3406     return true;
3407 
3408   Register Res = MI.getOperand(0).getReg();
3409   Register LHS = MI.getOperand(1).getReg();
3410   Register RHS = MI.getOperand(2).getReg();
3411 
3412   uint16_t Flags = MI.getFlags();
3413 
3414   LLT S64 = LLT::scalar(64);
3415   LLT S1 = LLT::scalar(1);
3416 
3417   auto One = B.buildFConstant(S64, 1.0);
3418 
3419   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3420     .addUse(LHS)
3421     .addUse(RHS)
3422     .addImm(0)
3423     .setMIFlags(Flags);
3424 
3425   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3426 
3427   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3428     .addUse(DivScale0.getReg(0))
3429     .setMIFlags(Flags);
3430 
3431   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3432   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3433   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3434 
3435   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3436     .addUse(LHS)
3437     .addUse(RHS)
3438     .addImm(1)
3439     .setMIFlags(Flags);
3440 
3441   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3442   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3443   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3444 
3445   Register Scale;
3446   if (!ST.hasUsableDivScaleConditionOutput()) {
3447     // Work around a hardware bug on SI where the condition output from div_scale
3448     // is not usable.
3449 
3450     LLT S32 = LLT::scalar(32);
3451 
3452     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3453     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3454     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3455     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3456 
3457     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3458                               Scale1Unmerge.getReg(1));
3459     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3460                               Scale0Unmerge.getReg(1));
3461     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3462   } else {
3463     Scale = DivScale1.getReg(1);
3464   }
3465 
3466   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3467     .addUse(Fma4.getReg(0))
3468     .addUse(Fma3.getReg(0))
3469     .addUse(Mul.getReg(0))
3470     .addUse(Scale)
3471     .setMIFlags(Flags);
3472 
3473   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3474     .addUse(Fmas.getReg(0))
3475     .addUse(RHS)
3476     .addUse(LHS)
3477     .setMIFlags(Flags);
3478 
3479   MI.eraseFromParent();
3480   return true;
3481 }
3482 
3483 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3484                                                  MachineRegisterInfo &MRI,
3485                                                  MachineIRBuilder &B) const {
3486   Register Res = MI.getOperand(0).getReg();
3487   Register LHS = MI.getOperand(2).getReg();
3488   Register RHS = MI.getOperand(3).getReg();
3489   uint16_t Flags = MI.getFlags();
3490 
3491   LLT S32 = LLT::scalar(32);
3492   LLT S1 = LLT::scalar(1);
3493 
3494   auto Abs = B.buildFAbs(S32, RHS, Flags);
3495   const APFloat C0Val(1.0f);
3496 
3497   auto C0 = B.buildConstant(S32, 0x6f800000);
3498   auto C1 = B.buildConstant(S32, 0x2f800000);
3499   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3500 
3501   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3502   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
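  // If |RHS| is very large (above 2**96, the 0x6f800000 constant), pre-scale
  // it by 2**(-32) before the rcp so the reciprocal stays representable, and
  // apply the same scale to the product afterwards:
  //   x / y ~= S * (x * rcp(y * S)), with S either 1.0 or 2**(-32).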
3503 
3504   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3505 
3506   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3507     .addUse(Mul0.getReg(0))
3508     .setMIFlags(Flags);
3509 
3510   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3511 
3512   B.buildFMul(Res, Sel, Mul1, Flags);
3513 
3514   MI.eraseFromParent();
3515   return true;
3516 }
3517 
3518 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3519 // FIXME: Why do we handle this one but not other removed instructions?
3520 //
3521 // Reciprocal square root.  The clamp prevents infinite results, clamping
3522 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3523 // +-max_float.
3524 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3525                                                     MachineRegisterInfo &MRI,
3526                                                     MachineIRBuilder &B) const {
3527   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3528     return true;
3529 
3530   Register Dst = MI.getOperand(0).getReg();
3531   Register Src = MI.getOperand(2).getReg();
3532   auto Flags = MI.getFlags();
3533 
3534   LLT Ty = MRI.getType(Dst);
3535 
3536   const fltSemantics *FltSemantics;
3537   if (Ty == LLT::scalar(32))
3538     FltSemantics = &APFloat::IEEEsingle();
3539   else if (Ty == LLT::scalar(64))
3540     FltSemantics = &APFloat::IEEEdouble();
3541   else
3542     return false;
3543 
3544   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3545     .addUse(Src)
3546     .setMIFlags(Flags);
3547 
3548   // We don't need to concern ourselves with the snan handling difference,
3549   // since the rsq already quieted the result (or not); use the one which will directly select.
3550   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3551   const bool UseIEEE = MFI->getMode().IEEE;
3552 
3553   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3554   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3555                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3556 
3557   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3558 
3559   if (UseIEEE)
3560     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3561   else
3562     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3563   MI.eraseFromParent();
3564   return true;
3565 }
3566 
3567 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3568   switch (IID) {
3569   case Intrinsic::amdgcn_ds_fadd:
3570     return AMDGPU::G_ATOMICRMW_FADD;
3571   case Intrinsic::amdgcn_ds_fmin:
3572     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3573   case Intrinsic::amdgcn_ds_fmax:
3574     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3575   default:
3576     llvm_unreachable("not a DS FP intrinsic");
3577   }
3578 }
3579 
3580 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3581                                                       MachineInstr &MI,
3582                                                       Intrinsic::ID IID) const {
3583   GISelChangeObserver &Observer = Helper.Observer;
3584   Observer.changingInstr(MI);
3585 
3586   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3587 
3588   // The remaining operands were used to set fields in the MemOperand on
3589   // construction.
3590   for (int I = 6; I > 3; --I)
3591     MI.RemoveOperand(I);
3592 
3593   MI.RemoveOperand(1); // Remove the intrinsic ID.
3594   Observer.changedInstr(MI);
3595   return true;
3596 }
3597 
3598 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3599                                             MachineRegisterInfo &MRI,
3600                                             MachineIRBuilder &B) const {
3601   uint64_t Offset =
3602     ST.getTargetLowering()->getImplicitParameterOffset(
3603       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3604   LLT DstTy = MRI.getType(DstReg);
3605   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3606 
3607   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3608   if (!loadInputValue(KernargPtrReg, B,
3609                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3610     return false;
3611 
3612   // FIXME: This should be nuw
3613   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3614   return true;
3615 }
3616 
3617 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3618                                                  MachineRegisterInfo &MRI,
3619                                                  MachineIRBuilder &B) const {
3620   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3621   if (!MFI->isEntryFunction()) {
3622     return legalizePreloadedArgIntrin(MI, MRI, B,
3623                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3624   }
3625 
3626   Register DstReg = MI.getOperand(0).getReg();
3627   if (!getImplicitArgPtr(DstReg, MRI, B))
3628     return false;
3629 
3630   MI.eraseFromParent();
3631   return true;
3632 }
3633 
3634 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3635                                               MachineRegisterInfo &MRI,
3636                                               MachineIRBuilder &B,
3637                                               unsigned AddrSpace) const {
3638   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3639   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3640   Register Hi32 = Unmerge.getReg(1);
3641 
3642   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3643   MI.eraseFromParent();
3644   return true;
3645 }
3646 
3647 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3648 // offset (the offset that is included in bounds checking and swizzling, to be
3649 // split between the instruction's voffset and immoffset fields) and soffset
3650 // (the offset that is excluded from bounds checking and swizzling, to go in
3651 // the instruction's soffset field).  This function takes the first kind of
3652 // offset and figures out how to split it between voffset and immoffset.
3653 std::pair<Register, unsigned>
3654 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3655                                         Register OrigOffset) const {
3656   const unsigned MaxImm = 4095;
3657   Register BaseReg;
3658   unsigned ImmOffset;
3659   const LLT S32 = LLT::scalar(32);
3660   MachineRegisterInfo &MRI = *B.getMRI();
3661 
3662   std::tie(BaseReg, ImmOffset) =
3663       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3664 
3665   // If BaseReg is a pointer, convert it to int.
3666   if (MRI.getType(BaseReg).isPointer())
3667     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3668 
3669   // If the immediate value is too big for the immoffset field, put the value
3670   // and -4096 into the immoffset field so that the value that is copied/added
3671   // for the voffset field is a multiple of 4096, and it stands a better chance
3672   // of being CSEd with the copy/add for another similar load/store.
3673   // However, do not do that rounding down to a multiple of 4096 if that is a
3674   // negative number, as it appears to be illegal to have a negative offset
3675   // in the vgpr, even if adding the immediate offset makes it positive.
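  //
  // For example, a constant offset of 8292 is split into ImmOffset = 100 with
  // 8192 folded into the voffset register, while anything up to 4095 stays
  // entirely in the immediate field.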
3676   unsigned Overflow = ImmOffset & ~MaxImm;
3677   ImmOffset -= Overflow;
3678   if ((int32_t)Overflow < 0) {
3679     Overflow += ImmOffset;
3680     ImmOffset = 0;
3681   }
3682 
3683   if (Overflow != 0) {
3684     if (!BaseReg) {
3685       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3686     } else {
3687       auto OverflowVal = B.buildConstant(S32, Overflow);
3688       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3689     }
3690   }
3691 
3692   if (!BaseReg)
3693     BaseReg = B.buildConstant(S32, 0).getReg(0);
3694 
3695   return std::make_pair(BaseReg, ImmOffset);
3696 }
3697 
3698 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3699 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3700                                           Register VOffset, Register SOffset,
3701                                           unsigned ImmOffset, Register VIndex,
3702                                           MachineRegisterInfo &MRI) const {
3703   Optional<ValueAndVReg> MaybeVOffsetVal =
3704       getConstantVRegValWithLookThrough(VOffset, MRI);
3705   Optional<ValueAndVReg> MaybeSOffsetVal =
3706       getConstantVRegValWithLookThrough(SOffset, MRI);
3707   Optional<ValueAndVReg> MaybeVIndexVal =
3708       getConstantVRegValWithLookThrough(VIndex, MRI);
3709   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3710   // update the MMO with that offset. The stride is unknown so we can only do
3711   // this if VIndex is constant 0.
3712   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3713       MaybeVIndexVal->Value == 0) {
3714     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3715                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3716     MMO->setOffset(TotalOffset);
3717   } else {
3718     // We don't have a constant combined offset to use in the MMO. Give up.
3719     MMO->setValue((Value *)nullptr);
3720   }
3721 }
3722 
3723 /// Handle register layout difference for f16 images for some subtargets.
3724 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3725                                              MachineRegisterInfo &MRI,
3726                                              Register Reg,
3727                                              bool ImageStore) const {
3728   const LLT S16 = LLT::scalar(16);
3729   const LLT S32 = LLT::scalar(32);
3730   LLT StoreVT = MRI.getType(Reg);
3731   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3732 
3733   if (ST.hasUnpackedD16VMem()) {
3734     auto Unmerge = B.buildUnmerge(S16, Reg);
3735 
3736     SmallVector<Register, 4> WideRegs;
3737     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3738       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3739 
3740     int NumElts = StoreVT.getNumElements();
3741 
3742     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3743         .getReg(0);
3744   }
3745 
3746   if (ImageStore && ST.hasImageStoreD16Bug()) {
3747     if (StoreVT.getNumElements() == 2) {
3748       SmallVector<Register, 4> PackedRegs;
3749       Reg = B.buildBitcast(S32, Reg).getReg(0);
3750       PackedRegs.push_back(Reg);
3751       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3752       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3753           .getReg(0);
3754     }
3755 
3756     if (StoreVT.getNumElements() == 3) {
3757       SmallVector<Register, 4> PackedRegs;
3758       auto Unmerge = B.buildUnmerge(S16, Reg);
3759       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3760         PackedRegs.push_back(Unmerge.getReg(I));
3761       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3762       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3763       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3764     }
3765 
3766     if (StoreVT.getNumElements() == 4) {
3767       SmallVector<Register, 4> PackedRegs;
3768       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3769       auto Unmerge = B.buildUnmerge(S32, Reg);
3770       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3771         PackedRegs.push_back(Unmerge.getReg(I));
3772       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3773       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3774           .getReg(0);
3775     }
3776 
3777     llvm_unreachable("invalid data type");
3778   }
3779 
3780   return Reg;
3781 }
3782 
3783 Register AMDGPULegalizerInfo::fixStoreSourceType(
3784   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3785   MachineRegisterInfo *MRI = B.getMRI();
3786   LLT Ty = MRI->getType(VData);
3787 
3788   const LLT S16 = LLT::scalar(16);
3789 
3790   // Fixup illegal register types for i8 stores.
3791   if (Ty == LLT::scalar(8) || Ty == S16) {
3792     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3793     return AnyExt;
3794   }
3795 
3796   if (Ty.isVector()) {
3797     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3798       if (IsFormat)
3799         return handleD16VData(B, *MRI, VData);
3800     }
3801   }
3802 
3803   return VData;
3804 }
3805 
3806 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3807                                               MachineRegisterInfo &MRI,
3808                                               MachineIRBuilder &B,
3809                                               bool IsTyped,
3810                                               bool IsFormat) const {
3811   Register VData = MI.getOperand(1).getReg();
3812   LLT Ty = MRI.getType(VData);
3813   LLT EltTy = Ty.getScalarType();
3814   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3815   const LLT S32 = LLT::scalar(32);
3816 
3817   VData = fixStoreSourceType(B, VData, IsFormat);
3818   Register RSrc = MI.getOperand(2).getReg();
3819 
3820   MachineMemOperand *MMO = *MI.memoperands_begin();
3821   const int MemSize = MMO->getSize();
3822 
3823   unsigned ImmOffset;
3824 
3825   // The typed intrinsics add an immediate after the registers.
3826   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3827 
3828   // The struct intrinsic variants add one additional operand over raw.
3829   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
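  // With no result defs, operand 0 is the intrinsic ID, so the layouts being
  // distinguished here are roughly:
  //   raw:    {id, vdata, rsrc, voffset, soffset, aux}
  //   struct: {id, vdata, rsrc, vindex, voffset, soffset, aux}
  // with the typed forms inserting a format immediate before aux.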
3830   Register VIndex;
3831   int OpOffset = 0;
3832   if (HasVIndex) {
3833     VIndex = MI.getOperand(3).getReg();
3834     OpOffset = 1;
3835   } else {
3836     VIndex = B.buildConstant(S32, 0).getReg(0);
3837   }
3838 
3839   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3840   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3841 
3842   unsigned Format = 0;
3843   if (IsTyped) {
3844     Format = MI.getOperand(5 + OpOffset).getImm();
3845     ++OpOffset;
3846   }
3847 
3848   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3849 
3850   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3851   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3852 
3853   unsigned Opc;
3854   if (IsTyped) {
3855     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3856                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3857   } else if (IsFormat) {
3858     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3859                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3860   } else {
3861     switch (MemSize) {
3862     case 1:
3863       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3864       break;
3865     case 2:
3866       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3867       break;
3868     default:
3869       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3870       break;
3871     }
3872   }
3873 
3874   auto MIB = B.buildInstr(Opc)
3875     .addUse(VData)              // vdata
3876     .addUse(RSrc)               // rsrc
3877     .addUse(VIndex)             // vindex
3878     .addUse(VOffset)            // voffset
3879     .addUse(SOffset)            // soffset
3880     .addImm(ImmOffset);         // offset(imm)
3881 
3882   if (IsTyped)
3883     MIB.addImm(Format);
3884 
3885   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3886      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3887      .addMemOperand(MMO);
3888 
3889   MI.eraseFromParent();
3890   return true;
3891 }
3892 
3893 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3894                                              MachineRegisterInfo &MRI,
3895                                              MachineIRBuilder &B,
3896                                              bool IsFormat,
3897                                              bool IsTyped) const {
3898   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3899   MachineMemOperand *MMO = *MI.memoperands_begin();
3900   const LLT MemTy = MMO->getMemoryType();
3901   const LLT S32 = LLT::scalar(32);
3902 
3903   Register Dst = MI.getOperand(0).getReg();
3904   Register RSrc = MI.getOperand(2).getReg();
3905 
3906   // The typed intrinsics add an immediate after the registers.
3907   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3908 
3909   // The struct intrinsic variants add one additional operand over raw.
3910   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3911   Register VIndex;
3912   int OpOffset = 0;
3913   if (HasVIndex) {
3914     VIndex = MI.getOperand(3).getReg();
3915     OpOffset = 1;
3916   } else {
3917     VIndex = B.buildConstant(S32, 0).getReg(0);
3918   }
3919 
3920   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3921   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3922 
3923   unsigned Format = 0;
3924   if (IsTyped) {
3925     Format = MI.getOperand(5 + OpOffset).getImm();
3926     ++OpOffset;
3927   }
3928 
3929   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3930   unsigned ImmOffset;
3931 
3932   LLT Ty = MRI.getType(Dst);
3933   LLT EltTy = Ty.getScalarType();
3934   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3935   const bool Unpacked = ST.hasUnpackedD16VMem();
3936 
3937   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3938   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3939 
3940   unsigned Opc;
3941 
3942   if (IsTyped) {
3943     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3944                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3945   } else if (IsFormat) {
3946     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3947                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3948   } else {
3949     switch (MemTy.getSizeInBits()) {
3950     case 8:
3951       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3952       break;
3953     case 16:
3954       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3955       break;
3956     default:
3957       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3958       break;
3959     }
3960   }
3961 
3962   Register LoadDstReg;
3963 
3964   bool IsExtLoad =
3965       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
3966   LLT UnpackedTy = Ty.changeElementSize(32);
3967 
3968   if (IsExtLoad)
3969     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3970   else if (Unpacked && IsD16 && Ty.isVector())
3971     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3972   else
3973     LoadDstReg = Dst;
3974 
3975   auto MIB = B.buildInstr(Opc)
3976     .addDef(LoadDstReg)         // vdata
3977     .addUse(RSrc)               // rsrc
3978     .addUse(VIndex)             // vindex
3979     .addUse(VOffset)            // voffset
3980     .addUse(SOffset)            // soffset
3981     .addImm(ImmOffset);         // offset(imm)
3982 
3983   if (IsTyped)
3984     MIB.addImm(Format);
3985 
3986   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3987      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3988      .addMemOperand(MMO);
3989 
3990   if (LoadDstReg != Dst) {
3991     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3992 
3993     // The result of an extending load was widened; truncate it back down.
3994     if (IsExtLoad)
3995       B.buildTrunc(Dst, LoadDstReg);
3996     else {
3997       // Repack to original 16-bit vector result
3998       // FIXME: G_TRUNC should work, but legalization currently fails
3999       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4000       SmallVector<Register, 4> Repack;
4001       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4002         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4003       B.buildMerge(Dst, Repack);
4004     }
4005   }
4006 
4007   MI.eraseFromParent();
4008   return true;
4009 }
4010 
4011 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4012                                                MachineIRBuilder &B,
4013                                                bool IsInc) const {
4014   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4015                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4016   B.buildInstr(Opc)
4017     .addDef(MI.getOperand(0).getReg())
4018     .addUse(MI.getOperand(2).getReg())
4019     .addUse(MI.getOperand(3).getReg())
4020     .cloneMemRefs(MI);
4021   MI.eraseFromParent();
4022   return true;
4023 }
4024 
4025 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4026   switch (IntrID) {
4027   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4028   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4029     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4030   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4031   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4032     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4033   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4034   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4035     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4036   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4037   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4038     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4039   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4040   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4041     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4042   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4043   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4044     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4045   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4046   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4047     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4048   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4049   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4050     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4051   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4052   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4053     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4054   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4055   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4056     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4057   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4058   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4059     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4060   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4061   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4062     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4063   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4064   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4065     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4066   case Intrinsic::amdgcn_buffer_atomic_fadd:
4067   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4068   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4069     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4070   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4071   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4072     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4073   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4074   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4075     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4076   default:
4077     llvm_unreachable("unhandled atomic opcode");
4078   }
4079 }
4080 
4081 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4082                                                MachineIRBuilder &B,
4083                                                Intrinsic::ID IID) const {
4084   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4085                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4086   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4087 
4088   Register Dst;
4089 
4090   int OpOffset = 0;
4091   if (HasReturn) {
4092     // A few FP atomics do not support return values.
4093     Dst = MI.getOperand(0).getReg();
4094   } else {
4095     OpOffset = -1;
4096   }
4097 
4098   Register VData = MI.getOperand(2 + OpOffset).getReg();
4099   Register CmpVal;
4100 
4101   if (IsCmpSwap) {
4102     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4103     ++OpOffset;
4104   }
4105 
4106   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4107   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4108 
4109   // The struct intrinsic variants add one additional operand over raw.
4110   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4111   Register VIndex;
4112   if (HasVIndex) {
4113     VIndex = MI.getOperand(4 + OpOffset).getReg();
4114     ++OpOffset;
4115   } else {
4116     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4117   }
4118 
4119   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4120   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4121   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4122 
4123   MachineMemOperand *MMO = *MI.memoperands_begin();
4124 
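  // Split the total voffset into a variable register part and an immediate
  // part that the instruction encoding can hold directly.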
4125   unsigned ImmOffset;
4126   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4127   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4128 
4129   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4130 
4131   if (HasReturn)
4132     MIB.addDef(Dst);
4133 
4134   MIB.addUse(VData); // vdata
4135 
4136   if (IsCmpSwap)
4137     MIB.addReg(CmpVal);
4138 
4139   MIB.addUse(RSrc)               // rsrc
4140      .addUse(VIndex)             // vindex
4141      .addUse(VOffset)            // voffset
4142      .addUse(SOffset)            // soffset
4143      .addImm(ImmOffset)          // offset(imm)
4144      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4145      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4146      .addMemOperand(MMO);
4147 
4148   MI.eraseFromParent();
4149   return true;
4150 }
4151 
4152 /// Turn the s16 typed address operands of \p MI into dword sized vectors with
4153 /// s16 typed elements, appending the packed values to \p PackedAddrs.
4154 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4155                                       SmallVectorImpl<Register> &PackedAddrs,
4156                                       unsigned ArgOffset,
4157                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4158                                       bool IsA16, bool IsG16) {
4159   const LLT S16 = LLT::scalar(16);
4160   const LLT V2S16 = LLT::fixed_vector(2, 16);
4161   auto EndIdx = Intr->VAddrEnd;
4162 
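  // Walk all vaddr operands: components that stay 32-bit are bitcast to
  // <2 x s16>, while 16-bit components are packed pairwise into <2 x s16>.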
4163   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4164     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4165     if (!SrcOp.isReg())
4166       continue; // _L to _LZ may have eliminated this.
4167 
4168     Register AddrReg = SrcOp.getReg();
4169 
4170     if ((I < Intr->GradientStart) ||
4171         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4172         (I >= Intr->CoordStart && !IsA16)) {
4173       // Handle any gradient or coordinate operands that should not be packed
4174       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4175       PackedAddrs.push_back(AddrReg);
4176     } else {
4177       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4178       // derivatives dx/dh and dx/dv are packed with undef.
4179       if (((I + 1) >= EndIdx) ||
4180           ((Intr->NumGradients / 2) % 2 == 1 &&
4181            (I == static_cast<unsigned>(Intr->GradientStart +
4182                                        (Intr->NumGradients / 2) - 1) ||
4183             I == static_cast<unsigned>(Intr->GradientStart +
4184                                        Intr->NumGradients - 1))) ||
4185           // Check for _L to _LZ optimization
4186           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4187         PackedAddrs.push_back(
4188             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4189                 .getReg(0));
4190       } else {
4191         PackedAddrs.push_back(
4192             B.buildBuildVector(
4193                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4194                 .getReg(0));
4195         ++I;
4196       }
4197     }
4198   }
4199 }
4200 
4201 /// Convert from separate vaddr components to a single vector address register,
4202 /// and replace the remaining operands with $noreg.
4203 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4204                                      int DimIdx, int NumVAddrs) {
4205   const LLT S32 = LLT::scalar(32);
4206 
4207   SmallVector<Register, 8> AddrRegs;
4208   for (int I = 0; I != NumVAddrs; ++I) {
4209     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4210     if (SrcOp.isReg()) {
4211       AddrRegs.push_back(SrcOp.getReg());
4212       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4213     }
4214   }
4215 
4216   int NumAddrRegs = AddrRegs.size();
4217   if (NumAddrRegs != 1) {
4218     // Above 8 elements round up to next power of 2 (i.e. 16).
4219     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4220       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4221       auto Undef = B.buildUndef(S32);
4222       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4223       NumAddrRegs = RoundedNumRegs;
4224     }
4225 
4226     auto VAddr =
4227         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4228     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4229   }
4230 
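  // The remaining address operands are now covered by the vector built above;
  // clear them so the selector sees $noreg.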
4231   for (int I = 1; I != NumVAddrs; ++I) {
4232     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4233     if (SrcOp.isReg())
4234       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4235   }
4236 }
4237 
4238 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4239 ///
4240 /// Depending on the subtarget, load/store with 16-bit element data need to be
4241 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4242 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4243 /// registers.
4244 ///
4245 /// We don't want to directly select image instructions just yet, but also want
4246 /// to expose all register repacking to the legalizer/combiners. We also don't
4247 /// want a selected instruction entering RegBankSelect. In order to avoid
4248 /// defining a multitude of intermediate image instructions, directly hack on
4249 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4250 /// now unnecessary arguments with $noreg.
4251 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4252     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4253     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4254 
4255   const unsigned NumDefs = MI.getNumExplicitDefs();
4256   const unsigned ArgOffset = NumDefs + 1;
4257   bool IsTFE = NumDefs == 2;
4258   // We are only processing the operands of d16 image operations on subtargets
4259   // that use the unpacked register layout, or need to repack the TFE result.
4260 
4261   // TODO: Do we need to guard against already legalized intrinsics?
4262   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4263       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4264 
4265   MachineRegisterInfo *MRI = B.getMRI();
4266   const LLT S32 = LLT::scalar(32);
4267   const LLT S16 = LLT::scalar(16);
4268   const LLT V2S16 = LLT::fixed_vector(2, 16);
4269 
4270   unsigned DMask = 0;
4271 
4272   // Check for 16-bit addresses and gradients, which will need to be packed.
4273   LLT GradTy =
4274       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4275   LLT AddrTy =
4276       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4277   const bool IsG16 = GradTy == S16;
4278   const bool IsA16 = AddrTy == S16;
4279 
4280   int DMaskLanes = 0;
4281   if (!BaseOpcode->Atomic) {
4282     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4283     if (BaseOpcode->Gather4) {
4284       DMaskLanes = 4;
4285     } else if (DMask != 0) {
4286       DMaskLanes = countPopulation(DMask);
4287     } else if (!IsTFE && !BaseOpcode->Store) {
4288       // If dmask is 0, this is a no-op load. This can be eliminated.
4289       B.buildUndef(MI.getOperand(0));
4290       MI.eraseFromParent();
4291       return true;
4292     }
4293   }
4294 
4295   Observer.changingInstr(MI);
4296   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4297 
4298   unsigned NewOpcode = NumDefs == 0 ?
4299     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4300 
4301   // Track that we legalized this
4302   MI.setDesc(B.getTII().get(NewOpcode));
4303 
4304   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4305   // dmask to be at least 1, otherwise the instruction will fail.
4306   if (IsTFE && DMask == 0) {
4307     DMask = 0x1;
4308     DMaskLanes = 1;
4309     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4310   }
4311 
4312   if (BaseOpcode->Atomic) {
4313     Register VData0 = MI.getOperand(2).getReg();
4314     LLT Ty = MRI->getType(VData0);
4315 
4316     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4317     if (Ty.isVector())
4318       return false;
4319 
4320     if (BaseOpcode->AtomicX2) {
4321       Register VData1 = MI.getOperand(3).getReg();
4322       // The two values are packed in one register.
4323       LLT PackedTy = LLT::fixed_vector(2, Ty);
4324       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4325       MI.getOperand(2).setReg(Concat.getReg(0));
4326       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4327     }
4328   }
4329 
4330   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4331 
4332   // Optimize _L to _LZ when the LOD is a constant zero (or negative).
4333   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4334           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4335     const ConstantFP *ConstantLod;
4336 
4337     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4338                  m_GFCst(ConstantLod))) {
4339       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4340         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4341         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4342             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4343                                                       Intr->Dim);
4344 
4345         // The starting indexes should remain in the same place.
4346         --CorrectedNumVAddrs;
4347 
4348         MI.getOperand(MI.getNumExplicitDefs())
4349             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4350         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4351         Intr = NewImageDimIntr;
4352       }
4353     }
4354   }
4355 
4356   // Optimize _mip away when 'lod' is a constant zero.
4357   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4358     int64_t ConstantLod;
4359     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4360                  m_ICst(ConstantLod))) {
4361       if (ConstantLod == 0) {
4362         // TODO: Change intrinsic opcode and remove the operand instead of
4363         // replacing it with 0, as is done for the _L to _LZ handling above.
4364         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4365         --CorrectedNumVAddrs;
4366       }
4367     }
4368   }
4369 
4370   // Rewrite the addressing register layout before doing anything else.
4371   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4372     // 16 bit gradients are supported, but are tied to the A16 control
4373     // so both gradients and addresses must be 16 bit
4374     return false;
4375   }
4376 
4377   if (IsA16 && !ST.hasA16()) {
4378     // A16 not supported
4379     return false;
4380   }
4381 
4382   if (IsA16 || IsG16) {
4383     if (Intr->NumVAddrs > 1) {
4384       SmallVector<Register, 4> PackedRegs;
4385 
4386       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4387                                 IsG16);
4388 
4389       // See also below in the non-a16 branch
4390       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4391                           PackedRegs.size() <= ST.getNSAMaxSize();
4392 
4393       if (!UseNSA && PackedRegs.size() > 1) {
4394         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4395         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4396         PackedRegs[0] = Concat.getReg(0);
4397         PackedRegs.resize(1);
4398       }
4399 
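      // Rewrite the vaddr operands in place: the first NumPacked operands take
      // the packed registers, and any remaining ones are cleared to $noreg.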
4400       const unsigned NumPacked = PackedRegs.size();
4401       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4402         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4403         if (!SrcOp.isReg()) {
4404           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4405           continue;
4406         }
4407 
4408         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4409 
4410         if (I - Intr->VAddrStart < NumPacked)
4411           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4412         else
4413           SrcOp.setReg(AMDGPU::NoRegister);
4414       }
4415     }
4416   } else {
4417     // If the register allocator cannot place the address registers contiguously
4418     // without introducing moves, then using the non-sequential address encoding
4419     // is always preferable, since it saves VALU instructions and is usually a
4420     // wash in terms of code size or even better.
4421     //
4422     // However, we currently have no way of hinting to the register allocator
4423     // that MIMG addresses should be placed contiguously when it is possible to
4424     // do so, so force non-NSA for the common 2-address case as a heuristic.
4425     //
4426     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4427     // allocation when possible.
4428     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4429                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4430 
4431     if (!UseNSA && Intr->NumVAddrs > 1)
4432       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4433                                Intr->NumVAddrs);
4434   }
4435 
4436   int Flags = 0;
4437   if (IsA16)
4438     Flags |= 1;
4439   if (IsG16)
4440     Flags |= 2;
4441   MI.addOperand(MachineOperand::CreateImm(Flags));
4442 
4443   if (BaseOpcode->Store) { // No TFE for stores?
4444     // TODO: Handle dmask trim
4445     Register VData = MI.getOperand(1).getReg();
4446     LLT Ty = MRI->getType(VData);
4447     if (!Ty.isVector() || Ty.getElementType() != S16)
4448       return true;
4449 
4450     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4451     if (RepackedReg != VData) {
4452       MI.getOperand(1).setReg(RepackedReg);
4453     }
4454 
4455     return true;
4456   }
4457 
4458   Register DstReg = MI.getOperand(0).getReg();
4459   LLT Ty = MRI->getType(DstReg);
4460   const LLT EltTy = Ty.getScalarType();
4461   const bool IsD16 = Ty.getScalarType() == S16;
4462   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4463 
4464   // Confirm that the return type is large enough for the dmask specified
4465   if (NumElts < DMaskLanes)
4466     return false;
4467 
4468   if (NumElts > 4 || DMaskLanes > 4)
4469     return false;
4470 
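  // Only DMaskLanes components are actually written by the instruction; a
  // dmask of 0 still produces a single component.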
4471   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4472   const LLT AdjustedTy =
4473       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4474 
4475   // The raw dword aligned data component of the load. The only legal cases
4476   // where this matters should be when using the packed D16 format, for
4477   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4478   LLT RoundedTy;
4479 
4480   // S32 vector to cover all data, plus TFE result element.
4481   LLT TFETy;
4482 
4483   // Register type to use for each loaded component. Will be S32 or V2S16.
4484   LLT RegTy;
4485 
4486   if (IsD16 && ST.hasUnpackedD16VMem()) {
4487     RoundedTy =
4488         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4489     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4490     RegTy = S32;
4491   } else {
4492     unsigned EltSize = EltTy.getSizeInBits();
4493     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4494     unsigned RoundedSize = 32 * RoundedElts;
4495     RoundedTy = LLT::scalarOrVector(
4496         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4497     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4498     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4499   }
4500 
4501   // The return type does not need adjustment.
4502   // TODO: Should we change s16 case to s32 or <2 x s16>?
4503   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4504     return true;
4505 
4506   Register Dst1Reg;
4507 
4508   // Insert after the instruction.
4509   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4510 
4511   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4512   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4513   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4514   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4515 
4516   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4517 
4518   MI.getOperand(0).setReg(NewResultReg);
4519 
4520   // In the IR, TFE is supposed to be used with a 2 element struct return
4521   // type. The instruction really returns these two values in one contiguous
4522   // register, with one additional dword beyond the loaded data. Rewrite the
4523   // return type to use a single register result.
4524 
4525   if (IsTFE) {
4526     Dst1Reg = MI.getOperand(1).getReg();
4527     if (MRI->getType(Dst1Reg) != S32)
4528       return false;
4529 
4530     // TODO: Make sure the TFE operand bit is set.
4531     MI.RemoveOperand(1);
4532 
4533     // Handle the easy case that requires no repack instructions.
4534     if (Ty == S32) {
4535       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4536       return true;
4537     }
4538   }
4539 
4540   // Now figure out how to copy the new result register back into the old
4541   // result.
4542   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4543 
4544   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4545 
4546   if (ResultNumRegs == 1) {
4547     assert(!IsTFE);
4548     ResultRegs[0] = NewResultReg;
4549   } else {
4550     // We have to repack into a new vector of some kind.
4551     for (int I = 0; I != NumDataRegs; ++I)
4552       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4553     B.buildUnmerge(ResultRegs, NewResultReg);
4554 
4555     // Drop the final TFE element to get the data part. The TFE result is
4556     // directly written to the right place already.
4557     if (IsTFE)
4558       ResultRegs.resize(NumDataRegs);
4559   }
4560 
4561   // For an s16 scalar result, we form an s32 result with a truncate regardless
4562   // of packed vs. unpacked.
4563   if (IsD16 && !Ty.isVector()) {
4564     B.buildTrunc(DstReg, ResultRegs[0]);
4565     return true;
4566   }
4567 
4568   // Avoid a build/concat_vector of 1 entry.
4569   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4570     B.buildBitcast(DstReg, ResultRegs[0]);
4571     return true;
4572   }
4573 
4574   assert(Ty.isVector());
4575 
4576   if (IsD16) {
4577     // For packed D16 results with TFE enabled, all the data components are
4578     // S32. Cast back to the expected type.
4579     //
4580     // TODO: We don't really need to load s32 elements. We would only need one
4581     // cast for the TFE result if a multiple of v2s16 was used.
4582     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4583       for (Register &Reg : ResultRegs)
4584         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4585     } else if (ST.hasUnpackedD16VMem()) {
4586       for (Register &Reg : ResultRegs)
4587         Reg = B.buildTrunc(S16, Reg).getReg(0);
4588     }
4589   }
4590 
4591   auto padWithUndef = [&](LLT Ty, int NumElts) {
4592     if (NumElts == 0)
4593       return;
4594     Register Undef = B.buildUndef(Ty).getReg(0);
4595     for (int I = 0; I != NumElts; ++I)
4596       ResultRegs.push_back(Undef);
4597   };
4598 
4599   // Pad out any elements eliminated due to the dmask.
4600   LLT ResTy = MRI->getType(ResultRegs[0]);
4601   if (!ResTy.isVector()) {
4602     padWithUndef(ResTy, NumElts - ResultRegs.size());
4603     B.buildBuildVector(DstReg, ResultRegs);
4604     return true;
4605   }
4606 
4607   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4608   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4609 
4610   // Deal with the one annoying legal case.
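  // <3 x s16> occupies one and a half dwords, so build a <6 x s16> from three
  // <2 x s16> pieces and split off the unused upper half.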
4611   const LLT V3S16 = LLT::fixed_vector(3, 16);
4612   if (Ty == V3S16) {
4613     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4614     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4615     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4616     return true;
4617   }
4618 
4619   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4620   B.buildConcatVectors(DstReg, ResultRegs);
4621   return true;
4622 }
4623 
4624 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4625   LegalizerHelper &Helper, MachineInstr &MI) const {
4626   MachineIRBuilder &B = Helper.MIRBuilder;
4627   GISelChangeObserver &Observer = Helper.Observer;
4628 
4629   Register Dst = MI.getOperand(0).getReg();
4630   LLT Ty = B.getMRI()->getType(Dst);
4631   unsigned Size = Ty.getSizeInBits();
4632   MachineFunction &MF = B.getMF();
4633 
4634   Observer.changingInstr(MI);
4635 
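  // If the result type is better handled as an equivalent 32-bit based type,
  // rewrite the destination before forming the buffer load.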
4636   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4637     Ty = getBitcastRegisterType(Ty);
4638     Helper.bitcastDst(MI, Ty, 0);
4639     Dst = MI.getOperand(0).getReg();
4640     B.setInsertPt(B.getMBB(), MI);
4641   }
4642 
4643   // FIXME: We don't really need this intermediate instruction. The intrinsic
4644   // should be fixed to have a memory operand. Since it's readnone, we're not
4645   // allowed to add one.
4646   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4647   MI.RemoveOperand(1); // Remove intrinsic ID
4648 
4649   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4650   // TODO: Should this use datalayout alignment?
4651   const unsigned MemSize = (Size + 7) / 8;
4652   const Align MemAlign(4);
4653   MachineMemOperand *MMO = MF.getMachineMemOperand(
4654       MachinePointerInfo(),
4655       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4656           MachineMemOperand::MOInvariant,
4657       MemSize, MemAlign);
4658   MI.addMemOperand(MF, MMO);
4659 
4660   // There are no 96-bit result scalar loads, but widening to 128-bit should
4661   // always be legal. We may need to restore this to a 96-bit result if it turns
4662   // out this needs to be converted to a vector load during RegBankSelect.
4663   if (!isPowerOf2_32(Size)) {
4664     if (Ty.isVector())
4665       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4666     else
4667       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4668   }
4669 
4670   Observer.changedInstr(MI);
4671   return true;
4672 }
4673 
4674 // TODO: Move to selection
4675 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4676                                                 MachineRegisterInfo &MRI,
4677                                                 MachineIRBuilder &B) const {
4678   if (!ST.isTrapHandlerEnabled() ||
4679       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4680     return legalizeTrapEndpgm(MI, MRI, B);
4681 
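  // Select the trap lowering based on the HSA code object ABI version.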
4682   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4683     switch (*HsaAbiVer) {
4684     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4685     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4686       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4687     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4688       return ST.supportsGetDoorbellID() ?
4689           legalizeTrapHsa(MI, MRI, B) :
4690           legalizeTrapHsaQueuePtr(MI, MRI, B);
4691     }
4692   }
4693 
4694   llvm_unreachable("Unknown trap handler");
4695 }
4696 
4697 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4698     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4699   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4700   MI.eraseFromParent();
4701   return true;
4702 }
4703 
4704 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4705     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4706   // Pass queue pointer to trap handler as input, and insert trap instruction
4707   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4708   Register LiveIn =
4709     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4710   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4711     return false;
4712 
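  // The trap handler expects the queue pointer in SGPR0_SGPR1.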
4713   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4714   B.buildCopy(SGPR01, LiveIn);
4715   B.buildInstr(AMDGPU::S_TRAP)
4716       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4717       .addReg(SGPR01, RegState::Implicit);
4718 
4719   MI.eraseFromParent();
4720   return true;
4721 }
4722 
4723 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4724     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4725   B.buildInstr(AMDGPU::S_TRAP)
4726       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4727   MI.eraseFromParent();
4728   return true;
4729 }
4730 
4731 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4732     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4733   // If this is a non-HSA path or the trap handler is disabled, report a
4734   // warning accordingly.
4735   if (!ST.isTrapHandlerEnabled() ||
4736       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4737     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4738                                      "debugtrap handler not supported",
4739                                      MI.getDebugLoc(), DS_Warning);
4740     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4741     Ctx.diagnose(NoTrap);
4742   } else {
4743     // Insert debug-trap instruction
4744     B.buildInstr(AMDGPU::S_TRAP)
4745         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4746   }
4747 
4748   MI.eraseFromParent();
4749   return true;
4750 }
4751 
4752 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4753                                                MachineIRBuilder &B) const {
4754   MachineRegisterInfo &MRI = *B.getMRI();
4755   const LLT S16 = LLT::scalar(16);
4756   const LLT S32 = LLT::scalar(32);
4757 
4758   Register DstReg = MI.getOperand(0).getReg();
4759   Register NodePtr = MI.getOperand(2).getReg();
4760   Register RayExtent = MI.getOperand(3).getReg();
4761   Register RayOrigin = MI.getOperand(4).getReg();
4762   Register RayDir = MI.getOperand(5).getReg();
4763   Register RayInvDir = MI.getOperand(6).getReg();
4764   Register TDescr = MI.getOperand(7).getReg();
4765 
4766   if (!ST.hasGFX10_AEncoding()) {
4767     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4768                                         "intrinsic not supported on subtarget",
4769                                         MI.getDebugLoc());
4770     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4771     return false;
4772   }
4773 
4774   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4775   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
4776   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4777                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4778                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4779                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4780 
4781   SmallVector<Register, 12> Ops;
4782   if (Is64) {
4783     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4784     Ops.push_back(Unmerge.getReg(0));
4785     Ops.push_back(Unmerge.getReg(1));
4786   } else {
4787     Ops.push_back(NodePtr);
4788   }
4789   Ops.push_back(RayExtent);
4790 
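  // Unpack a vector operand into 32-bit lanes, keeping only x/y/z; the fourth
  // lane is unused.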
4791   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4792     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4793     Ops.push_back(Unmerge.getReg(0));
4794     Ops.push_back(Unmerge.getReg(1));
4795     Ops.push_back(Unmerge.getReg(2));
4796   };
4797 
4798   packLanes(RayOrigin);
4799   if (IsA16) {
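    // With 16-bit components, the six ray_dir/ray_inv_dir values are packed
    // into three 32-bit registers.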
4800     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4801     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4802     Register R1 = MRI.createGenericVirtualRegister(S32);
4803     Register R2 = MRI.createGenericVirtualRegister(S32);
4804     Register R3 = MRI.createGenericVirtualRegister(S32);
4805     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4806     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4807     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4808     Ops.push_back(R1);
4809     Ops.push_back(R2);
4810     Ops.push_back(R3);
4811   } else {
4812     packLanes(RayDir);
4813     packLanes(RayInvDir);
4814   }
4815 
4816   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4817     .addDef(DstReg)
4818     .addImm(Opcode);
4819 
4820   for (Register R : Ops) {
4821     MIB.addUse(R);
4822   }
4823 
4824   MIB.addUse(TDescr)
4825      .addImm(IsA16 ? 1 : 0)
4826      .cloneMemRefs(MI);
4827 
4828   MI.eraseFromParent();
4829   return true;
4830 }
4831 
4832 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4833                                             MachineInstr &MI) const {
4834   MachineIRBuilder &B = Helper.MIRBuilder;
4835   MachineRegisterInfo &MRI = *B.getMRI();
4836 
4837   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4838   auto IntrID = MI.getIntrinsicID();
4839   switch (IntrID) {
4840   case Intrinsic::amdgcn_if:
4841   case Intrinsic::amdgcn_else: {
4842     MachineInstr *Br = nullptr;
4843     MachineBasicBlock *UncondBrTarget = nullptr;
4844     bool Negated = false;
4845     if (MachineInstr *BrCond =
4846             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4847       const SIRegisterInfo *TRI
4848         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4849 
4850       Register Def = MI.getOperand(1).getReg();
4851       Register Use = MI.getOperand(3).getReg();
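      // Def becomes the exec mask output of SI_IF/SI_ELSE; Use is the condition
      // (for amdgcn_if) or the saved mask (for amdgcn_else).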
4852 
4853       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4854 
4855       if (Negated)
4856         std::swap(CondBrTarget, UncondBrTarget);
4857 
4858       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4859       if (IntrID == Intrinsic::amdgcn_if) {
4860         B.buildInstr(AMDGPU::SI_IF)
4861           .addDef(Def)
4862           .addUse(Use)
4863           .addMBB(UncondBrTarget);
4864       } else {
4865         B.buildInstr(AMDGPU::SI_ELSE)
4866             .addDef(Def)
4867             .addUse(Use)
4868             .addMBB(UncondBrTarget);
4869       }
4870 
4871       if (Br) {
4872         Br->getOperand(0).setMBB(CondBrTarget);
4873       } else {
4874         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4875         // since we're swapping branch targets it needs to be reinserted.
4876         // FIXME: IRTranslator should probably not do this
4877         B.buildBr(*CondBrTarget);
4878       }
4879 
4880       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4881       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4882       MI.eraseFromParent();
4883       BrCond->eraseFromParent();
4884       return true;
4885     }
4886 
4887     return false;
4888   }
4889   case Intrinsic::amdgcn_loop: {
4890     MachineInstr *Br = nullptr;
4891     MachineBasicBlock *UncondBrTarget = nullptr;
4892     bool Negated = false;
4893     if (MachineInstr *BrCond =
4894             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4895       const SIRegisterInfo *TRI
4896         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4897 
4898       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4899       Register Reg = MI.getOperand(2).getReg();
4900 
4901       if (Negated)
4902         std::swap(CondBrTarget, UncondBrTarget);
4903 
4904       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4905       B.buildInstr(AMDGPU::SI_LOOP)
4906         .addUse(Reg)
4907         .addMBB(UncondBrTarget);
4908 
4909       if (Br)
4910         Br->getOperand(0).setMBB(CondBrTarget);
4911       else
4912         B.buildBr(*CondBrTarget);
4913 
4914       MI.eraseFromParent();
4915       BrCond->eraseFromParent();
4916       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4917       return true;
4918     }
4919 
4920     return false;
4921   }
4922   case Intrinsic::amdgcn_kernarg_segment_ptr:
4923     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4924       // This only makes sense to call in a kernel, so just lower to null.
4925       B.buildConstant(MI.getOperand(0).getReg(), 0);
4926       MI.eraseFromParent();
4927       return true;
4928     }
4929 
4930     return legalizePreloadedArgIntrin(
4931       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4932   case Intrinsic::amdgcn_implicitarg_ptr:
4933     return legalizeImplicitArgPtr(MI, MRI, B);
4934   case Intrinsic::amdgcn_workitem_id_x:
4935     return legalizePreloadedArgIntrin(MI, MRI, B,
4936                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4937   case Intrinsic::amdgcn_workitem_id_y:
4938     return legalizePreloadedArgIntrin(MI, MRI, B,
4939                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4940   case Intrinsic::amdgcn_workitem_id_z:
4941     return legalizePreloadedArgIntrin(MI, MRI, B,
4942                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4943   case Intrinsic::amdgcn_workgroup_id_x:
4944     return legalizePreloadedArgIntrin(MI, MRI, B,
4945                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4946   case Intrinsic::amdgcn_workgroup_id_y:
4947     return legalizePreloadedArgIntrin(MI, MRI, B,
4948                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4949   case Intrinsic::amdgcn_workgroup_id_z:
4950     return legalizePreloadedArgIntrin(MI, MRI, B,
4951                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4952   case Intrinsic::amdgcn_dispatch_ptr:
4953     return legalizePreloadedArgIntrin(MI, MRI, B,
4954                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4955   case Intrinsic::amdgcn_queue_ptr:
4956     return legalizePreloadedArgIntrin(MI, MRI, B,
4957                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4958   case Intrinsic::amdgcn_implicit_buffer_ptr:
4959     return legalizePreloadedArgIntrin(
4960       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4961   case Intrinsic::amdgcn_dispatch_id:
4962     return legalizePreloadedArgIntrin(MI, MRI, B,
4963                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4964   case Intrinsic::amdgcn_fdiv_fast:
4965     return legalizeFDIVFastIntrin(MI, MRI, B);
4966   case Intrinsic::amdgcn_is_shared:
4967     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4968   case Intrinsic::amdgcn_is_private:
4969     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4970   case Intrinsic::amdgcn_wavefrontsize: {
4971     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4972     MI.eraseFromParent();
4973     return true;
4974   }
4975   case Intrinsic::amdgcn_s_buffer_load:
4976     return legalizeSBufferLoad(Helper, MI);
4977   case Intrinsic::amdgcn_raw_buffer_store:
4978   case Intrinsic::amdgcn_struct_buffer_store:
4979     return legalizeBufferStore(MI, MRI, B, false, false);
4980   case Intrinsic::amdgcn_raw_buffer_store_format:
4981   case Intrinsic::amdgcn_struct_buffer_store_format:
4982     return legalizeBufferStore(MI, MRI, B, false, true);
4983   case Intrinsic::amdgcn_raw_tbuffer_store:
4984   case Intrinsic::amdgcn_struct_tbuffer_store:
4985     return legalizeBufferStore(MI, MRI, B, true, true);
4986   case Intrinsic::amdgcn_raw_buffer_load:
4987   case Intrinsic::amdgcn_struct_buffer_load:
4988     return legalizeBufferLoad(MI, MRI, B, false, false);
4989   case Intrinsic::amdgcn_raw_buffer_load_format:
4990   case Intrinsic::amdgcn_struct_buffer_load_format:
4991     return legalizeBufferLoad(MI, MRI, B, true, false);
4992   case Intrinsic::amdgcn_raw_tbuffer_load:
4993   case Intrinsic::amdgcn_struct_tbuffer_load:
4994     return legalizeBufferLoad(MI, MRI, B, true, true);
4995   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4996   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4997   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4998   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4999   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5000   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5001   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5002   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5003   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5004   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5005   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5006   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5007   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5008   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5009   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5010   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5011   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5012   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5013   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5014   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5015   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5016   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5017   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5018   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5019   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5020   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5021   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5022   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5023   case Intrinsic::amdgcn_buffer_atomic_fadd:
5024   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5025   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5026   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5027   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5028     return legalizeBufferAtomic(MI, B, IntrID);
5029   case Intrinsic::amdgcn_atomic_inc:
5030     return legalizeAtomicIncDec(MI, B, true);
5031   case Intrinsic::amdgcn_atomic_dec:
5032     return legalizeAtomicIncDec(MI, B, false);
5033   case Intrinsic::trap:
5034     return legalizeTrapIntrinsic(MI, MRI, B);
5035   case Intrinsic::debugtrap:
5036     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5037   case Intrinsic::amdgcn_rsq_clamp:
5038     return legalizeRsqClampIntrinsic(MI, MRI, B);
5039   case Intrinsic::amdgcn_ds_fadd:
5040   case Intrinsic::amdgcn_ds_fmin:
5041   case Intrinsic::amdgcn_ds_fmax:
5042     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5043   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5044     return legalizeBVHIntrinsic(MI, B);
5045   default: {
5046     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5047             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5048       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5049     return true;
5050   }
5051   }
5052 
5053   return true;
5054 }
5055