1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41 "amdgpu-global-isel-new-legality",
42 cl::desc("Use GlobalISel desired legality, rather than try to use "
43 "rules compatible with selection patterns"),
44 cl::init(false),
45 cl::ReallyHidden);
46
47 static constexpr unsigned MaxRegisterSize = 1024;
48
49 // Round the number of elements to the next power of two elements
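// e.g. <5 x s32> is rounded up to <8 x s32>.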
50 static LLT getPow2VectorType(LLT Ty) {
51 unsigned NElts = Ty.getNumElements();
52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55
56 // Round the number of bits to the next power of two bits
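// e.g. s48 is rounded up to s64, and s65 to s128.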
57 static LLT getPow2ScalarType(LLT Ty) {
58 unsigned Bits = Ty.getSizeInBits();
59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60 return LLT::scalar(Pow2Bits);
61 }
62
63 /// \returns true if this is an odd sized vector which should be widened by adding an
64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65 /// excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67 return [=](const LegalityQuery &Query) {
68 const LLT Ty = Query.Types[TypeIdx];
69 if (!Ty.isVector())
70 return false;
71
72 const LLT EltTy = Ty.getElementType();
73 const unsigned EltSize = EltTy.getSizeInBits();
74 return Ty.getNumElements() % 2 != 0 &&
75 EltSize > 1 && EltSize < 32 &&
76 Ty.getSizeInBits() % 32 != 0;
77 };
78 }
79
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81 return [=](const LegalityQuery &Query) {
82 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getSizeInBits() % 32 == 0;
84 };
85 }
86
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 const LLT EltTy = Ty.getScalarType();
91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92 };
93 }
94
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getElementType();
99 return std::pair(TypeIdx,
100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101 };
102 }
103
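// Reduce the number of elements so each resulting piece is roughly 64 bits
// wide, e.g. a <4 x s32> operation is broken into <2 x s32> pieces.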
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 unsigned Size = Ty.getSizeInBits();
109 unsigned Pieces = (Size + 63) / 64;
110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111 return std::pair(TypeIdx, LLT::scalarOrVector(
112 ElementCount::getFixed(NewNumElts), EltTy));
113 };
114 }
115
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
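// e.g. <3 x s8> (24 bits) is widened to <4 x s8> (32 bits), and <5 x s16>
// (80 bits) to <6 x s16> (96 bits).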
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
126
127 assert(EltSize < 32);
128
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131 };
132 }
133
134 static LLT getBitcastRegisterType(const LLT Ty) {
135 const unsigned Size = Ty.getSizeInBits();
136
137 if (Size <= 32) {
138 // <2 x s8> -> s16
139 // <4 x s8> -> s32
140 return LLT::scalar(Size);
141 }
142
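// e.g. <4 x s16> (64 bits) -> <2 x s32>, <6 x s16> (96 bits) -> <3 x s32>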
143 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
144 }
145
146 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147 return [=](const LegalityQuery &Query) {
148 const LLT Ty = Query.Types[TypeIdx];
149 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
150 };
151 }
152
153 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
154 return [=](const LegalityQuery &Query) {
155 const LLT Ty = Query.Types[TypeIdx];
156 unsigned Size = Ty.getSizeInBits();
157 assert(Size % 32 == 0);
158 return std::pair(
159 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
160 };
161 }
162
163 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
164 return [=](const LegalityQuery &Query) {
165 const LLT QueryTy = Query.Types[TypeIdx];
166 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
167 };
168 }
169
170 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
171 return [=](const LegalityQuery &Query) {
172 const LLT QueryTy = Query.Types[TypeIdx];
173 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
174 };
175 }
176
177 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
178 return [=](const LegalityQuery &Query) {
179 const LLT QueryTy = Query.Types[TypeIdx];
180 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
181 };
182 }
183
184 static bool isRegisterSize(unsigned Size) {
185 return Size % 32 == 0 && Size <= MaxRegisterSize;
186 }
187
188 static bool isRegisterVectorElementType(LLT EltTy) {
189 const int EltSize = EltTy.getSizeInBits();
190 return EltSize == 16 || EltSize % 32 == 0;
191 }
192
193 static bool isRegisterVectorType(LLT Ty) {
194 const int EltSize = Ty.getElementType().getSizeInBits();
195 return EltSize == 32 || EltSize == 64 ||
196 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
197 EltSize == 128 || EltSize == 256;
198 }
199
200 static bool isRegisterType(LLT Ty) {
201 if (!isRegisterSize(Ty.getSizeInBits()))
202 return false;
203
204 if (Ty.isVector())
205 return isRegisterVectorType(Ty);
206
207 return true;
208 }
209
210 // Any combination of 32 or 64-bit elements up to the maximum register size, and
211 // multiples of v2s16.
212 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
213 return [=](const LegalityQuery &Query) {
214 return isRegisterType(Query.Types[TypeIdx]);
215 };
216 }
217
218 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
219 return [=](const LegalityQuery &Query) {
220 const LLT QueryTy = Query.Types[TypeIdx];
221 if (!QueryTy.isVector())
222 return false;
223 const LLT EltTy = QueryTy.getElementType();
224 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
225 };
226 }
227
228 // If we have a truncating store or an extending load with a data size larger
229 // than 32-bits, we need to reduce to a 32-bit type.
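// e.g. a G_SEXTLOAD producing s64 from 32-bit memory matches this predicate
// and gets reduced to a 32-bit result type.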
230 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT Ty = Query.Types[TypeIdx];
233 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
234 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
235 };
236 }
237
238 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
239 // handle some operations by just promoting the register during
240 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
241 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
242 bool IsLoad) {
243 switch (AS) {
244 case AMDGPUAS::PRIVATE_ADDRESS:
245 // FIXME: Private element size.
246 return ST.enableFlatScratch() ? 128 : 32;
247 case AMDGPUAS::LOCAL_ADDRESS:
248 return ST.useDS128() ? 128 : 64;
249 case AMDGPUAS::GLOBAL_ADDRESS:
250 case AMDGPUAS::CONSTANT_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
252 // Treat constant and global as identical. SMRD loads are sometimes usable for
253 // global loads (ideally constant address space should be eliminated)
254 // depending on the context. Legality cannot be context dependent, but
255 // RegBankSelect can split the load as necessary depending on the pointer
256 // register bank/uniformity and if the memory is invariant or not written in a
257 // kernel.
258 return IsLoad ? 512 : 128;
259 default:
260 // Flat addresses may contextually need to be split to 32-bit parts if they
261 // may alias scratch depending on the subtarget.
262 return 128;
263 }
264 }
265
266 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
267 const LegalityQuery &Query) {
268 const LLT Ty = Query.Types[0];
269
270 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
271 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
272
273 unsigned RegSize = Ty.getSizeInBits();
274 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
275 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
276 unsigned AS = Query.Types[1].getAddressSpace();
277
278 // All of these need to be custom lowered to cast the pointer operand.
279 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
280 return false;
281
282 // Do not handle extending vector loads.
283 if (Ty.isVector() && MemSize != RegSize)
284 return false;
285
286 // TODO: We should be able to widen loads if the alignment is high enough, but
287 // we also need to modify the memory access size.
288 #if 0
289 // Accept widening loads based on alignment.
290 if (IsLoad && MemSize < Size)
291 MemSize = std::max(MemSize, Align);
292 #endif
293
294 // Only 1-byte and 2-byte to 32-bit extloads are valid.
295 if (MemSize != RegSize && RegSize != 32)
296 return false;
297
298 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
299 return false;
300
301 switch (MemSize) {
302 case 8:
303 case 16:
304 case 32:
305 case 64:
306 case 128:
307 break;
308 case 96:
309 if (!ST.hasDwordx3LoadStores())
310 return false;
311 break;
312 case 256:
313 case 512:
314 // These may contextually need to be broken down.
315 break;
316 default:
317 return false;
318 }
319
320 assert(RegSize >= MemSize);
321
322 if (AlignBits < MemSize) {
323 const SITargetLowering *TLI = ST.getTargetLowering();
324 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
325 Align(AlignBits / 8)))
326 return false;
327 }
328
329 return true;
330 }
331
332 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
333 // work around this. Eventually it should ignore the type for loads and only care
334 // about the size. Return true in cases where we will work around this for now by
335 // bitcasting.
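// e.g. an <8 x s16> load or store is currently handled as a <4 x s32> bitcast.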
336 static bool loadStoreBitcastWorkaround(const LLT Ty) {
337 if (EnableNewLegality)
338 return false;
339
340 const unsigned Size = Ty.getSizeInBits();
341 if (Size <= 64)
342 return false;
343 if (!Ty.isVector())
344 return true;
345
346 LLT EltTy = Ty.getElementType();
347 if (EltTy.isPointer())
348 return true;
349
350 unsigned EltSize = EltTy.getSizeInBits();
351 return EltSize != 32 && EltSize != 64;
352 }
353
354 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
355 const LLT Ty = Query.Types[0];
356 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
357 !loadStoreBitcastWorkaround(Ty);
358 }
359
360 /// Return true if a load or store of the type should be lowered with a bitcast
361 /// to a different type.
362 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
363 const LLT MemTy) {
364 const unsigned MemSizeInBits = MemTy.getSizeInBits();
365 const unsigned Size = Ty.getSizeInBits();
366 if (Size != MemSizeInBits)
367 return Size <= 32 && Ty.isVector();
368
369 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
370 return true;
371
372 // Don't try to handle bitcasting vector ext loads for now.
373 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
374 (Size <= 32 || isRegisterSize(Size)) &&
375 !isRegisterVectorElementType(Ty.getElementType());
376 }
377
378 /// Return true if we should legalize a load by widening an odd sized memory
379 /// access up to the alignment. Note that in this case the memory access itself
380 /// changes, not the size of the result register.
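// e.g. a 96-bit load with 128-bit alignment may be widened to a 128-bit load
// (when dwordx3 accesses are unavailable), since the bytes up to the alignment
// are known dereferenceable.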
381 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
382 uint64_t AlignInBits, unsigned AddrSpace,
383 unsigned Opcode) {
384 unsigned SizeInBits = MemoryTy.getSizeInBits();
385 // We don't want to widen cases that are naturally legal.
386 if (isPowerOf2_32(SizeInBits))
387 return false;
388
389 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
390 // end up widening these for a scalar load during RegBankSelect, since there
391 // aren't 96-bit scalar loads.
392 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
393 return false;
394
395 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
396 return false;
397
398 // A load is known dereferenceable up to the alignment, so it's legal to widen
399 // to it.
400 //
401 // TODO: Could check dereferenceable for less aligned cases.
402 unsigned RoundedSize = NextPowerOf2(SizeInBits);
403 if (AlignInBits < RoundedSize)
404 return false;
405
406 // Do not widen if it would introduce a slow unaligned load.
407 const SITargetLowering *TLI = ST.getTargetLowering();
408 unsigned Fast = 0;
409 return TLI->allowsMisalignedMemoryAccessesImpl(
410 RoundedSize, AddrSpace, Align(AlignInBits / 8),
411 MachineMemOperand::MOLoad, &Fast) &&
412 Fast;
413 }
414
415 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
416 unsigned Opcode) {
417 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
418 return false;
419
420 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
421 Query.MMODescrs[0].AlignInBits,
422 Query.Types[1].getAddressSpace(), Opcode);
423 }
424
425 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
426 const GCNTargetMachine &TM)
427 : ST(ST_) {
428 using namespace TargetOpcode;
429
430 auto GetAddrSpacePtr = [&TM](unsigned AS) {
431 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
432 };
433
434 const LLT S1 = LLT::scalar(1);
435 const LLT S8 = LLT::scalar(8);
436 const LLT S16 = LLT::scalar(16);
437 const LLT S32 = LLT::scalar(32);
438 const LLT S64 = LLT::scalar(64);
439 const LLT S128 = LLT::scalar(128);
440 const LLT S256 = LLT::scalar(256);
441 const LLT S512 = LLT::scalar(512);
442 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
443
444 const LLT V2S8 = LLT::fixed_vector(2, 8);
445 const LLT V2S16 = LLT::fixed_vector(2, 16);
446 const LLT V4S16 = LLT::fixed_vector(4, 16);
447
448 const LLT V2S32 = LLT::fixed_vector(2, 32);
449 const LLT V3S32 = LLT::fixed_vector(3, 32);
450 const LLT V4S32 = LLT::fixed_vector(4, 32);
451 const LLT V5S32 = LLT::fixed_vector(5, 32);
452 const LLT V6S32 = LLT::fixed_vector(6, 32);
453 const LLT V7S32 = LLT::fixed_vector(7, 32);
454 const LLT V8S32 = LLT::fixed_vector(8, 32);
455 const LLT V9S32 = LLT::fixed_vector(9, 32);
456 const LLT V10S32 = LLT::fixed_vector(10, 32);
457 const LLT V11S32 = LLT::fixed_vector(11, 32);
458 const LLT V12S32 = LLT::fixed_vector(12, 32);
459 const LLT V13S32 = LLT::fixed_vector(13, 32);
460 const LLT V14S32 = LLT::fixed_vector(14, 32);
461 const LLT V15S32 = LLT::fixed_vector(15, 32);
462 const LLT V16S32 = LLT::fixed_vector(16, 32);
463 const LLT V32S32 = LLT::fixed_vector(32, 32);
464
465 const LLT V2S64 = LLT::fixed_vector(2, 64);
466 const LLT V3S64 = LLT::fixed_vector(3, 64);
467 const LLT V4S64 = LLT::fixed_vector(4, 64);
468 const LLT V5S64 = LLT::fixed_vector(5, 64);
469 const LLT V6S64 = LLT::fixed_vector(6, 64);
470 const LLT V7S64 = LLT::fixed_vector(7, 64);
471 const LLT V8S64 = LLT::fixed_vector(8, 64);
472 const LLT V16S64 = LLT::fixed_vector(16, 64);
473
474 std::initializer_list<LLT> AllS32Vectors =
475 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
476 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
477 std::initializer_list<LLT> AllS64Vectors =
478 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
479
480 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
481 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
482 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
483 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
484 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
485 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
486 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
487
488 const LLT CodePtr = FlatPtr;
489
490 const std::initializer_list<LLT> AddrSpaces64 = {
491 GlobalPtr, ConstantPtr, FlatPtr
492 };
493
494 const std::initializer_list<LLT> AddrSpaces32 = {
495 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
496 };
497
498 const std::initializer_list<LLT> FPTypesBase = {
499 S32, S64
500 };
501
502 const std::initializer_list<LLT> FPTypes16 = {
503 S32, S64, S16
504 };
505
506 const std::initializer_list<LLT> FPTypesPK16 = {
507 S32, S64, S16, V2S16
508 };
509
510 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
511
512 // s1 for VCC branches, s32 for SCC branches.
513 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
514
515 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
516 // elements for v3s16
517 getActionDefinitionsBuilder(G_PHI)
518 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
519 .legalFor(AllS32Vectors)
520 .legalFor(AllS64Vectors)
521 .legalFor(AddrSpaces64)
522 .legalFor(AddrSpaces32)
523 .legalIf(isPointer(0))
524 .clampScalar(0, S16, S256)
525 .widenScalarToNextPow2(0, 32)
526 .clampMaxNumElements(0, S32, 16)
527 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
528 .scalarize(0);
529
530 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
531 // Full set of gfx9 features.
532 getActionDefinitionsBuilder({G_ADD, G_SUB})
533 .legalFor({S32, S16, V2S16})
534 .clampMaxNumElementsStrict(0, S16, 2)
535 .scalarize(0)
536 .minScalar(0, S16)
537 .widenScalarToNextMultipleOf(0, 32)
538 .maxScalar(0, S32);
539
540 getActionDefinitionsBuilder(G_MUL)
541 .legalFor({S32, S16, V2S16})
542 .clampMaxNumElementsStrict(0, S16, 2)
543 .scalarize(0)
544 .minScalar(0, S16)
545 .widenScalarToNextMultipleOf(0, 32)
546 .custom();
547 assert(ST.hasMad64_32());
548
549 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
550 .legalFor({S32, S16, V2S16}) // Clamp modifier
551 .minScalarOrElt(0, S16)
552 .clampMaxNumElementsStrict(0, S16, 2)
553 .scalarize(0)
554 .widenScalarToNextPow2(0, 32)
555 .lower();
556 } else if (ST.has16BitInsts()) {
557 getActionDefinitionsBuilder({G_ADD, G_SUB})
558 .legalFor({S32, S16})
559 .minScalar(0, S16)
560 .widenScalarToNextMultipleOf(0, 32)
561 .maxScalar(0, S32)
562 .scalarize(0);
563
564 getActionDefinitionsBuilder(G_MUL)
565 .legalFor({S32, S16})
566 .scalarize(0)
567 .minScalar(0, S16)
568 .widenScalarToNextMultipleOf(0, 32)
569 .custom();
570 assert(ST.hasMad64_32());
571
572 // Technically the saturating operations require clamp bit support, but this
573 // was introduced at the same time as 16-bit operations.
574 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
575 .legalFor({S32, S16}) // Clamp modifier
576 .minScalar(0, S16)
577 .scalarize(0)
578 .widenScalarToNextPow2(0, 16)
579 .lower();
580
581 // We're just lowering this, but it helps get a better result to try to
582 // coerce to the desired type first.
583 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
584 .minScalar(0, S16)
585 .scalarize(0)
586 .lower();
587 } else {
588 getActionDefinitionsBuilder({G_ADD, G_SUB})
589 .legalFor({S32})
590 .widenScalarToNextMultipleOf(0, 32)
591 .clampScalar(0, S32, S32)
592 .scalarize(0);
593
594 auto &Mul = getActionDefinitionsBuilder(G_MUL)
595 .legalFor({S32})
596 .scalarize(0)
597 .minScalar(0, S32)
598 .widenScalarToNextMultipleOf(0, 32);
599
600 if (ST.hasMad64_32())
601 Mul.custom();
602 else
603 Mul.maxScalar(0, S32);
604
605 if (ST.hasIntClamp()) {
606 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
607 .legalFor({S32}) // Clamp modifier.
608 .scalarize(0)
609 .minScalarOrElt(0, S32)
610 .lower();
611 } else {
612 // Clamp bit support was added in VI, along with 16-bit operations.
613 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
614 .minScalar(0, S32)
615 .scalarize(0)
616 .lower();
617 }
618
619 // FIXME: DAG expansion gets better results. The widening uses the smaller
620 // range values and goes for the min/max lowering directly.
621 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
622 .minScalar(0, S32)
623 .scalarize(0)
624 .lower();
625 }
626
627 getActionDefinitionsBuilder(
628 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
629 .customFor({S32, S64})
630 .clampScalar(0, S32, S64)
631 .widenScalarToNextPow2(0, 32)
632 .scalarize(0);
633
634 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
635 .legalFor({S32})
636 .maxScalar(0, S32);
637
638 if (ST.hasVOP3PInsts()) {
639 Mulh
640 .clampMaxNumElements(0, S8, 2)
641 .lowerFor({V2S8});
642 }
643
644 Mulh
645 .scalarize(0)
646 .lower();
647
648 // Report legal for any types we can handle anywhere. For the cases only legal
649 // on the SALU, RegBankSelect will be able to re-legalize.
650 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
651 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
652 .clampScalar(0, S32, S64)
653 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
654 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
655 .widenScalarToNextPow2(0)
656 .scalarize(0);
657
658 getActionDefinitionsBuilder(
659 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
660 .legalFor({{S32, S1}, {S32, S32}})
661 .clampScalar(0, S32, S32)
662 .scalarize(0);
663
664 getActionDefinitionsBuilder(G_BITCAST)
665 // Don't worry about the size constraint.
666 .legalIf(all(isRegisterType(0), isRegisterType(1)))
667 .lower();
668
669
670 getActionDefinitionsBuilder(G_CONSTANT)
671 .legalFor({S1, S32, S64, S16, GlobalPtr,
672 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
673 .legalIf(isPointer(0))
674 .clampScalar(0, S32, S64)
675 .widenScalarToNextPow2(0);
676
677 getActionDefinitionsBuilder(G_FCONSTANT)
678 .legalFor({S32, S64, S16})
679 .clampScalar(0, S16, S64);
680
681 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
682 .legalIf(isRegisterType(0))
683 // s1 and s16 are special cases because they have legal operations on
684 // them, but don't really occupy registers in the normal way.
685 .legalFor({S1, S16})
686 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
687 .clampScalarOrElt(0, S32, MaxScalar)
688 .widenScalarToNextPow2(0, 32)
689 .clampMaxNumElements(0, S32, 16);
690
691 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
692
693 // If the amount is divergent, we have to do a wave reduction to get the
694 // maximum value, so this is expanded during RegBankSelect.
695 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
696 .legalFor({{PrivatePtr, S32}});
697
698 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
699 .customIf(typeIsNot(0, PrivatePtr));
700
701 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
702
703 auto &FPOpActions = getActionDefinitionsBuilder(
704 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
705 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
706 .legalFor({S32, S64});
707 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
708 .customFor({S32, S64});
709 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
710 .customFor({S32, S64});
711
712 if (ST.has16BitInsts()) {
713 if (ST.hasVOP3PInsts())
714 FPOpActions.legalFor({S16, V2S16});
715 else
716 FPOpActions.legalFor({S16});
717
718 TrigActions.customFor({S16});
719 FDIVActions.customFor({S16});
720 }
721
722 auto &MinNumMaxNum = getActionDefinitionsBuilder({
723 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
724
725 if (ST.hasVOP3PInsts()) {
726 MinNumMaxNum.customFor(FPTypesPK16)
727 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
728 .clampMaxNumElements(0, S16, 2)
729 .clampScalar(0, S16, S64)
730 .scalarize(0);
731 } else if (ST.has16BitInsts()) {
732 MinNumMaxNum.customFor(FPTypes16)
733 .clampScalar(0, S16, S64)
734 .scalarize(0);
735 } else {
736 MinNumMaxNum.customFor(FPTypesBase)
737 .clampScalar(0, S32, S64)
738 .scalarize(0);
739 }
740
741 if (ST.hasVOP3PInsts())
742 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
743
744 FPOpActions
745 .scalarize(0)
746 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
747
748 TrigActions
749 .scalarize(0)
750 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
751
752 FDIVActions
753 .scalarize(0)
754 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
755
756 getActionDefinitionsBuilder({G_FNEG, G_FABS})
757 .legalFor(FPTypesPK16)
758 .clampMaxNumElementsStrict(0, S16, 2)
759 .scalarize(0)
760 .clampScalar(0, S16, S64);
761
762 if (ST.has16BitInsts()) {
763 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
764 .legalFor({S32, S64, S16})
765 .scalarize(0)
766 .clampScalar(0, S16, S64);
767 } else {
768 getActionDefinitionsBuilder(G_FSQRT)
769 .legalFor({S32, S64})
770 .scalarize(0)
771 .clampScalar(0, S32, S64);
772
773 if (ST.hasFractBug()) {
774 getActionDefinitionsBuilder(G_FFLOOR)
775 .customFor({S64})
776 .legalFor({S32, S64})
777 .scalarize(0)
778 .clampScalar(0, S32, S64);
779 } else {
780 getActionDefinitionsBuilder(G_FFLOOR)
781 .legalFor({S32, S64})
782 .scalarize(0)
783 .clampScalar(0, S32, S64);
784 }
785 }
786
787 getActionDefinitionsBuilder(G_FPTRUNC)
788 .legalFor({{S32, S64}, {S16, S32}})
789 .scalarize(0)
790 .lower();
791
792 getActionDefinitionsBuilder(G_FPEXT)
793 .legalFor({{S64, S32}, {S32, S16}})
794 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
795 .scalarize(0);
796
797 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
798 if (ST.has16BitInsts()) {
799 FSubActions
800 // Use actual fsub instruction
801 .legalFor({S32, S16})
802 // Must use fadd + fneg
803 .lowerFor({S64, V2S16});
804 } else {
805 FSubActions
806 // Use actual fsub instruction
807 .legalFor({S32})
808 // Must use fadd + fneg
809 .lowerFor({S64, S16, V2S16});
810 }
811
812 FSubActions
813 .scalarize(0)
814 .clampScalar(0, S32, S64);
815
816 // Whether this is legal depends on the floating point mode for the function.
817 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
818 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
819 FMad.customFor({S32, S16});
820 else if (ST.hasMadMacF32Insts())
821 FMad.customFor({S32});
822 else if (ST.hasMadF16())
823 FMad.customFor({S16});
824 FMad.scalarize(0)
825 .lower();
826
827 auto &FRem = getActionDefinitionsBuilder(G_FREM);
828 if (ST.has16BitInsts()) {
829 FRem.customFor({S16, S32, S64});
830 } else {
831 FRem.minScalar(0, S32)
832 .customFor({S32, S64});
833 }
834 FRem.scalarize(0);
835
836 // TODO: Do we need to clamp maximum bitwidth?
837 getActionDefinitionsBuilder(G_TRUNC)
838 .legalIf(isScalar(0))
839 .legalFor({{V2S16, V2S32}})
840 .clampMaxNumElements(0, S16, 2)
841 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
842 // situations (like an invalid implicit use), we don't want to infinite loop
843 // in the legalizer.
844 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
845 .alwaysLegal();
846
847 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
848 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
849 {S32, S1}, {S64, S1}, {S16, S1}})
850 .scalarize(0)
851 .clampScalar(0, S32, S64)
852 .widenScalarToNextPow2(1, 32);
853
854 // TODO: Split s1->s64 during regbankselect for VALU.
855 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
856 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
857 .lowerIf(typeIs(1, S1))
858 .customFor({{S32, S64}, {S64, S64}});
859 if (ST.has16BitInsts())
860 IToFP.legalFor({{S16, S16}});
861 IToFP.clampScalar(1, S32, S64)
862 .minScalar(0, S32)
863 .scalarize(0)
864 .widenScalarToNextPow2(1);
865
866 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
867 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
868 .customFor({{S64, S32}, {S64, S64}})
869 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
870 if (ST.has16BitInsts())
871 FPToI.legalFor({{S16, S16}});
872 else
873 FPToI.minScalar(1, S32);
874
875 FPToI.minScalar(0, S32)
876 .widenScalarToNextPow2(0, 32)
877 .scalarize(0)
878 .lower();
879
880 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
881 .customFor({S16, S32})
882 .scalarize(0)
883 .lower();
884
885 // Lower roundeven into G_FRINT
886 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
887 .scalarize(0)
888 .lower();
889
890 if (ST.has16BitInsts()) {
891 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
892 .legalFor({S16, S32, S64})
893 .clampScalar(0, S16, S64)
894 .scalarize(0);
895 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
896 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
897 .legalFor({S32, S64})
898 .clampScalar(0, S32, S64)
899 .scalarize(0);
900 } else {
901 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
902 .legalFor({S32})
903 .customFor({S64})
904 .clampScalar(0, S32, S64)
905 .scalarize(0);
906 }
907
908 getActionDefinitionsBuilder(G_PTR_ADD)
909 .legalIf(all(isPointer(0), sameSize(0, 1)))
910 .scalarize(0)
911 .scalarSameSizeAs(1, 0);
912
913 getActionDefinitionsBuilder(G_PTRMASK)
914 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
915 .scalarSameSizeAs(1, 0)
916 .scalarize(0);
917
918 auto &CmpBuilder =
919 getActionDefinitionsBuilder(G_ICMP)
920 // The compare output type differs based on the register bank of the output,
921 // so make both s1 and s32 legal.
922 //
923 // Scalar compares producing output in scc will be promoted to s32, as that
924 // is the allocatable register type that will be needed for the copy from
925 // scc. This will be promoted during RegBankSelect, and we assume something
926 // before that won't try to use s32 result types.
927 //
928 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
929 // bank.
930 .legalForCartesianProduct(
931 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
932 .legalForCartesianProduct(
933 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
934 if (ST.has16BitInsts()) {
935 CmpBuilder.legalFor({{S1, S16}});
936 }
937
938 CmpBuilder
939 .widenScalarToNextPow2(1)
940 .clampScalar(1, S32, S64)
941 .scalarize(0)
942 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
943
944 getActionDefinitionsBuilder(G_FCMP)
945 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
946 .widenScalarToNextPow2(1)
947 .clampScalar(1, S32, S64)
948 .scalarize(0);
949
950 // FIXME: fpow has a selection pattern that should move to custom lowering.
951 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
952 if (ST.has16BitInsts())
953 Exp2Ops.legalFor({S32, S16});
954 else
955 Exp2Ops.legalFor({S32});
956 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
957 Exp2Ops.scalarize(0);
958
959 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
960 if (ST.has16BitInsts())
961 ExpOps.customFor({{S32}, {S16}});
962 else
963 ExpOps.customFor({S32});
964 ExpOps.clampScalar(0, MinScalarFPTy, S32)
965 .scalarize(0);
966
967 getActionDefinitionsBuilder(G_FPOWI)
968 .clampScalar(0, MinScalarFPTy, S32)
969 .lower();
970
971 // The 64-bit versions produce 32-bit results, but only on the SALU.
972 getActionDefinitionsBuilder(G_CTPOP)
973 .legalFor({{S32, S32}, {S32, S64}})
974 .clampScalar(0, S32, S32)
975 .widenScalarToNextPow2(1, 32)
976 .clampScalar(1, S32, S64)
977 .scalarize(0)
978 .widenScalarToNextPow2(0, 32);
979
980 // If no 16-bit instructions are available, lower into different instructions.
981 if (ST.has16BitInsts())
982 getActionDefinitionsBuilder(G_IS_FPCLASS)
983 .legalForCartesianProduct({S1}, FPTypes16)
984 .widenScalarToNextPow2(1)
985 .scalarize(0)
986 .lower();
987 else
988 getActionDefinitionsBuilder(G_IS_FPCLASS)
989 .legalForCartesianProduct({S1}, FPTypesBase)
990 .lowerFor({S1, S16})
991 .widenScalarToNextPow2(1)
992 .scalarize(0)
993 .lower();
994
995 // The hardware instructions return a different result on 0 than the generic
996 // instructions expect. The hardware produces -1, but these produce the
997 // bitwidth.
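// e.g. G_CTLZ of an s32 zero is defined to produce 32, while the hardware
// instruction returns -1, hence the custom lowering below.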
998 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
999 .scalarize(0)
1000 .clampScalar(0, S32, S32)
1001 .clampScalar(1, S32, S64)
1002 .widenScalarToNextPow2(0, 32)
1003 .widenScalarToNextPow2(1, 32)
1004 .custom();
1005
1006 // The 64-bit versions produce 32-bit results, but only on the SALU.
1007 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1008 .legalFor({{S32, S32}, {S32, S64}})
1009 .clampScalar(0, S32, S32)
1010 .clampScalar(1, S32, S64)
1011 .scalarize(0)
1012 .widenScalarToNextPow2(0, 32)
1013 .widenScalarToNextPow2(1, 32);
1014
1015 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1016 // RegBankSelect.
1017 getActionDefinitionsBuilder(G_BITREVERSE)
1018 .legalFor({S32, S64})
1019 .clampScalar(0, S32, S64)
1020 .scalarize(0)
1021 .widenScalarToNextPow2(0);
1022
1023 if (ST.has16BitInsts()) {
1024 getActionDefinitionsBuilder(G_BSWAP)
1025 .legalFor({S16, S32, V2S16})
1026 .clampMaxNumElementsStrict(0, S16, 2)
1027 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1028 // narrowScalar limitation.
1029 .widenScalarToNextPow2(0)
1030 .clampScalar(0, S16, S32)
1031 .scalarize(0);
1032
1033 if (ST.hasVOP3PInsts()) {
1034 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1035 .legalFor({S32, S16, V2S16})
1036 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1037 .clampMaxNumElements(0, S16, 2)
1038 .minScalar(0, S16)
1039 .widenScalarToNextPow2(0)
1040 .scalarize(0)
1041 .lower();
1042 } else {
1043 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1044 .legalFor({S32, S16})
1045 .widenScalarToNextPow2(0)
1046 .minScalar(0, S16)
1047 .scalarize(0)
1048 .lower();
1049 }
1050 } else {
1051 // TODO: Should have same legality without v_perm_b32
1052 getActionDefinitionsBuilder(G_BSWAP)
1053 .legalFor({S32})
1054 .lowerIf(scalarNarrowerThan(0, 32))
1055 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1056 // narrowScalar limitation.
1057 .widenScalarToNextPow2(0)
1058 .maxScalar(0, S32)
1059 .scalarize(0)
1060 .lower();
1061
1062 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1063 .legalFor({S32})
1064 .minScalar(0, S32)
1065 .widenScalarToNextPow2(0)
1066 .scalarize(0)
1067 .lower();
1068 }
1069
1070 getActionDefinitionsBuilder(G_INTTOPTR)
1071 // List the common cases
1072 .legalForCartesianProduct(AddrSpaces64, {S64})
1073 .legalForCartesianProduct(AddrSpaces32, {S32})
1074 .scalarize(0)
1075 // Accept any address space as long as the size matches
1076 .legalIf(sameSize(0, 1))
1077 .widenScalarIf(smallerThan(1, 0),
1078 [](const LegalityQuery &Query) {
1079 return std::pair(
1080 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1081 })
1082 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1083 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1084 });
1085
1086 getActionDefinitionsBuilder(G_PTRTOINT)
1087 // List the common cases
1088 .legalForCartesianProduct(AddrSpaces64, {S64})
1089 .legalForCartesianProduct(AddrSpaces32, {S32})
1090 .scalarize(0)
1091 // Accept any address space as long as the size matches
1092 .legalIf(sameSize(0, 1))
1093 .widenScalarIf(smallerThan(0, 1),
1094 [](const LegalityQuery &Query) {
1095 return std::pair(
1096 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1097 })
1098 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1099 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1100 });
1101
1102 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1103 .scalarize(0)
1104 .custom();
1105
1106 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1107 bool IsLoad) -> bool {
1108 const LLT DstTy = Query.Types[0];
1109
1110 // Split vector extloads.
1111 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1112
1113 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1114 return true;
1115
1116 const LLT PtrTy = Query.Types[1];
1117 unsigned AS = PtrTy.getAddressSpace();
1118 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1119 return true;
1120
1121 // Catch weird sized loads that don't evenly divide into the access sizes
1122 // TODO: May be able to widen depending on alignment etc.
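// e.g. a 96-bit access uses 3 dwords and is only OK with dwordx3 support,
// while a 160-bit access (5 dwords) is not a power of 2 and must be split.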
1123 unsigned NumRegs = (MemSize + 31) / 32;
1124 if (NumRegs == 3) {
1125 if (!ST.hasDwordx3LoadStores())
1126 return true;
1127 } else {
1128 // If the alignment allows, these should have been widened.
1129 if (!isPowerOf2_32(NumRegs))
1130 return true;
1131 }
1132
1133 return false;
1134 };
1135
1136 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1137 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1138 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1139
1140 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1141 // LDS
1142 // TODO: Unsupported flat for SI.
1143
1144 for (unsigned Op : {G_LOAD, G_STORE}) {
1145 const bool IsStore = Op == G_STORE;
1146
1147 auto &Actions = getActionDefinitionsBuilder(Op);
1148 // Explicitly list some common cases.
1149 // TODO: Does this help compile time at all?
1150 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1151 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1152 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1153 {S64, GlobalPtr, S64, GlobalAlign32},
1154 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1155 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1156 {S32, GlobalPtr, S8, GlobalAlign8},
1157 {S32, GlobalPtr, S16, GlobalAlign16},
1158
1159 {S32, LocalPtr, S32, 32},
1160 {S64, LocalPtr, S64, 32},
1161 {V2S32, LocalPtr, V2S32, 32},
1162 {S32, LocalPtr, S8, 8},
1163 {S32, LocalPtr, S16, 16},
1164 {V2S16, LocalPtr, S32, 32},
1165
1166 {S32, PrivatePtr, S32, 32},
1167 {S32, PrivatePtr, S8, 8},
1168 {S32, PrivatePtr, S16, 16},
1169 {V2S16, PrivatePtr, S32, 32},
1170
1171 {S32, ConstantPtr, S32, GlobalAlign32},
1172 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1173 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1174 {S64, ConstantPtr, S64, GlobalAlign32},
1175 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1176 Actions.legalIf(
1177 [=](const LegalityQuery &Query) -> bool {
1178 return isLoadStoreLegal(ST, Query);
1179 });
1180
1181 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1182 // 64-bits.
1183 //
1184 // TODO: Should generalize bitcast action into coerce, which will also cover
1185 // inserting addrspacecasts.
1186 Actions.customIf(typeIs(1, Constant32Ptr));
1187
1188 // Turn any illegal element vectors into something easier to deal
1189 // with. These will ultimately produce 32-bit scalar shifts to extract the
1190 // parts anyway.
1191 //
1192 // For odd 16-bit element vectors, prefer to split those into pieces with
1193 // 16-bit vector parts.
1194 Actions.bitcastIf(
1195 [=](const LegalityQuery &Query) -> bool {
1196 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1197 Query.MMODescrs[0].MemoryTy);
1198 }, bitcastToRegisterType(0));
1199
1200 if (!IsStore) {
1201 // Widen suitably aligned loads by loading extra bytes. The standard
1202 // legalization actions can't properly express widening memory operands.
1203 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1204 return shouldWidenLoad(ST, Query, G_LOAD);
1205 });
1206 }
1207
1208 // FIXME: load/store narrowing should be moved to lower action
1209 Actions
1210 .narrowScalarIf(
1211 [=](const LegalityQuery &Query) -> bool {
1212 return !Query.Types[0].isVector() &&
1213 needToSplitMemOp(Query, Op == G_LOAD);
1214 },
1215 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1216 const LLT DstTy = Query.Types[0];
1217 const LLT PtrTy = Query.Types[1];
1218
1219 const unsigned DstSize = DstTy.getSizeInBits();
1220 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1221
1222 // Split extloads.
1223 if (DstSize > MemSize)
1224 return std::pair(0, LLT::scalar(MemSize));
1225
1226 unsigned MaxSize = maxSizeForAddrSpace(ST,
1227 PtrTy.getAddressSpace(),
1228 Op == G_LOAD);
1229 if (MemSize > MaxSize)
1230 return std::pair(0, LLT::scalar(MaxSize));
1231
1232 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1233 return std::pair(0, LLT::scalar(Align));
1234 })
1235 .fewerElementsIf(
1236 [=](const LegalityQuery &Query) -> bool {
1237 return Query.Types[0].isVector() &&
1238 needToSplitMemOp(Query, Op == G_LOAD);
1239 },
1240 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1241 const LLT DstTy = Query.Types[0];
1242 const LLT PtrTy = Query.Types[1];
1243
1244 LLT EltTy = DstTy.getElementType();
1245 unsigned MaxSize = maxSizeForAddrSpace(ST,
1246 PtrTy.getAddressSpace(),
1247 Op == G_LOAD);
1248
1249 // FIXME: Handle widened to power of 2 results better. This ends
1250 // up scalarizing.
1251 // FIXME: 3 element stores scalarized on SI
1252
1253 // Split if it's too large for the address space.
1254 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1255 if (MemSize > MaxSize) {
1256 unsigned NumElts = DstTy.getNumElements();
1257 unsigned EltSize = EltTy.getSizeInBits();
1258
1259 if (MaxSize % EltSize == 0) {
1260 return std::pair(
1261 0, LLT::scalarOrVector(
1262 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1263 }
1264
1265 unsigned NumPieces = MemSize / MaxSize;
1266
1267 // FIXME: Refine when odd breakdowns handled
1268 // The scalars will need to be re-legalized.
1269 if (NumPieces == 1 || NumPieces >= NumElts ||
1270 NumElts % NumPieces != 0)
1271 return std::pair(0, EltTy);
1272
1273 return std::pair(0,
1274 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1275 }
1276
1277 // FIXME: We could probably handle weird extending loads better.
1278 if (DstTy.getSizeInBits() > MemSize)
1279 return std::pair(0, EltTy);
1280
1281 unsigned EltSize = EltTy.getSizeInBits();
1282 unsigned DstSize = DstTy.getSizeInBits();
1283 if (!isPowerOf2_32(DstSize)) {
1284 // We're probably decomposing an odd sized store. Try to split
1285 // to the widest type. TODO: Account for alignment. As-is it
1286 // should be OK, since the new parts will be further legalized.
1287 unsigned FloorSize = PowerOf2Floor(DstSize);
1288 return std::pair(
1289 0, LLT::scalarOrVector(
1290 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1291 }
1292
1293 // May need relegalization for the scalars.
1294 return std::pair(0, EltTy);
1295 })
1296 .minScalar(0, S32)
1297 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1298 .widenScalarToNextPow2(0)
1299 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1300 .lower();
1301 }
1302
1303 // FIXME: Unaligned accesses not lowered.
1304 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1305 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1306 {S32, GlobalPtr, S16, 2 * 8},
1307 {S32, LocalPtr, S8, 8},
1308 {S32, LocalPtr, S16, 16},
1309 {S32, PrivatePtr, S8, 8},
1310 {S32, PrivatePtr, S16, 16},
1311 {S32, ConstantPtr, S8, 8},
1312 {S32, ConstantPtr, S16, 2 * 8}})
1313 .legalIf(
1314 [=](const LegalityQuery &Query) -> bool {
1315 return isLoadStoreLegal(ST, Query);
1316 });
1317
1318 if (ST.hasFlatAddressSpace()) {
1319 ExtLoads.legalForTypesWithMemDesc(
1320 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1321 }
1322
1323 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1324 // 64-bits.
1325 //
1326 // TODO: Should generalize bitcast action into coerce, which will also cover
1327 // inserting addrspacecasts.
1328 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1329
1330 ExtLoads.clampScalar(0, S32, S32)
1331 .widenScalarToNextPow2(0)
1332 .lower();
1333
1334 auto &Atomics = getActionDefinitionsBuilder(
1335 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1336 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1337 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1338 G_ATOMICRMW_UMIN})
1339 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1340 {S64, GlobalPtr}, {S64, LocalPtr},
1341 {S32, RegionPtr}, {S64, RegionPtr}});
1342 if (ST.hasFlatAddressSpace()) {
1343 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1344 }
1345
1346 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1347 if (ST.hasLDSFPAtomicAdd()) {
1348 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1349 if (ST.hasGFX90AInsts())
1350 Atomic.legalFor({{S64, LocalPtr}});
1351 if (ST.hasGFX940Insts())
1352 Atomic.legalFor({{V2S16, LocalPtr}});
1353 }
1354 if (ST.hasAtomicFaddInsts())
1355 Atomic.legalFor({{S32, GlobalPtr}});
1356 if (ST.hasFlatAtomicFaddF32Inst())
1357 Atomic.legalFor({{S32, FlatPtr}});
1358
1359 if (ST.hasGFX90AInsts()) {
1360 // These are legal with some caveats, and should have undergone expansion in
1361 // the IR in most situations
1362 // TODO: Move atomic expansion into legalizer
1363 Atomic.legalFor({
1364 {S32, GlobalPtr},
1365 {S64, GlobalPtr},
1366 {S64, FlatPtr}
1367 });
1368 }
1369
1370 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1371 // demarshalling
1372 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1373 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1374 {S32, FlatPtr}, {S64, FlatPtr}})
1375 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1376 {S32, RegionPtr}, {S64, RegionPtr}});
1377 // TODO: Pointer types, any 32-bit or 64-bit vector
1378
1379 // Condition should be s32 for scalar, s1 for vector.
1380 getActionDefinitionsBuilder(G_SELECT)
1381 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1382 LocalPtr, FlatPtr, PrivatePtr,
1383 LLT::fixed_vector(2, LocalPtr),
1384 LLT::fixed_vector(2, PrivatePtr)},
1385 {S1, S32})
1386 .clampScalar(0, S16, S64)
1387 .scalarize(1)
1388 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1389 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1390 .clampMaxNumElements(0, S32, 2)
1391 .clampMaxNumElements(0, LocalPtr, 2)
1392 .clampMaxNumElements(0, PrivatePtr, 2)
1393 .scalarize(0)
1394 .widenScalarToNextPow2(0)
1395 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1396
1397 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1398 // be more flexible with the shift amount type.
1399 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1400 .legalFor({{S32, S32}, {S64, S32}});
1401 if (ST.has16BitInsts()) {
1402 if (ST.hasVOP3PInsts()) {
1403 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1404 .clampMaxNumElements(0, S16, 2);
1405 } else
1406 Shifts.legalFor({{S16, S16}});
1407
1408 // TODO: Support 16-bit shift amounts for all types
1409 Shifts.widenScalarIf(
1410 [=](const LegalityQuery &Query) {
1411 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1412 // 32-bit amount.
1413 const LLT ValTy = Query.Types[0];
1414 const LLT AmountTy = Query.Types[1];
1415 return ValTy.getSizeInBits() <= 16 &&
1416 AmountTy.getSizeInBits() < 16;
1417 }, changeTo(1, S16));
1418 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1419 Shifts.clampScalar(1, S32, S32);
1420 Shifts.widenScalarToNextPow2(0, 16);
1421 Shifts.clampScalar(0, S16, S64);
1422
1423 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1424 .minScalar(0, S16)
1425 .scalarize(0)
1426 .lower();
1427 } else {
1428 // Make sure we legalize the shift amount type first, as the general
1429 // expansion for the shifted type will produce much worse code if it hasn't
1430 // been truncated already.
1431 Shifts.clampScalar(1, S32, S32);
1432 Shifts.widenScalarToNextPow2(0, 32);
1433 Shifts.clampScalar(0, S32, S64);
1434
1435 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1436 .minScalar(0, S32)
1437 .scalarize(0)
1438 .lower();
1439 }
1440 Shifts.scalarize(0);
1441
1442 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1443 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1444 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1445 unsigned IdxTypeIdx = 2;
1446
1447 getActionDefinitionsBuilder(Op)
1448 .customIf([=](const LegalityQuery &Query) {
1449 const LLT EltTy = Query.Types[EltTypeIdx];
1450 const LLT VecTy = Query.Types[VecTypeIdx];
1451 const LLT IdxTy = Query.Types[IdxTypeIdx];
1452 const unsigned EltSize = EltTy.getSizeInBits();
1453 return (EltSize == 32 || EltSize == 64) &&
1454 VecTy.getSizeInBits() % 32 == 0 &&
1455 VecTy.getSizeInBits() <= MaxRegisterSize &&
1456 IdxTy.getSizeInBits() == 32;
1457 })
1458 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1459 bitcastToVectorElement32(VecTypeIdx))
1460 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1461 .bitcastIf(
1462 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1463 [=](const LegalityQuery &Query) {
1464 // For > 64-bit element types, try to turn this into a 64-bit
1465 // element vector since we may be able to do better indexing
1466 // if this is scalar. If not, fall back to 32.
1467 const LLT EltTy = Query.Types[EltTypeIdx];
1468 const LLT VecTy = Query.Types[VecTypeIdx];
1469 const unsigned DstEltSize = EltTy.getSizeInBits();
1470 const unsigned VecSize = VecTy.getSizeInBits();
1471
1472 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1473 return std::pair(
1474 VecTypeIdx,
1475 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1476 })
1477 .clampScalar(EltTypeIdx, S32, S64)
1478 .clampScalar(VecTypeIdx, S32, S64)
1479 .clampScalar(IdxTypeIdx, S32, S32)
1480 .clampMaxNumElements(VecTypeIdx, S32, 32)
1481 // TODO: Clamp elements for 64-bit vectors?
1482 // It should only be necessary with variable indexes.
1483 // As a last resort, lower to the stack
1484 .lower();
1485 }
1486
1487 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1488 .unsupportedIf([=](const LegalityQuery &Query) {
1489 const LLT &EltTy = Query.Types[1].getElementType();
1490 return Query.Types[0] != EltTy;
1491 });
1492
1493 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1494 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1495 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1496
1497 // FIXME: Doesn't handle extract of illegal sizes.
1498 getActionDefinitionsBuilder(Op)
1499 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1500 .lowerIf([=](const LegalityQuery &Query) {
1501 // Sub-vector (or single element) insert and extract.
1502 // TODO: verify immediate offset here since lower only works with
1503 // whole elements.
1504 const LLT BigTy = Query.Types[BigTyIdx];
1505 return BigTy.isVector();
1506 })
1507 // FIXME: Multiples of 16 should not be legal.
1508 .legalIf([=](const LegalityQuery &Query) {
1509 const LLT BigTy = Query.Types[BigTyIdx];
1510 const LLT LitTy = Query.Types[LitTyIdx];
1511 return (BigTy.getSizeInBits() % 32 == 0) &&
1512 (LitTy.getSizeInBits() % 16 == 0);
1513 })
1514 .widenScalarIf(
1515 [=](const LegalityQuery &Query) {
1516 const LLT BigTy = Query.Types[BigTyIdx];
1517 return (BigTy.getScalarSizeInBits() < 16);
1518 },
1519 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1520 .widenScalarIf(
1521 [=](const LegalityQuery &Query) {
1522 const LLT LitTy = Query.Types[LitTyIdx];
1523 return (LitTy.getScalarSizeInBits() < 16);
1524 },
1525 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1526 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1527 .widenScalarToNextPow2(BigTyIdx, 32);
1528
1529 }
1530
1531 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1532 .legalForCartesianProduct(AllS32Vectors, {S32})
1533 .legalForCartesianProduct(AllS64Vectors, {S64})
1534 .clampNumElements(0, V16S32, V32S32)
1535 .clampNumElements(0, V2S64, V16S64)
1536 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1537
1538 if (ST.hasScalarPackInsts()) {
1539 BuildVector
1540 // FIXME: Should probably widen s1 vectors straight to s32
1541 .minScalarOrElt(0, S16)
1542 .minScalar(1, S16);
1543
1544 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1545 .legalFor({V2S16, S32})
1546 .lower();
1547 } else {
1548 BuildVector.customFor({V2S16, S16});
1549 BuildVector.minScalarOrElt(0, S32);
1550
1551 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1552 .customFor({V2S16, S32})
1553 .lower();
1554 }
1555
1556 BuildVector.legalIf(isRegisterType(0));
1557
1558 // FIXME: Clamp maximum size
1559 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1560 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1561 .clampMaxNumElements(0, S32, 32)
1562 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1563 .clampMaxNumElements(0, S16, 64);
1564
1565 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1566
1567 // Merge/Unmerge
1568 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1569 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1570 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1571
1572 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1573 const LLT Ty = Query.Types[TypeIdx];
1574 if (Ty.isVector()) {
1575 const LLT &EltTy = Ty.getElementType();
1576 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1577 return true;
1578 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1579 return true;
1580 }
1581 return false;
1582 };
1583
1584 auto &Builder = getActionDefinitionsBuilder(Op)
1585 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1586 .lowerFor({{S16, V2S16}})
1587 .lowerIf([=](const LegalityQuery &Query) {
1588 const LLT BigTy = Query.Types[BigTyIdx];
1589 return BigTy.getSizeInBits() == 32;
1590 })
1591 // Try to widen to s16 first for small types.
1592 // TODO: Only do this on targets with legal s16 shifts
1593 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1594 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1595 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1596 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1597 elementTypeIs(1, S16)),
1598 changeTo(1, V2S16))
1599 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1600 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1601 // valid.
1602 .clampScalar(LitTyIdx, S32, S512)
1603 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1604 // Break up vectors with weird elements into scalars
1605 .fewerElementsIf(
1606 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1607 scalarize(0))
1608 .fewerElementsIf(
1609 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1610 scalarize(1))
1611 .clampScalar(BigTyIdx, S32, MaxScalar);
1612
1613 if (Op == G_MERGE_VALUES) {
1614 Builder.widenScalarIf(
1615 // TODO: Use 16-bit shifts if legal for 8-bit values?
1616 [=](const LegalityQuery &Query) {
1617 const LLT Ty = Query.Types[LitTyIdx];
1618 return Ty.getSizeInBits() < 32;
1619 },
1620 changeTo(LitTyIdx, S32));
1621 }
1622
1623 Builder.widenScalarIf(
1624 [=](const LegalityQuery &Query) {
1625 const LLT Ty = Query.Types[BigTyIdx];
1626 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1627 Ty.getSizeInBits() % 16 != 0;
1628 },
1629 [=](const LegalityQuery &Query) {
1630 // Pick the next power of 2, or a multiple of 64 over 128.
1631 // Whichever is smaller.
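          // Illustrative examples (hypothetical query sizes): an s65 big type
          // widens to s128, the next power of 2, which is still below 256; an
          // s300 type widens to s320, since alignTo<64>(301) == 320 is smaller
          // than the next power of 2 (512).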
1632 const LLT &Ty = Query.Types[BigTyIdx];
1633 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1634 if (NewSizeInBits >= 256) {
1635 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1636 if (RoundedTo < NewSizeInBits)
1637 NewSizeInBits = RoundedTo;
1638 }
1639 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1640 })
1641 // Any vectors left are the wrong size. Scalarize them.
1642 .scalarize(0)
1643 .scalarize(1);
1644 }
1645
1646 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1647 // RegBankSelect.
1648 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1649 .legalFor({{S32}, {S64}});
1650
1651 if (ST.hasVOP3PInsts()) {
1652 SextInReg.lowerFor({{V2S16}})
1653 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1654 // get more vector shift opportunities, since we'll get those when
1655 // expanded.
1656 .clampMaxNumElementsStrict(0, S16, 2);
1657 } else if (ST.has16BitInsts()) {
1658 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1659 } else {
1660 // Prefer to promote to s32 before lowering if we don't have 16-bit
1661     // shifts. This avoids a lot of intermediate truncate and extend operations.
1662 SextInReg.lowerFor({{S32}, {S64}});
1663 }
1664
1665 SextInReg
1666 .scalarize(0)
1667 .clampScalar(0, S32, S64)
1668 .lower();
1669
1670 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1671 .scalarize(0)
1672 .lower();
1673
1674   // TODO: Only try to form v2s16 with legal packed instructions.
1675 getActionDefinitionsBuilder(G_FSHR)
1676 .legalFor({{S32, S32}})
1677 .lowerFor({{V2S16, V2S16}})
1678 .clampMaxNumElementsStrict(0, S16, 2)
1679 .scalarize(0)
1680 .lower();
1681
1682 if (ST.hasVOP3PInsts()) {
1683 getActionDefinitionsBuilder(G_FSHL)
1684 .lowerFor({{V2S16, V2S16}})
1685 .clampMaxNumElementsStrict(0, S16, 2)
1686 .scalarize(0)
1687 .lower();
1688 } else {
1689 getActionDefinitionsBuilder(G_FSHL)
1690 .scalarize(0)
1691 .lower();
1692 }
1693
1694 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1695 .legalFor({S64});
1696
1697 getActionDefinitionsBuilder(G_FENCE)
1698 .alwaysLegal();
1699
1700 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1701 .scalarize(0)
1702 .minScalar(0, S32)
1703 .lower();
1704
1705 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1706 .legalFor({{S32, S32}, {S64, S32}})
1707 .clampScalar(1, S32, S32)
1708 .clampScalar(0, S32, S64)
1709 .widenScalarToNextPow2(0)
1710 .scalarize(0);
1711
1712 getActionDefinitionsBuilder({
1713 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1714 G_FCOPYSIGN,
1715
1716 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1717 G_ATOMICRMW_NAND,
1718 G_ATOMICRMW_FSUB,
1719 G_READ_REGISTER,
1720 G_WRITE_REGISTER,
1721
1722 G_SADDO, G_SSUBO,
1723
1724 // TODO: Implement
1725 G_FMINIMUM, G_FMAXIMUM}).lower();
1726
1727 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1728 .lower();
1729
1730 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1731 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1732 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1733 .unsupported();
1734
1735 getLegacyLegalizerInfo().computeTables();
1736 verify(*ST.getInstrInfo());
1737 }
1738
1739 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1740 MachineInstr &MI) const {
1741 MachineIRBuilder &B = Helper.MIRBuilder;
1742 MachineRegisterInfo &MRI = *B.getMRI();
1743
1744 switch (MI.getOpcode()) {
1745 case TargetOpcode::G_ADDRSPACE_CAST:
1746 return legalizeAddrSpaceCast(MI, MRI, B);
1747 case TargetOpcode::G_FRINT:
1748 return legalizeFrint(MI, MRI, B);
1749 case TargetOpcode::G_FCEIL:
1750 return legalizeFceil(MI, MRI, B);
1751 case TargetOpcode::G_FREM:
1752 return legalizeFrem(MI, MRI, B);
1753 case TargetOpcode::G_INTRINSIC_TRUNC:
1754 return legalizeIntrinsicTrunc(MI, MRI, B);
1755 case TargetOpcode::G_SITOFP:
1756 return legalizeITOFP(MI, MRI, B, true);
1757 case TargetOpcode::G_UITOFP:
1758 return legalizeITOFP(MI, MRI, B, false);
1759 case TargetOpcode::G_FPTOSI:
1760 return legalizeFPTOI(MI, MRI, B, true);
1761 case TargetOpcode::G_FPTOUI:
1762 return legalizeFPTOI(MI, MRI, B, false);
1763 case TargetOpcode::G_FMINNUM:
1764 case TargetOpcode::G_FMAXNUM:
1765 case TargetOpcode::G_FMINNUM_IEEE:
1766 case TargetOpcode::G_FMAXNUM_IEEE:
1767 return legalizeMinNumMaxNum(Helper, MI);
1768 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1769 return legalizeExtractVectorElt(MI, MRI, B);
1770 case TargetOpcode::G_INSERT_VECTOR_ELT:
1771 return legalizeInsertVectorElt(MI, MRI, B);
1772 case TargetOpcode::G_FSIN:
1773 case TargetOpcode::G_FCOS:
1774 return legalizeSinCos(MI, MRI, B);
1775 case TargetOpcode::G_GLOBAL_VALUE:
1776 return legalizeGlobalValue(MI, MRI, B);
1777 case TargetOpcode::G_LOAD:
1778 case TargetOpcode::G_SEXTLOAD:
1779 case TargetOpcode::G_ZEXTLOAD:
1780 return legalizeLoad(Helper, MI);
1781 case TargetOpcode::G_FMAD:
1782 return legalizeFMad(MI, MRI, B);
1783 case TargetOpcode::G_FDIV:
1784 return legalizeFDIV(MI, MRI, B);
1785 case TargetOpcode::G_UDIV:
1786 case TargetOpcode::G_UREM:
1787 case TargetOpcode::G_UDIVREM:
1788 return legalizeUnsignedDIV_REM(MI, MRI, B);
1789 case TargetOpcode::G_SDIV:
1790 case TargetOpcode::G_SREM:
1791 case TargetOpcode::G_SDIVREM:
1792 return legalizeSignedDIV_REM(MI, MRI, B);
1793 case TargetOpcode::G_ATOMIC_CMPXCHG:
1794 return legalizeAtomicCmpXChg(MI, MRI, B);
1795 case TargetOpcode::G_FLOG:
1796 return legalizeFlog(MI, B, numbers::ln2f);
1797 case TargetOpcode::G_FLOG10:
1798 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1799 case TargetOpcode::G_FEXP:
1800 return legalizeFExp(MI, B);
1801 case TargetOpcode::G_FPOW:
1802 return legalizeFPow(MI, B);
1803 case TargetOpcode::G_FFLOOR:
1804 return legalizeFFloor(MI, MRI, B);
1805 case TargetOpcode::G_BUILD_VECTOR:
1806 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
1807 return legalizeBuildVector(MI, MRI, B);
1808 case TargetOpcode::G_MUL:
1809 return legalizeMul(Helper, MI);
1810 case TargetOpcode::G_CTLZ:
1811 case TargetOpcode::G_CTTZ:
1812 return legalizeCTLZ_CTTZ(MI, MRI, B);
1813 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1814 return legalizeFPTruncRound(MI, B);
1815 default:
1816 return false;
1817 }
1818
1819 llvm_unreachable("expected switch to return");
1820 }
1821
1822 Register AMDGPULegalizerInfo::getSegmentAperture(
1823 unsigned AS,
1824 MachineRegisterInfo &MRI,
1825 MachineIRBuilder &B) const {
1826 MachineFunction &MF = B.getMF();
1827 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1828 const LLT S32 = LLT::scalar(32);
1829 const LLT S64 = LLT::scalar(64);
1830
1831 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1832
1833 if (ST.hasApertureRegs()) {
1834 // Note: this register is somewhat broken. When used as a 32-bit operand,
1835 // it only returns zeroes. The real value is in the upper 32 bits.
1836     // Thus, we must extract the high 32 bits.
1837 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
1838 ? AMDGPU::SRC_SHARED_BASE
1839 : AMDGPU::SRC_PRIVATE_BASE;
1840 // FIXME: It would be more natural to emit a COPY here, but then copy
1841 // coalescing would kick in and it would think it's okay to use the "HI"
1842 // subregister (instead of extracting the HI 32 bits) which is an artificial
1843 // (unusable) register.
1844 // Register TableGen definitions would need an overhaul to get rid of the
1845 // artificial "HI" aperture registers and prevent this kind of issue from
1846 // happening.
1847 Register Dst = MRI.createGenericVirtualRegister(S64);
1848 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
1849 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
1850 return B.buildUnmerge(S32, Dst).getReg(1);
1851 }
1852
1853 // TODO: can we be smarter about machine pointer info?
1854 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1855 Register LoadAddr = MRI.createGenericVirtualRegister(
1856 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1857 // For code object version 5, private_base and shared_base are passed through
1858 // implicit kernargs.
1859 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1860 AMDGPUTargetLowering::ImplicitParameter Param =
1861 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1862 : AMDGPUTargetLowering::PRIVATE_BASE;
1863 uint64_t Offset =
1864 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1865
1866 Register KernargPtrReg = MRI.createGenericVirtualRegister(
1867 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1868
1869 if (!loadInputValue(KernargPtrReg, B,
1870 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1871 return Register();
1872
1873 MachineMemOperand *MMO = MF.getMachineMemOperand(
1874 PtrInfo,
1875 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1876 MachineMemOperand::MOInvariant,
1877 LLT::scalar(32), commonAlignment(Align(64), Offset));
1878
1879 // Pointer address
1880 B.buildPtrAdd(LoadAddr, KernargPtrReg,
1881 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1882 // Load address
1883 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1884 }
1885
1886 Register QueuePtr = MRI.createGenericVirtualRegister(
1887 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1888
1889 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1890 return Register();
1891
1892 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1893 // private_segment_aperture_base_hi.
1894 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1895
1896 MachineMemOperand *MMO = MF.getMachineMemOperand(
1897 PtrInfo,
1898 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1899 MachineMemOperand::MOInvariant,
1900 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1901
1902 B.buildPtrAdd(LoadAddr, QueuePtr,
1903 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1904 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1905 }
1906
1907 /// Return true if the value is a known valid address, such that a null check is
1908 /// not necessary.
1909 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1910 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1911 MachineInstr *Def = MRI.getVRegDef(Val);
1912 switch (Def->getOpcode()) {
1913 case AMDGPU::G_FRAME_INDEX:
1914 case AMDGPU::G_GLOBAL_VALUE:
1915 case AMDGPU::G_BLOCK_ADDR:
1916 return true;
1917 case AMDGPU::G_CONSTANT: {
1918 const ConstantInt *CI = Def->getOperand(1).getCImm();
1919 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1920 }
1921 default:
1922 return false;
1923 }
1924
1925 return false;
1926 }
1927
1928 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1929 MachineInstr &MI, MachineRegisterInfo &MRI,
1930 MachineIRBuilder &B) const {
1931 MachineFunction &MF = B.getMF();
1932
1933 const LLT S32 = LLT::scalar(32);
1934 Register Dst = MI.getOperand(0).getReg();
1935 Register Src = MI.getOperand(1).getReg();
1936
1937 LLT DstTy = MRI.getType(Dst);
1938 LLT SrcTy = MRI.getType(Src);
1939 unsigned DestAS = DstTy.getAddressSpace();
1940 unsigned SrcAS = SrcTy.getAddressSpace();
1941
1942 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1943 // vector element.
1944 assert(!DstTy.isVector());
1945
1946 const AMDGPUTargetMachine &TM
1947 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1948
1949 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1950 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1951 return true;
1952 }
1953
1954 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1955 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1956 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1957 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1958 // Extract low 32-bits of the pointer.
1959 B.buildExtract(Dst, Src, 0);
1960 MI.eraseFromParent();
1961 return true;
1962 }
1963
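    // Otherwise the source may be the flat null pointer, so compare against
    // it and select between the truncated pointer and the segment's own null
    // value (which for local/private is all ones rather than zero).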
1964 unsigned NullVal = TM.getNullPointerValue(DestAS);
1965
1966 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1967 auto FlatNull = B.buildConstant(SrcTy, 0);
1968
1969 // Extract low 32-bits of the pointer.
1970 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1971
1972 auto CmpRes =
1973 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1974 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1975
1976 MI.eraseFromParent();
1977 return true;
1978 }
1979
1980 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1981 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1982 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1983 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1984 if (!ApertureReg.isValid())
1985 return false;
1986
1987 // Coerce the type of the low half of the result so we can use merge_values.
1988 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1989
1990 // TODO: Should we allow mismatched types but matching sizes in merges to
1991 // avoid the ptrtoint?
1992 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
1993
1994 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1995 B.buildCopy(Dst, BuildPtr);
1996 MI.eraseFromParent();
1997 return true;
1998 }
1999
2000 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2001 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2002
2003 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2004 SegmentNull.getReg(0));
2005
2006 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2007
2008 MI.eraseFromParent();
2009 return true;
2010 }
2011
2012 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2013 SrcTy.getSizeInBits() == 64) {
2014 // Truncate.
2015 B.buildExtract(Dst, Src, 0);
2016 MI.eraseFromParent();
2017 return true;
2018 }
2019
2020 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2021 DstTy.getSizeInBits() == 64) {
2022 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2023 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2024 auto PtrLo = B.buildPtrToInt(S32, Src);
2025 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2026 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2027 MI.eraseFromParent();
2028 return true;
2029 }
2030
2031 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2032 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2033
2034 LLVMContext &Ctx = MF.getFunction().getContext();
2035 Ctx.diagnose(InvalidAddrSpaceCast);
2036 B.buildUndef(Dst);
2037 MI.eraseFromParent();
2038 return true;
2039 }
2040
2041 bool AMDGPULegalizerInfo::legalizeFrint(
2042 MachineInstr &MI, MachineRegisterInfo &MRI,
2043 MachineIRBuilder &B) const {
2044 Register Src = MI.getOperand(1).getReg();
2045 LLT Ty = MRI.getType(Src);
2046 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2047
2048 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2049 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2050
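  // This is the usual magic-number trick: adding 2^52 (carrying the sign of
  // the input) forces the fraction bits to be rounded off, and subtracting it
  // again leaves rint(src). Inputs whose magnitude exceeds
  // 0x1.fffffffffffffp+51 are already integral and are passed through by the
  // final select.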
2051 auto C1 = B.buildFConstant(Ty, C1Val);
2052 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2053
2054 // TODO: Should this propagate fast-math-flags?
2055 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2056 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2057
2058 auto C2 = B.buildFConstant(Ty, C2Val);
2059 auto Fabs = B.buildFAbs(Ty, Src);
2060
2061 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2062 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2063 MI.eraseFromParent();
2064 return true;
2065 }
2066
2067 bool AMDGPULegalizerInfo::legalizeFceil(
2068 MachineInstr &MI, MachineRegisterInfo &MRI,
2069 MachineIRBuilder &B) const {
2070
2071 const LLT S1 = LLT::scalar(1);
2072 const LLT S64 = LLT::scalar(64);
2073
2074 Register Src = MI.getOperand(1).getReg();
2075 assert(MRI.getType(Src) == S64);
2076
2077 // result = trunc(src)
2078 // if (src > 0.0 && src != result)
2079 // result += 1.0
2080
2081 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2082
2083 const auto Zero = B.buildFConstant(S64, 0.0);
2084 const auto One = B.buildFConstant(S64, 1.0);
2085 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2086 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2087 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2088 auto Add = B.buildSelect(S64, And, One, Zero);
2089
2090 // TODO: Should this propagate fast-math-flags?
2091 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2092 MI.eraseFromParent();
2093 return true;
2094 }
2095
2096 bool AMDGPULegalizerInfo::legalizeFrem(
2097 MachineInstr &MI, MachineRegisterInfo &MRI,
2098 MachineIRBuilder &B) const {
2099 Register DstReg = MI.getOperand(0).getReg();
2100 Register Src0Reg = MI.getOperand(1).getReg();
2101 Register Src1Reg = MI.getOperand(2).getReg();
2102 auto Flags = MI.getFlags();
2103 LLT Ty = MRI.getType(DstReg);
2104
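  // frem(x, y) == x - trunc(x / y) * y, expressed below as
  // fma(-trunc(x / y), y, x).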
2105 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2106 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2107 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2108 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2109 MI.eraseFromParent();
2110 return true;
2111 }
2112
2113 static MachineInstrBuilder extractF64Exponent(Register Hi,
2114 MachineIRBuilder &B) {
2115 const unsigned FractBits = 52;
2116 const unsigned ExpBits = 11;
2117 LLT S32 = LLT::scalar(32);
2118
2119 auto Const0 = B.buildConstant(S32, FractBits - 32);
2120 auto Const1 = B.buildConstant(S32, ExpBits);
2121
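  // amdgcn.ubfe(Hi, 20, 11) extracts the 11-bit biased exponent field (bits
  // [62:52] of the f64, i.e. bits [30:20] of the high dword); subtracting
  // 1023 removes the IEEE-754 bias.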
2122 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2123 .addUse(Hi)
2124 .addUse(Const0.getReg(0))
2125 .addUse(Const1.getReg(0));
2126
2127 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2128 }
2129
2130 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2131 MachineInstr &MI, MachineRegisterInfo &MRI,
2132 MachineIRBuilder &B) const {
2133 const LLT S1 = LLT::scalar(1);
2134 const LLT S32 = LLT::scalar(32);
2135 const LLT S64 = LLT::scalar(64);
2136
2137 Register Src = MI.getOperand(1).getReg();
2138 assert(MRI.getType(Src) == S64);
2139
2140 // TODO: Should this use extract since the low half is unused?
2141 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2142 Register Hi = Unmerge.getReg(1);
2143
2144 // Extract the upper half, since this is where we will find the sign and
2145 // exponent.
2146 auto Exp = extractF64Exponent(Hi, B);
2147
2148 const unsigned FractBits = 52;
2149
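  // The idea (mirroring the DAG lowering): clear the fraction bits that lie
  // below the binary point, i.e. trunc(x) = x & ~(FractMask >> exp). An
  // exponent below 0 produces a signed zero, and an exponent above 51 means
  // the value is already integral.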
2150 // Extract the sign bit.
2151 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2152 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2153
2154 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2155
2156 const auto Zero32 = B.buildConstant(S32, 0);
2157
2158 // Extend back to 64-bits.
2159 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2160
2161 auto Shr = B.buildAShr(S64, FractMask, Exp);
2162 auto Not = B.buildNot(S64, Shr);
2163 auto Tmp0 = B.buildAnd(S64, Src, Not);
2164 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2165
2166 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2167 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2168
2169 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2170 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2171 MI.eraseFromParent();
2172 return true;
2173 }
2174
2175 bool AMDGPULegalizerInfo::legalizeITOFP(
2176 MachineInstr &MI, MachineRegisterInfo &MRI,
2177 MachineIRBuilder &B, bool Signed) const {
2178
2179 Register Dst = MI.getOperand(0).getReg();
2180 Register Src = MI.getOperand(1).getReg();
2181
2182 const LLT S64 = LLT::scalar(64);
2183 const LLT S32 = LLT::scalar(32);
2184
2185 assert(MRI.getType(Src) == S64);
2186
2187 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2188 auto ThirtyTwo = B.buildConstant(S32, 32);
2189
2190 if (MRI.getType(Dst) == S64) {
2191 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2192 : B.buildUITOFP(S64, Unmerge.getReg(1));
2193
2194 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2195 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2196 .addUse(CvtHi.getReg(0))
2197 .addUse(ThirtyTwo.getReg(0));
2198
2199 // TODO: Should this propagate fast-math-flags?
2200 B.buildFAdd(Dst, LdExp, CvtLo);
2201 MI.eraseFromParent();
2202 return true;
2203 }
2204
2205 assert(MRI.getType(Dst) == S32);
2206
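  // s64 -> 32-bit float: normalize the input so its most significant
  // (non-sign) bit lands in the high word, convert the high 32 bits with a
  // sticky bit OR'd in from the low word, then rescale the result with
  // ldexp(fval, 32 - shamt).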
2207 auto One = B.buildConstant(S32, 1);
2208
2209 MachineInstrBuilder ShAmt;
2210 if (Signed) {
2211 auto ThirtyOne = B.buildConstant(S32, 31);
2212 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2213 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2214 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2215 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2216 /*HasSideEffects=*/false)
2217 .addUse(Unmerge.getReg(1));
2218 auto LS2 = B.buildSub(S32, LS, One);
2219 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2220 } else
2221 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2222 auto Norm = B.buildShl(S64, Src, ShAmt);
2223 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2224 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2225 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2226 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2227 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2228 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2229 /*HasSideEffects=*/false)
2230 .addUse(FVal.getReg(0))
2231 .addUse(Scale.getReg(0));
2232 MI.eraseFromParent();
2233 return true;
2234 }
2235
2236 // TODO: Copied from DAG implementation. Verify logic and document how this
2237 // actually works.
2238 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2239 MachineRegisterInfo &MRI,
2240 MachineIRBuilder &B,
2241 bool Signed) const {
2242
2243 Register Dst = MI.getOperand(0).getReg();
2244 Register Src = MI.getOperand(1).getReg();
2245
2246 const LLT S64 = LLT::scalar(64);
2247 const LLT S32 = LLT::scalar(32);
2248
2249 const LLT SrcLT = MRI.getType(Src);
2250 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2251
2252 unsigned Flags = MI.getFlags();
2253
2254 // The basic idea of converting a floating point number into a pair of 32-bit
2255 // integers is illustrated as follows:
2256 //
2257 // tf := trunc(val);
2258 // hif := floor(tf * 2^-32);
2259 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2260 // hi := fptoi(hif);
2261 // lo := fptoi(lof);
2262 //
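  // Illustrative example (exact in f64): for val = 2^40 + 5, tf = 2^40 + 5,
  // hif = floor(tf * 2^-32) = 256, and lof = tf - 256 * 2^32 = 5, so the
  // merged result is {lo = 5, hi = 256}, i.e. 256 * 2^32 + 5.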
2263 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2264 MachineInstrBuilder Sign;
2265 if (Signed && SrcLT == S32) {
2266     // However, a 32-bit floating point number has only a 23-bit mantissa,
2267     // which is not enough to hold all the significant bits of `lof` if val is
2268     // negative. To avoid the loss of precision, we need to take the absolute
2269     // value after truncating and flip the result back based on the original
2270     // signedness.
2271 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2272 Trunc = B.buildFAbs(S32, Trunc, Flags);
2273 }
2274 MachineInstrBuilder K0, K1;
2275 if (SrcLT == S64) {
2276 K0 = B.buildFConstant(S64,
2277 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2278 K1 = B.buildFConstant(S64,
2279 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2280 } else {
2281 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2282 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2283 }
2284
2285 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2286 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2287 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2288
2289 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2290 : B.buildFPTOUI(S32, FloorMul);
2291 auto Lo = B.buildFPTOUI(S32, Fma);
2292
2293 if (Signed && SrcLT == S32) {
2294 // Flip the result based on the signedness, which is either all 0s or 1s.
2295 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2296 // r := xor({lo, hi}, sign) - sign;
2297 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2298 Sign);
2299 } else
2300 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2301 MI.eraseFromParent();
2302
2303 return true;
2304 }
2305
2306 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2307 MachineInstr &MI) const {
2308 MachineFunction &MF = Helper.MIRBuilder.getMF();
2309 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2310
2311 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2312 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2313
2314   // With ieee_mode disabled, the instructions already have the correct
2315   // behavior for G_FMINNUM/G_FMAXNUM.
2316 if (!MFI->getMode().IEEE)
2317 return !IsIEEEOp;
2318
2319 if (IsIEEEOp)
2320 return true;
2321
2322 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2323 }
2324
2325 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2326 MachineInstr &MI, MachineRegisterInfo &MRI,
2327 MachineIRBuilder &B) const {
2328 // TODO: Should move some of this into LegalizerHelper.
2329
2330 // TODO: Promote dynamic indexing of s16 to s32
2331
2332 // FIXME: Artifact combiner probably should have replaced the truncated
2333 // constant before this, so we shouldn't need
2334 // getIConstantVRegValWithLookThrough.
2335 std::optional<ValueAndVReg> MaybeIdxVal =
2336 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2337 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2338 return true;
2339 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2340
2341 Register Dst = MI.getOperand(0).getReg();
2342 Register Vec = MI.getOperand(1).getReg();
2343
2344 LLT VecTy = MRI.getType(Vec);
2345 LLT EltTy = VecTy.getElementType();
2346 assert(EltTy == MRI.getType(Dst));
2347
2348 if (IdxVal < VecTy.getNumElements()) {
2349 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2350 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2351 } else {
2352 B.buildUndef(Dst);
2353 }
2354
2355 MI.eraseFromParent();
2356 return true;
2357 }
2358
2359 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2360 MachineInstr &MI, MachineRegisterInfo &MRI,
2361 MachineIRBuilder &B) const {
2362 // TODO: Should move some of this into LegalizerHelper.
2363
2364 // TODO: Promote dynamic indexing of s16 to s32
2365
2366 // FIXME: Artifact combiner probably should have replaced the truncated
2367 // constant before this, so we shouldn't need
2368 // getIConstantVRegValWithLookThrough.
2369 std::optional<ValueAndVReg> MaybeIdxVal =
2370 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2371 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2372 return true;
2373
2374 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2375 Register Dst = MI.getOperand(0).getReg();
2376 Register Vec = MI.getOperand(1).getReg();
2377 Register Ins = MI.getOperand(2).getReg();
2378
2379 LLT VecTy = MRI.getType(Vec);
2380 LLT EltTy = VecTy.getElementType();
2381 assert(EltTy == MRI.getType(Ins));
2382 (void)Ins;
2383
2384 unsigned NumElts = VecTy.getNumElements();
2385 if (IdxVal < NumElts) {
2386 SmallVector<Register, 8> SrcRegs;
2387 for (unsigned i = 0; i < NumElts; ++i)
2388 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2389 B.buildUnmerge(SrcRegs, Vec);
2390
2391 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2392 B.buildMergeLikeInstr(Dst, SrcRegs);
2393 } else {
2394 B.buildUndef(Dst);
2395 }
2396
2397 MI.eraseFromParent();
2398 return true;
2399 }
2400
2401 bool AMDGPULegalizerInfo::legalizeSinCos(
2402 MachineInstr &MI, MachineRegisterInfo &MRI,
2403 MachineIRBuilder &B) const {
2404
2405 Register DstReg = MI.getOperand(0).getReg();
2406 Register SrcReg = MI.getOperand(1).getReg();
2407 LLT Ty = MRI.getType(DstReg);
2408 unsigned Flags = MI.getFlags();
2409
2410 Register TrigVal;
2411 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2412 if (ST.hasTrigReducedRange()) {
2413 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2414 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2415 .addUse(MulVal.getReg(0))
2416 .setMIFlags(Flags).getReg(0);
2417 } else
2418 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2419
2420 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2421 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2422 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
2423 .addUse(TrigVal)
2424 .setMIFlags(Flags);
2425 MI.eraseFromParent();
2426 return true;
2427 }
2428
2429 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2430 MachineIRBuilder &B,
2431 const GlobalValue *GV,
2432 int64_t Offset,
2433 unsigned GAFlags) const {
2434 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2435 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2436 // to the following code sequence:
2437 //
2438 // For constant address space:
2439 // s_getpc_b64 s[0:1]
2440 // s_add_u32 s0, s0, $symbol
2441 // s_addc_u32 s1, s1, 0
2442 //
2443 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2444 // a fixup or relocation is emitted to replace $symbol with a literal
2445 // constant, which is a pc-relative offset from the encoding of the $symbol
2446 // operand to the global variable.
2447 //
2448 // For global address space:
2449 // s_getpc_b64 s[0:1]
2450 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2451 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2452 //
2453 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2454 // fixups or relocations are emitted to replace $symbol@*@lo and
2455 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2456 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2457 // operand to the global variable.
2458 //
2459 // What we want here is an offset from the value returned by s_getpc
2460 // (which is the address of the s_add_u32 instruction) to the global
2461 // variable, but since the encoding of $symbol starts 4 bytes after the start
2462 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2463 // small. This requires us to add 4 to the global variable offset in order to
2464 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2465 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2466 // instruction.
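  // Concretely, the addends below become Offset + 4 for the lo fixup and
  // Offset + 12 for the hi fixup, matching those 4- and 12-byte distances.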
2467
2468 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2469
2470 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2471 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2472
2473 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2474 .addDef(PCReg);
2475
2476 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2477 if (GAFlags == SIInstrInfo::MO_NONE)
2478 MIB.addImm(0);
2479 else
2480 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2481
2482 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2483
2484 if (PtrTy.getSizeInBits() == 32)
2485 B.buildExtract(DstReg, PCReg, 0);
2486 return true;
2487 }
2488
2489 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2490 MachineInstr &MI, MachineRegisterInfo &MRI,
2491 MachineIRBuilder &B) const {
2492 Register DstReg = MI.getOperand(0).getReg();
2493 LLT Ty = MRI.getType(DstReg);
2494 unsigned AS = Ty.getAddressSpace();
2495
2496 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2497 MachineFunction &MF = B.getMF();
2498 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2499
2500 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2501 if (!MFI->isModuleEntryFunction() &&
2502 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2503 const Function &Fn = MF.getFunction();
2504 DiagnosticInfoUnsupported BadLDSDecl(
2505 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2506 DS_Warning);
2507 Fn.getContext().diagnose(BadLDSDecl);
2508
2509 // We currently don't have a way to correctly allocate LDS objects that
2510 // aren't directly associated with a kernel. We do force inlining of
2511 // functions that use local objects. However, if these dead functions are
2512 // not eliminated, we don't want a compile time error. Just emit a warning
2513 // and a trap, since there should be no callable path here.
2514 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2515 B.buildUndef(DstReg);
2516 MI.eraseFromParent();
2517 return true;
2518 }
2519
2520 // TODO: We could emit code to handle the initialization somewhere.
2521 // We ignore the initializer for now and legalize it to allow selection.
2522     // The initializer will be rejected during assembly emission anyway.
2523 const SITargetLowering *TLI = ST.getTargetLowering();
2524 if (!TLI->shouldUseLDSConstAddress(GV)) {
2525 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2526 return true; // Leave in place;
2527 }
2528
2529 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2530 Type *Ty = GV->getValueType();
2531       // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
2532       // zero-sized type in other languages) to declare dynamic shared memory
2533       // whose size is not known at compile time. Such variables are allocated
2534       // by the runtime and placed directly after the statically allocated
2535       // ones, so they all share the same offset.
2536 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2537 // Adjust alignment for that dynamic shared memory array.
2538 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2539 LLT S32 = LLT::scalar(32);
2540 auto Sz =
2541 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2542 B.buildIntToPtr(DstReg, Sz);
2543 MI.eraseFromParent();
2544 return true;
2545 }
2546 }
2547
2548 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2549 *cast<GlobalVariable>(GV)));
2550 MI.eraseFromParent();
2551 return true;
2552 }
2553
2554 const SITargetLowering *TLI = ST.getTargetLowering();
2555
2556 if (TLI->shouldEmitFixup(GV)) {
2557 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2558 MI.eraseFromParent();
2559 return true;
2560 }
2561
2562 if (TLI->shouldEmitPCReloc(GV)) {
2563 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2564 MI.eraseFromParent();
2565 return true;
2566 }
2567
2568 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2569 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2570
2571 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2572 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2573 MachinePointerInfo::getGOT(MF),
2574 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2575 MachineMemOperand::MOInvariant,
2576 LoadTy, Align(8));
2577
2578 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2579
2580 if (Ty.getSizeInBits() == 32) {
2581 // Truncate if this is a 32-bit constant address.
2582 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2583 B.buildExtract(DstReg, Load, 0);
2584 } else
2585 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2586
2587 MI.eraseFromParent();
2588 return true;
2589 }
2590
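// e.g. s96 -> s128, <3 x s16> -> <4 x s16>, <3 x s32> -> <4 x s32>.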
2591 static LLT widenToNextPowerOf2(LLT Ty) {
2592 if (Ty.isVector())
2593 return Ty.changeElementCount(
2594 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2595 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2596 }
2597
2598 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2599 MachineInstr &MI) const {
2600 MachineIRBuilder &B = Helper.MIRBuilder;
2601 MachineRegisterInfo &MRI = *B.getMRI();
2602 GISelChangeObserver &Observer = Helper.Observer;
2603
2604 Register PtrReg = MI.getOperand(1).getReg();
2605 LLT PtrTy = MRI.getType(PtrReg);
2606 unsigned AddrSpace = PtrTy.getAddressSpace();
2607
2608 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2609 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2610 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2611 Observer.changingInstr(MI);
2612 MI.getOperand(1).setReg(Cast.getReg(0));
2613 Observer.changedInstr(MI);
2614 return true;
2615 }
2616
2617 if (MI.getOpcode() != AMDGPU::G_LOAD)
2618 return false;
2619
2620 Register ValReg = MI.getOperand(0).getReg();
2621 LLT ValTy = MRI.getType(ValReg);
2622
2623 MachineMemOperand *MMO = *MI.memoperands_begin();
2624 const unsigned ValSize = ValTy.getSizeInBits();
2625 const LLT MemTy = MMO->getMemoryType();
2626 const Align MemAlign = MMO->getAlign();
2627 const unsigned MemSize = MemTy.getSizeInBits();
2628 const uint64_t AlignInBits = 8 * MemAlign.value();
2629
2630 // Widen non-power-of-2 loads to the alignment if needed
2631 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2632 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2633
2634 // This was already the correct extending load result type, so just adjust
2635 // the memory type.
2636 if (WideMemSize == ValSize) {
2637 MachineFunction &MF = B.getMF();
2638
2639 MachineMemOperand *WideMMO =
2640 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2641 Observer.changingInstr(MI);
2642 MI.setMemRefs(MF, {WideMMO});
2643 Observer.changedInstr(MI);
2644 return true;
2645 }
2646
2647     // Don't bother handling an edge case that should probably never be produced.
2648 if (ValSize > WideMemSize)
2649 return false;
2650
2651 LLT WideTy = widenToNextPowerOf2(ValTy);
2652
2653 Register WideLoad;
2654 if (!WideTy.isVector()) {
2655 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2656 B.buildTrunc(ValReg, WideLoad).getReg(0);
2657 } else {
2658 // Extract the subvector.
2659
2660 if (isRegisterType(ValTy)) {
2661 // If this a case where G_EXTRACT is legal, use it.
2662 // (e.g. <3 x s32> -> <4 x s32>)
2663 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2664 B.buildExtract(ValReg, WideLoad, 0);
2665 } else {
2666 // For cases where the widened type isn't a nice register value, unmerge
2667 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2668 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2669 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2670 }
2671 }
2672
2673 MI.eraseFromParent();
2674 return true;
2675 }
2676
2677 return false;
2678 }
2679
2680 bool AMDGPULegalizerInfo::legalizeFMad(
2681 MachineInstr &MI, MachineRegisterInfo &MRI,
2682 MachineIRBuilder &B) const {
2683 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2684 assert(Ty.isScalar());
2685
2686 MachineFunction &MF = B.getMF();
2687 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2688
2689 // TODO: Always legal with future ftz flag.
2690 // FIXME: Do we need just output?
2691 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2692 return true;
2693 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2694 return true;
2695
2696 MachineIRBuilder HelperBuilder(MI);
2697 GISelObserverWrapper DummyObserver;
2698 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2699 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2700 }
2701
2702 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2703 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2704 Register DstReg = MI.getOperand(0).getReg();
2705 Register PtrReg = MI.getOperand(1).getReg();
2706 Register CmpVal = MI.getOperand(2).getReg();
2707 Register NewVal = MI.getOperand(3).getReg();
2708
2709 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2710 "this should not have been custom lowered");
2711
2712 LLT ValTy = MRI.getType(CmpVal);
2713 LLT VecTy = LLT::fixed_vector(2, ValTy);
2714
2715 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2716
2717 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2718 .addDef(DstReg)
2719 .addUse(PtrReg)
2720 .addUse(PackedVal)
2721 .setMemRefs(MI.memoperands());
2722
2723 MI.eraseFromParent();
2724 return true;
2725 }
2726
2727 bool AMDGPULegalizerInfo::legalizeFlog(
2728 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2729 Register Dst = MI.getOperand(0).getReg();
2730 Register Src = MI.getOperand(1).getReg();
2731 LLT Ty = B.getMRI()->getType(Dst);
2732 unsigned Flags = MI.getFlags();
2733
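  // log_b(x) == log2(x) * log_b(2); the caller supplies log_b(2) as
  // Log2BaseInverted (ln2 for G_FLOG, ln2/ln10 for G_FLOG10).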
2734 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2735 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2736
2737 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2738 MI.eraseFromParent();
2739 return true;
2740 }
2741
2742 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2743 MachineIRBuilder &B) const {
2744 Register Dst = MI.getOperand(0).getReg();
2745 Register Src = MI.getOperand(1).getReg();
2746 unsigned Flags = MI.getFlags();
2747 LLT Ty = B.getMRI()->getType(Dst);
2748
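  // exp(x) == exp2(x * log2(e)).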
2749 auto K = B.buildFConstant(Ty, numbers::log2e);
2750 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2751 B.buildFExp2(Dst, Mul, Flags);
2752 MI.eraseFromParent();
2753 return true;
2754 }
2755
2756 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2757 MachineIRBuilder &B) const {
2758 Register Dst = MI.getOperand(0).getReg();
2759 Register Src0 = MI.getOperand(1).getReg();
2760 Register Src1 = MI.getOperand(2).getReg();
2761 unsigned Flags = MI.getFlags();
2762 LLT Ty = B.getMRI()->getType(Dst);
2763 const LLT S16 = LLT::scalar(16);
2764 const LLT S32 = LLT::scalar(32);
2765
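  // pow(x, y) == exp2(y * log2(x)). The multiply uses fmul_legacy, where
  // 0 * anything == 0, which presumably covers the pow(0, y) edge cases.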
2766 if (Ty == S32) {
2767 auto Log = B.buildFLog2(S32, Src0, Flags);
2768 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2769 .addUse(Log.getReg(0))
2770 .addUse(Src1)
2771 .setMIFlags(Flags);
2772 B.buildFExp2(Dst, Mul, Flags);
2773 } else if (Ty == S16) {
2774 // There's no f16 fmul_legacy, so we need to convert for it.
2775 auto Log = B.buildFLog2(S16, Src0, Flags);
2776 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2777 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2778 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2779 .addUse(Ext0.getReg(0))
2780 .addUse(Ext1.getReg(0))
2781 .setMIFlags(Flags);
2782
2783 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2784 } else
2785 return false;
2786
2787 MI.eraseFromParent();
2788 return true;
2789 }
2790
2791 // Find a source register, ignoring any possible source modifiers.
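// e.g. given %m = G_FNEG (G_FABS %x), this returns %x.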
2792 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2793 Register ModSrc = OrigSrc;
2794 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2795 ModSrc = SrcFNeg->getOperand(1).getReg();
2796 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2797 ModSrc = SrcFAbs->getOperand(1).getReg();
2798 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2799 ModSrc = SrcFAbs->getOperand(1).getReg();
2800 return ModSrc;
2801 }
2802
2803 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2804 MachineRegisterInfo &MRI,
2805 MachineIRBuilder &B) const {
2806
2807 const LLT S1 = LLT::scalar(1);
2808 const LLT S64 = LLT::scalar(64);
2809 Register Dst = MI.getOperand(0).getReg();
2810 Register OrigSrc = MI.getOperand(1).getReg();
2811 unsigned Flags = MI.getFlags();
2812 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2813 "this should not have been custom lowered");
2814
2815 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2816 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2817 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2818 // V_FRACT bug is:
2819 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2820 //
2821 // Convert floor(x) to (x - fract(x))
2822
2823 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2824 .addUse(OrigSrc)
2825 .setMIFlags(Flags);
2826
2827 // Give source modifier matching some assistance before obscuring a foldable
2828 // pattern.
2829
2830 // TODO: We can avoid the neg on the fract? The input sign to fract
2831 // shouldn't matter?
2832 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2833
2834 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2835
2836 Register Min = MRI.createGenericVirtualRegister(S64);
2837
2838 // We don't need to concern ourselves with the snan handling difference, so
2839 // use the one which will directly select.
2840 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2841 if (MFI->getMode().IEEE)
2842 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2843 else
2844 B.buildFMinNum(Min, Fract, Const, Flags);
2845
2846 Register CorrectedFract = Min;
2847 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2848 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2849 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2850 }
2851
2852 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2853 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2854
2855 MI.eraseFromParent();
2856 return true;
2857 }
2858
2859 // Turn an illegal packed v2s16 build vector into bit operations.
2860 // TODO: This should probably be a bitcast action in LegalizerHelper.
2861 bool AMDGPULegalizerInfo::legalizeBuildVector(
2862 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2863 Register Dst = MI.getOperand(0).getReg();
2864 const LLT S32 = LLT::scalar(32);
2865 const LLT S16 = LLT::scalar(16);
2866 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2867
2868 Register Src0 = MI.getOperand(1).getReg();
2869 Register Src1 = MI.getOperand(2).getReg();
2870
2871 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
2872 assert(MRI.getType(Src0) == S32);
2873 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
2874 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
2875 }
2876
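  // Pack the two halves into an s32 (Src0 in bits [15:0], Src1 in bits
  // [31:16]) and reinterpret the result as <2 x s16>.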
2877 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
2878 B.buildBitcast(Dst, Merge);
2879
2880 MI.eraseFromParent();
2881 return true;
2882 }
2883
2884 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2885 //
2886 // Source and accumulation registers must all be 32-bits.
2887 //
2888 // TODO: When the multiply is uniform, we should produce a code sequence
2889 // that is better suited to instruction selection on the SALU. Instead of
2890 // the outer loop going over parts of the result, the outer loop should go
2891 // over parts of one of the factors. This should result in instruction
2892 // selection that makes full use of S_ADDC_U32 instructions.
2893 void AMDGPULegalizerInfo::buildMultiply(
2894 LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
2895 ArrayRef<Register> Src0, ArrayRef<Register> Src1,
2896 bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2897 // Use (possibly empty) vectors of S1 registers to represent the set of
2898 // carries from one pair of positions to the next.
2899 using Carry = SmallVector<Register, 2>;
2900
2901 MachineIRBuilder &B = Helper.MIRBuilder;
2902
2903 const LLT S1 = LLT::scalar(1);
2904 const LLT S32 = LLT::scalar(32);
2905 const LLT S64 = LLT::scalar(64);
2906
2907 Register Zero32;
2908 Register Zero64;
2909
2910 auto getZero32 = [&]() -> Register {
2911 if (!Zero32)
2912 Zero32 = B.buildConstant(S32, 0).getReg(0);
2913 return Zero32;
2914 };
2915 auto getZero64 = [&]() -> Register {
2916 if (!Zero64)
2917 Zero64 = B.buildConstant(S64, 0).getReg(0);
2918 return Zero64;
2919 };
2920
2921 // Merge the given carries into the 32-bit LocalAccum, which is modified
2922 // in-place.
2923 //
2924 // Returns the carry-out, which is a single S1 register or null.
2925 auto mergeCarry =
2926 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2927 if (CarryIn.empty())
2928 return Register();
2929
2930 bool HaveCarryOut = true;
2931 Register CarryAccum;
2932 if (CarryIn.size() == 1) {
2933 if (!LocalAccum) {
2934 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2935 return Register();
2936 }
2937
2938 CarryAccum = getZero32();
2939 } else {
2940 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2941 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2942 CarryAccum =
2943 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2944 .getReg(0);
2945 }
2946
2947 if (!LocalAccum) {
2948 LocalAccum = getZero32();
2949 HaveCarryOut = false;
2950 }
2951 }
2952
2953 auto Add =
2954 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2955 LocalAccum = Add.getReg(0);
2956 return HaveCarryOut ? Add.getReg(1) : Register();
2957 };
2958
2959 // Build a multiply-add chain to compute
2960 //
2961 // LocalAccum + (partial products at DstIndex)
2962 // + (opportunistic subset of CarryIn)
2963 //
2964 // LocalAccum is an array of one or two 32-bit registers that are updated
2965 // in-place. The incoming registers may be null.
2966 //
2967 // In some edge cases, carry-ins can be consumed "for free". In that case,
2968 // the consumed carry bits are removed from CarryIn in-place.
2969 auto buildMadChain =
2970 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2971 -> Carry {
2972 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2973 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2974
2975 Carry CarryOut;
2976 unsigned j0 = 0;
2977
2978 // Use plain 32-bit multiplication for the most significant part of the
2979 // result by default.
2980 if (LocalAccum.size() == 1 &&
2981 (!UsePartialMad64_32 || !CarryIn.empty())) {
2982 do {
2983 unsigned j1 = DstIndex - j0;
2984 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
2985 if (!LocalAccum[0]) {
2986 LocalAccum[0] = Mul.getReg(0);
2987 } else {
2988 if (CarryIn.empty()) {
2989 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
2990 } else {
2991 LocalAccum[0] =
2992 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
2993 .getReg(0);
2994 CarryIn.pop_back();
2995 }
2996 }
2997 ++j0;
2998 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
2999 }
3000
3001 // Build full 64-bit multiplies.
3002 if (j0 <= DstIndex) {
3003 bool HaveSmallAccum = false;
3004 Register Tmp;
3005
3006 if (LocalAccum[0]) {
3007 if (LocalAccum.size() == 1) {
3008 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3009 HaveSmallAccum = true;
3010 } else if (LocalAccum[1]) {
3011 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3012 HaveSmallAccum = false;
3013 } else {
3014 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3015 HaveSmallAccum = true;
3016 }
3017 } else {
3018 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3019 Tmp = getZero64();
3020 HaveSmallAccum = true;
3021 }
3022
3023 do {
3024 unsigned j1 = DstIndex - j0;
3025 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3026 {Src0[j0], Src1[j1], Tmp});
3027 Tmp = Mad.getReg(0);
3028 if (!HaveSmallAccum)
3029 CarryOut.push_back(Mad.getReg(1));
3030 HaveSmallAccum = false;
3031 ++j0;
3032 } while (j0 <= DstIndex);
3033
3034 auto Unmerge = B.buildUnmerge(S32, Tmp);
3035 LocalAccum[0] = Unmerge.getReg(0);
3036 if (LocalAccum.size() > 1)
3037 LocalAccum[1] = Unmerge.getReg(1);
3038 }
3039
3040 return CarryOut;
3041 };
3042
3043 // Outer multiply loop, iterating over destination parts from least
3044 // significant to most significant parts.
3045 //
3046 // The columns of the following diagram correspond to the destination parts
3047 // affected by one iteration of the outer loop (ignoring boundary
3048 // conditions).
3049 //
3050 // Dest index relative to 2 * i: 1 0 -1
3051 // ------
3052 // Carries from previous iteration: e o
3053 // Even-aligned partial product sum: E E .
3054 // Odd-aligned partial product sum: O O
3055 //
3056 // 'o' is OddCarry, 'e' is EvenCarry.
3057 // EE and OO are computed from partial products via buildMadChain and use
3058 // accumulation where possible and appropriate.
3059 //
3060 Register SeparateOddCarry;
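  // Rough example: for a 128-bit result (Accum.size() == 4) the loop below
  // runs for i = 0..2. i == 0 computes the DstIndex-0 product into
  // Accum[1:0]; i == 1 adds the odd-aligned products (DstIndex 1) and the
  // even-aligned products (DstIndex 2); i == 2 handles DstIndex 3 and folds
  // in the remaining carries.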
3061 Carry EvenCarry;
3062 Carry OddCarry;
3063
3064 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3065 Carry OddCarryIn = std::move(OddCarry);
3066 Carry EvenCarryIn = std::move(EvenCarry);
3067 OddCarry.clear();
3068 EvenCarry.clear();
3069
3070 // Partial products at offset 2 * i.
3071 if (2 * i < Accum.size()) {
3072 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3073 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3074 }
3075
3076 // Partial products at offset 2 * i - 1.
3077 if (i > 0) {
3078 if (!SeparateOddAlignedProducts) {
3079 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3080 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3081 } else {
3082 bool IsHighest = 2 * i >= Accum.size();
3083 Register SeparateOddOut[2];
3084 auto LocalAccum = MutableArrayRef(SeparateOddOut)
3085 .take_front(IsHighest ? 1 : 2);
3086 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3087
3088 MachineInstr *Lo;
3089
3090 if (i == 1) {
3091 if (!IsHighest)
3092 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3093 else
3094 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3095 } else {
3096 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3097 SeparateOddCarry);
3098 }
3099 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3100
3101 if (!IsHighest) {
3102 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3103 Lo->getOperand(1).getReg());
3104 Accum[2 * i] = Hi.getReg(0);
3105 SeparateOddCarry = Hi.getReg(1);
3106 }
3107 }
3108 }
3109
3110 // Add in the carries from the previous iteration
3111 if (i > 0) {
3112 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3113 EvenCarryIn.push_back(CarryOut);
3114
3115 if (2 * i < Accum.size()) {
3116 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3117 OddCarry.push_back(CarryOut);
3118 }
3119 }
3120 }
3121 }
3122
3123 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3124 //
3125 // TODO: If the multiply is followed by an addition, we should attempt to
3126 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3127 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3128 MachineInstr &MI) const {
3129 assert(ST.hasMad64_32());
3130 assert(MI.getOpcode() == TargetOpcode::G_MUL);
3131
3132 MachineIRBuilder &B = Helper.MIRBuilder;
3133 MachineRegisterInfo &MRI = *B.getMRI();
3134
3135 Register DstReg = MI.getOperand(0).getReg();
3136 Register Src0 = MI.getOperand(1).getReg();
3137 Register Src1 = MI.getOperand(2).getReg();
3138
3139 LLT Ty = MRI.getType(DstReg);
3140 assert(Ty.isScalar());
3141
3142 unsigned Size = Ty.getSizeInBits();
3143 unsigned NumParts = Size / 32;
3144 assert((Size % 32) == 0);
3145 assert(NumParts >= 2);
3146
3147 // Whether to use MAD_64_32 for partial products whose high half is
3148 // discarded. This avoids some ADD instructions but risks false dependency
3149 // stalls on some subtargets in some cases.
3150 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3151
3152 // Whether to compute odd-aligned partial products separately. This is
3153 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3154 // in an even-aligned VGPR.
3155 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3156
3157 LLT S32 = LLT::scalar(32);
3158 SmallVector<Register, 2> Src0Parts, Src1Parts;
3159 for (unsigned i = 0; i < NumParts; ++i) {
3160 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3161 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3162 }
3163 B.buildUnmerge(Src0Parts, Src0);
3164 B.buildUnmerge(Src1Parts, Src1);
3165
3166 SmallVector<Register, 2> AccumRegs(NumParts);
3167 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3168 SeparateOddAlignedProducts);
3169
3170 B.buildMergeLikeInstr(DstReg, AccumRegs);
3171 MI.eraseFromParent();
3172 return true;
3174 }
3175
3176 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3177 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3178 // case with a single min instruction instead of a compare+select.
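// For example (illustrative), an s32 G_CTLZ %x becomes
//   %t = G_AMDGPU_FFBH_U32 %x
//   %dst = G_UMIN %t, 32
// The hardware ffbh/ffbl result is -1 (all ones) for a zero input, so the min
// clamps it to the bit width, which is the defined ctlz/cttz result for zero.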
3179 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3180 MachineRegisterInfo &MRI,
3181 MachineIRBuilder &B) const {
3182 Register Dst = MI.getOperand(0).getReg();
3183 Register Src = MI.getOperand(1).getReg();
3184 LLT DstTy = MRI.getType(Dst);
3185 LLT SrcTy = MRI.getType(Src);
3186
3187 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3188 ? AMDGPU::G_AMDGPU_FFBH_U32
3189 : AMDGPU::G_AMDGPU_FFBL_B32;
3190 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3191 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3192
3193 MI.eraseFromParent();
3194 return true;
3195 }
3196
3197 // Check that this is a G_XOR x, -1
3198 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3199 if (MI.getOpcode() != TargetOpcode::G_XOR)
3200 return false;
3201 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3202 return ConstVal && *ConstVal == -1;
3203 }
3204
3205 // Return the use branch instruction, or null if the usage is invalid.
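//
// Expected shape (rough sketch, register and block names are illustrative):
//   %cond:_(s1), ... = G_INTRINSIC intrinsic(@llvm.amdgcn.if), ...
//   %ncond:_(s1) = G_XOR %cond, -1   ; optional, sets Negated
//   G_BRCOND %cond, %bb.inner        ; or %ncond
//   G_BR %bb.flow                    ; or fall through to the next block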
3206 static MachineInstr *
3207 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3208 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3209 Register CondDef = MI.getOperand(0).getReg();
3210 if (!MRI.hasOneNonDBGUse(CondDef))
3211 return nullptr;
3212
3213 MachineBasicBlock *Parent = MI.getParent();
3214 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3215
3216 if (isNot(MRI, *UseMI)) {
3217 Register NegatedCond = UseMI->getOperand(0).getReg();
3218 if (!MRI.hasOneNonDBGUse(NegatedCond))
3219 return nullptr;
3220
3221 // We're deleting the def of this value, so we need to remove it.
3222 eraseInstr(*UseMI, MRI);
3223
3224 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3225 Negated = true;
3226 }
3227
3228 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3229 return nullptr;
3230
3231 // Make sure the cond br is followed by a G_BR, or is the last instruction.
3232 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3233 if (Next == Parent->end()) {
3234 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3235 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3236 return nullptr;
3237 UncondBrTarget = &*NextMBB;
3238 } else {
3239 if (Next->getOpcode() != AMDGPU::G_BR)
3240 return nullptr;
3241 Br = &*Next;
3242 UncondBrTarget = Br->getOperand(0).getMBB();
3243 }
3244
3245 return UseMI;
3246 }
3247
3248 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3249 const ArgDescriptor *Arg,
3250 const TargetRegisterClass *ArgRC,
3251 LLT ArgTy) const {
3252 MCRegister SrcReg = Arg->getRegister();
3253 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3254 assert(DstReg.isVirtual() && "Virtual register expected");
3255
3256 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3257 *ArgRC, B.getDebugLoc(), ArgTy);
3258 if (Arg->isMasked()) {
3259 // TODO: Should we try to emit this once in the entry block?
3260 const LLT S32 = LLT::scalar(32);
3261 const unsigned Mask = Arg->getMask();
3262 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
3263
3264 Register AndMaskSrc = LiveIn;
3265
3266 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3267 // 0.
3268 if (Shift != 0) {
3269 auto ShiftAmt = B.buildConstant(S32, Shift);
3270 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3271 }
3272
3273 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3274 } else {
3275 B.buildCopy(DstReg, LiveIn);
3276 }
3277
3278 return true;
3279 }
3280
3281 bool AMDGPULegalizerInfo::loadInputValue(
3282 Register DstReg, MachineIRBuilder &B,
3283 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3284 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3285 const ArgDescriptor *Arg;
3286 const TargetRegisterClass *ArgRC;
3287 LLT ArgTy;
3288 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3289
3290 if (!Arg) {
3291 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3292 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3293 // case the pointer argument may be missing and we use null.
3294 B.buildConstant(DstReg, 0);
3295 return true;
3296 }
3297
3298 // It's undefined behavior if a function marked with the amdgpu-no-*
3299 // attributes uses the corresponding intrinsic.
3300 B.buildUndef(DstReg);
3301 return true;
3302 }
3303
3304 if (!Arg->isRegister() || !Arg->getRegister().isValid())
3305 return false; // TODO: Handle these
3306 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3307 }
3308
3309 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3310 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3311 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3312 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3313 return false;
3314
3315 MI.eraseFromParent();
3316 return true;
3317 }
3318
3319 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3320 int64_t C) {
3321 B.buildConstant(MI.getOperand(0).getReg(), C);
3322 MI.eraseFromParent();
3323 return true;
3324 }
3325
3326 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3327 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3328 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3329 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3330 if (MaxID == 0)
3331 return replaceWithConstant(B, MI, 0);
3332
3333 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3334 const ArgDescriptor *Arg;
3335 const TargetRegisterClass *ArgRC;
3336 LLT ArgTy;
3337 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3338
3339 Register DstReg = MI.getOperand(0).getReg();
3340 if (!Arg) {
3341 // It's undefined behavior if a function marked with the amdgpu-no-*
3342 // attributes uses the corresponding intrinsic.
3343 B.buildUndef(DstReg);
3344 MI.eraseFromParent();
3345 return true;
3346 }
3347
3348 if (Arg->isMasked()) {
3349 // Don't bother inserting AssertZext for packed IDs since we're emitting the
3350 // masking operations anyway.
3351 //
3352 // TODO: We could assert the top bit is 0 for the source copy.
3353 if (!loadInputValue(DstReg, B, ArgType))
3354 return false;
3355 } else {
3356 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3357 if (!loadInputValue(TmpReg, B, ArgType))
3358 return false;
3359 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
3360 }
3361
3362 MI.eraseFromParent();
3363 return true;
3364 }
3365
3366 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3367 int64_t Offset) const {
3368 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3369 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3370
3371 // TODO: If we passed in the base kernel offset we could have a better
3372 // alignment than 4, but we don't really need it.
3373 if (!loadInputValue(KernArgReg, B,
3374 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3375 llvm_unreachable("failed to find kernarg segment ptr");
3376
3377 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3378 // TODO: Should get nuw
3379 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3380 }
3381
3382 /// Legalize a value that's loaded from kernel arguments. This is only used by
3383 /// legacy intrinsics.
3384 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3385 MachineIRBuilder &B,
3386 uint64_t Offset,
3387 Align Alignment) const {
3388 Register DstReg = MI.getOperand(0).getReg();
3389
3390 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3391 "unexpected kernarg parameter type");
3392
3393 Register Ptr = getKernargParameterPtr(B, Offset);
3394 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3395 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3396 MachineMemOperand::MODereferenceable |
3397 MachineMemOperand::MOInvariant);
3398 MI.eraseFromParent();
3399 return true;
3400 }
3401
3402 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3403 MachineRegisterInfo &MRI,
3404 MachineIRBuilder &B) const {
3405 Register Dst = MI.getOperand(0).getReg();
3406 LLT DstTy = MRI.getType(Dst);
3407 LLT S16 = LLT::scalar(16);
3408 LLT S32 = LLT::scalar(32);
3409 LLT S64 = LLT::scalar(64);
3410
3411 if (DstTy == S16)
3412 return legalizeFDIV16(MI, MRI, B);
3413 if (DstTy == S32)
3414 return legalizeFDIV32(MI, MRI, B);
3415 if (DstTy == S64)
3416 return legalizeFDIV64(MI, MRI, B);
3417
3418 return false;
3419 }
3420
3421 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3422 Register DstDivReg,
3423 Register DstRemReg,
3424 Register X,
3425 Register Y) const {
3426 const LLT S1 = LLT::scalar(1);
3427 const LLT S32 = LLT::scalar(32);
3428
3429 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3430 // algorithm used here.
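  //
  // Rough outline (for reference, not a spec): z ~= 2^32 / y is formed from a
  // float reciprocal, refined with one Newton-Raphson step
  // (z += umulh(z, -y * z)), then q = umulh(x, z) and r = x - q * y, followed
  // by up to two conditional corrections (if r >= y then q += 1, r -= y).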
3431
3432 // Initial estimate of inv(y).
3433 auto FloatY = B.buildUITOFP(S32, Y);
3434 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3435 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3436 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3437 auto Z = B.buildFPTOUI(S32, ScaledY);
3438
3439 // One round of UNR.
3440 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3441 auto NegYZ = B.buildMul(S32, NegY, Z);
3442 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3443
3444 // Quotient/remainder estimate.
3445 auto Q = B.buildUMulH(S32, X, Z);
3446 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3447
3448 // First quotient/remainder refinement.
3449 auto One = B.buildConstant(S32, 1);
3450 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3451 if (DstDivReg)
3452 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3453 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3454
3455 // Second quotient/remainder refinement.
3456 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3457 if (DstDivReg)
3458 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3459
3460 if (DstRemReg)
3461 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3462 }
3463
3464 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3465 //
3466 // Return lo, hi of result
3467 //
3468 // %cvt.lo = G_UITOFP Val.lo
3469 // %cvt.hi = G_UITOFP Val.hi
3470 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3471 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3472 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3473 // %mul2 = G_FMUL %mul1, 2**(-32)
3474 // %trunc = G_INTRINSIC_TRUNC %mul2
3475 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3476 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
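//
// In effect (informally), the returned pair approximates 2**64 / Val split
// into 32-bit halves; callers refine this initial estimate with
// Newton-Raphson steps.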
3477 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3478 Register Val) {
3479 const LLT S32 = LLT::scalar(32);
3480 auto Unmerge = B.buildUnmerge(S32, Val);
3481
3482 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3483 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3484
3485 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3486 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3487
3488 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3489 auto Mul1 =
3490 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3491
3492 // 2**(-32)
3493 auto Mul2 =
3494 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3495 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3496
3497 // -(2**32)
3498 auto Mad2 = B.buildFMAD(S32, Trunc,
3499 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3500
3501 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3502 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3503
3504 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3505 }
3506
3507 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3508 Register DstDivReg,
3509 Register DstRemReg,
3510 Register Numer,
3511 Register Denom) const {
3512 const LLT S32 = LLT::scalar(32);
3513 const LLT S64 = LLT::scalar(64);
3514 const LLT S1 = LLT::scalar(1);
3515 Register RcpLo, RcpHi;
3516
3517 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3518
3519 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
3520
3521 auto Zero64 = B.buildConstant(S64, 0);
3522 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3523
3524 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3525 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3526
3527 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3528 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3529 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3530
3531 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3532 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3533 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
3534
3535 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3536 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3537 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3538 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3539 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3540
3541 auto Zero32 = B.buildConstant(S32, 0);
3542 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3543 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3544 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
3545
3546 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3547 Register NumerLo = UnmergeNumer.getReg(0);
3548 Register NumerHi = UnmergeNumer.getReg(1);
3549
3550 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3551 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3552 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3553 Register Mul3_Lo = UnmergeMul3.getReg(0);
3554 Register Mul3_Hi = UnmergeMul3.getReg(1);
3555 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3556 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3557 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3558 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
3559
3560 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3561 Register DenomLo = UnmergeDenom.getReg(0);
3562 Register DenomHi = UnmergeDenom.getReg(1);
3563
3564 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3565 auto C1 = B.buildSExt(S32, CmpHi);
3566
3567 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3568 auto C2 = B.buildSExt(S32, CmpLo);
3569
3570 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3571 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3572
3573 // TODO: Here and below, portions of the code could be enclosed in if/endif
3574 // blocks. Currently the control flow is unconditional, and we need four
3575 // selects after the potential endif to substitute for PHIs.
3576
3577 // if C3 != 0 ...
3578 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3579 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3580 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3581 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
3582
3583 auto One64 = B.buildConstant(S64, 1);
3584 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3585
3586 auto C4 =
3587 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3588 auto C5 =
3589 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3590 auto C6 = B.buildSelect(
3591 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3592
3593 // if (C6 != 0)
3594 auto Add4 = B.buildAdd(S64, Add3, One64);
3595 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3596
3597 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3598 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3599 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
3600
3601 // endif C6
3602 // endif C3
3603
3604 if (DstDivReg) {
3605 auto Sel1 = B.buildSelect(
3606 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3607 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3608 Sel1, MulHi3);
3609 }
3610
3611 if (DstRemReg) {
3612 auto Sel2 = B.buildSelect(
3613 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3614 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3615 Sel2, Sub1);
3616 }
3617 }
3618
3619 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3620 MachineRegisterInfo &MRI,
3621 MachineIRBuilder &B) const {
3622 Register DstDivReg, DstRemReg;
3623 switch (MI.getOpcode()) {
3624 default:
3625 llvm_unreachable("Unexpected opcode!");
3626 case AMDGPU::G_UDIV: {
3627 DstDivReg = MI.getOperand(0).getReg();
3628 break;
3629 }
3630 case AMDGPU::G_UREM: {
3631 DstRemReg = MI.getOperand(0).getReg();
3632 break;
3633 }
3634 case AMDGPU::G_UDIVREM: {
3635 DstDivReg = MI.getOperand(0).getReg();
3636 DstRemReg = MI.getOperand(1).getReg();
3637 break;
3638 }
3639 }
3640
3641 const LLT S64 = LLT::scalar(64);
3642 const LLT S32 = LLT::scalar(32);
3643 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3644 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3645 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3646 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3647
3648 if (Ty == S32)
3649 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3650 else if (Ty == S64)
3651 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3652 else
3653 return false;
3654
3655 MI.eraseFromParent();
3656 return true;
3657 }
3658
3659 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3660 MachineRegisterInfo &MRI,
3661 MachineIRBuilder &B) const {
3662 const LLT S64 = LLT::scalar(64);
3663 const LLT S32 = LLT::scalar(32);
3664
3665 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3666 if (Ty != S32 && Ty != S64)
3667 return false;
3668
3669 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3670 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3671 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3672
3673 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3674 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3675 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3676
3677 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3678 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3679
3680 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3681 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3682
3683 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3684 switch (MI.getOpcode()) {
3685 default:
3686 llvm_unreachable("Unexpected opcode!");
3687 case AMDGPU::G_SDIV: {
3688 DstDivReg = MI.getOperand(0).getReg();
3689 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3690 break;
3691 }
3692 case AMDGPU::G_SREM: {
3693 DstRemReg = MI.getOperand(0).getReg();
3694 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3695 break;
3696 }
3697 case AMDGPU::G_SDIVREM: {
3698 DstDivReg = MI.getOperand(0).getReg();
3699 DstRemReg = MI.getOperand(1).getReg();
3700 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3701 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3702 break;
3703 }
3704 }
3705
3706 if (Ty == S32)
3707 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3708 else
3709 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3710
3711 if (DstDivReg) {
3712 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3713 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3714 B.buildSub(DstDivReg, SignXor, Sign);
3715 }
3716
3717 if (DstRemReg) {
3718 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3719 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3720 B.buildSub(DstRemReg, SignXor, Sign);
3721 }
3722
3723 MI.eraseFromParent();
3724 return true;
3725 }
3726
3727 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3728 MachineRegisterInfo &MRI,
3729 MachineIRBuilder &B) const {
3730 Register Res = MI.getOperand(0).getReg();
3731 Register LHS = MI.getOperand(1).getReg();
3732 Register RHS = MI.getOperand(2).getReg();
3733 uint16_t Flags = MI.getFlags();
3734 LLT ResTy = MRI.getType(Res);
3735
3736 const MachineFunction &MF = B.getMF();
3737 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3738 MI.getFlag(MachineInstr::FmAfn);
3739
3740 if (!AllowInaccurateRcp)
3741 return false;
3742
3743 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3744 // 1 / x -> RCP(x)
3745 if (CLHS->isExactlyValue(1.0)) {
3746 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3747 .addUse(RHS)
3748 .setMIFlags(Flags);
3749
3750 MI.eraseFromParent();
3751 return true;
3752 }
3753
3754 // -1 / x -> RCP( FNEG(x) )
3755 if (CLHS->isExactlyValue(-1.0)) {
3756 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3757 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3758 .addUse(FNeg.getReg(0))
3759 .setMIFlags(Flags);
3760
3761 MI.eraseFromParent();
3762 return true;
3763 }
3764 }
3765
3766 // x / y -> x * (1.0 / y)
3767 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3768 .addUse(RHS)
3769 .setMIFlags(Flags);
3770 B.buildFMul(Res, LHS, RCP, Flags);
3771
3772 MI.eraseFromParent();
3773 return true;
3774 }
3775
3776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3777 MachineRegisterInfo &MRI,
3778 MachineIRBuilder &B) const {
3779 Register Res = MI.getOperand(0).getReg();
3780 Register X = MI.getOperand(1).getReg();
3781 Register Y = MI.getOperand(2).getReg();
3782 uint16_t Flags = MI.getFlags();
3783 LLT ResTy = MRI.getType(Res);
3784
3785 const MachineFunction &MF = B.getMF();
3786 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3787 MI.getFlag(MachineInstr::FmAfn);
3788
3789 if (!AllowInaccurateRcp)
3790 return false;
3791
3792 auto NegY = B.buildFNeg(ResTy, Y);
3793 auto One = B.buildFConstant(ResTy, 1.0);
3794
3795 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3796 .addUse(Y)
3797 .setMIFlags(Flags);
3798
3799 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3800 R = B.buildFMA(ResTy, Tmp0, R, R);
3801
3802 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3803 R = B.buildFMA(ResTy, Tmp1, R, R);
3804
3805 auto Ret = B.buildFMul(ResTy, X, R);
3806 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3807
3808 B.buildFMA(Res, Tmp2, R, Ret);
3809 MI.eraseFromParent();
3810 return true;
3811 }
3812
3813 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3814 MachineRegisterInfo &MRI,
3815 MachineIRBuilder &B) const {
3816 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3817 return true;
3818
3819 Register Res = MI.getOperand(0).getReg();
3820 Register LHS = MI.getOperand(1).getReg();
3821 Register RHS = MI.getOperand(2).getReg();
3822
3823 uint16_t Flags = MI.getFlags();
3824
3825 LLT S16 = LLT::scalar(16);
3826 LLT S32 = LLT::scalar(32);
3827
3828 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3829 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3830
3831 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3832 .addUse(RHSExt.getReg(0))
3833 .setMIFlags(Flags);
3834
3835 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3836 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3837
3838 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3839 .addUse(RDst.getReg(0))
3840 .addUse(RHS)
3841 .addUse(LHS)
3842 .setMIFlags(Flags);
3843
3844 MI.eraseFromParent();
3845 return true;
3846 }
3847
3848 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3849 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
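// Worked example (illustrative, assuming FP_DENORM_FLUSH_NONE == 3 and a
// flush-none FP64/FP16 default): enabling FP32 denormals on a target with
// S_DENORM_MODE emits s_denorm_mode with the immediate 3 | (3 << 2) == 0xf.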
3850 static void toggleSPDenormMode(bool Enable,
3851 MachineIRBuilder &B,
3852 const GCNSubtarget &ST,
3853 AMDGPU::SIModeRegisterDefaults Mode) {
3854 // Set SP denorm mode to this value.
3855 unsigned SPDenormMode =
3856 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3857
3858 if (ST.hasDenormModeInst()) {
3859 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3860 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3861
3862 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3863 B.buildInstr(AMDGPU::S_DENORM_MODE)
3864 .addImm(NewDenormModeValue);
3865
3866 } else {
3867 // Select FP32 bit field in mode register.
3868 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3869 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3870 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3871
3872 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3873 .addImm(SPDenormMode)
3874 .addImm(SPDenormModeBitField);
3875 }
3876 }
3877
3878 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3879 MachineRegisterInfo &MRI,
3880 MachineIRBuilder &B) const {
3881 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3882 return true;
3883
3884 Register Res = MI.getOperand(0).getReg();
3885 Register LHS = MI.getOperand(1).getReg();
3886 Register RHS = MI.getOperand(2).getReg();
3887 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3888 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3889
3890 uint16_t Flags = MI.getFlags();
3891
3892 LLT S32 = LLT::scalar(32);
3893 LLT S1 = LLT::scalar(1);
3894
3895 auto One = B.buildFConstant(S32, 1.0f);
3896
3897 auto DenominatorScaled =
3898 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3899 .addUse(LHS)
3900 .addUse(RHS)
3901 .addImm(0)
3902 .setMIFlags(Flags);
3903 auto NumeratorScaled =
3904 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3905 .addUse(LHS)
3906 .addUse(RHS)
3907 .addImm(1)
3908 .setMIFlags(Flags);
3909
3910 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3911 .addUse(DenominatorScaled.getReg(0))
3912 .setMIFlags(Flags);
3913 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3914
3915 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3916 // aren't modeled as reading it.
3917 if (!Mode.allFP32Denormals())
3918 toggleSPDenormMode(true, B, ST, Mode);
3919
3920 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3921 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3922 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3923 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3924 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3925 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3926
3927 if (!Mode.allFP32Denormals())
3928 toggleSPDenormMode(false, B, ST, Mode);
3929
3930 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3931 .addUse(Fma4.getReg(0))
3932 .addUse(Fma1.getReg(0))
3933 .addUse(Fma3.getReg(0))
3934 .addUse(NumeratorScaled.getReg(1))
3935 .setMIFlags(Flags);
3936
3937 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3938 .addUse(Fmas.getReg(0))
3939 .addUse(RHS)
3940 .addUse(LHS)
3941 .setMIFlags(Flags);
3942
3943 MI.eraseFromParent();
3944 return true;
3945 }
3946
3947 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3948 MachineRegisterInfo &MRI,
3949 MachineIRBuilder &B) const {
3950 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3951 return true;
3952
3953 Register Res = MI.getOperand(0).getReg();
3954 Register LHS = MI.getOperand(1).getReg();
3955 Register RHS = MI.getOperand(2).getReg();
3956
3957 uint16_t Flags = MI.getFlags();
3958
3959 LLT S64 = LLT::scalar(64);
3960 LLT S1 = LLT::scalar(1);
3961
3962 auto One = B.buildFConstant(S64, 1.0);
3963
3964 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3965 .addUse(LHS)
3966 .addUse(RHS)
3967 .addImm(0)
3968 .setMIFlags(Flags);
3969
3970 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3971
3972 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3973 .addUse(DivScale0.getReg(0))
3974 .setMIFlags(Flags);
3975
3976 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3977 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3978 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3979
3980 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3981 .addUse(LHS)
3982 .addUse(RHS)
3983 .addImm(1)
3984 .setMIFlags(Flags);
3985
3986 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3987 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3988 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3989
3990 Register Scale;
3991 if (!ST.hasUsableDivScaleConditionOutput()) {
3992 // Workaround a hardware bug on SI where the condition output from div_scale
3993 // is not usable.
3994
3995 LLT S32 = LLT::scalar(32);
3996
3997 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3998 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3999 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4000 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4001
4002 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4003 Scale1Unmerge.getReg(1));
4004 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4005 Scale0Unmerge.getReg(1));
4006 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4007 } else {
4008 Scale = DivScale1.getReg(1);
4009 }
4010
4011 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4012 .addUse(Fma4.getReg(0))
4013 .addUse(Fma3.getReg(0))
4014 .addUse(Mul.getReg(0))
4015 .addUse(Scale)
4016 .setMIFlags(Flags);
4017
4018 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
4019 .addUse(Fmas.getReg(0))
4020 .addUse(RHS)
4021 .addUse(LHS)
4022 .setMIFlags(Flags);
4023
4024 MI.eraseFromParent();
4025 return true;
4026 }
4027
4028 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4029 MachineRegisterInfo &MRI,
4030 MachineIRBuilder &B) const {
4031 Register Res = MI.getOperand(0).getReg();
4032 Register LHS = MI.getOperand(2).getReg();
4033 Register RHS = MI.getOperand(3).getReg();
4034 uint16_t Flags = MI.getFlags();
4035
4036 LLT S32 = LLT::scalar(32);
4037 LLT S1 = LLT::scalar(1);
4038
4039 auto Abs = B.buildFAbs(S32, RHS, Flags);
4040 const APFloat C0Val(1.0f);
4041
4042 auto C0 = B.buildConstant(S32, 0x6f800000);
4043 auto C1 = B.buildConstant(S32, 0x2f800000);
4044 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4045
4046 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4047 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4048
4049 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4050
4051 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4052 .addUse(Mul0.getReg(0))
4053 .setMIFlags(Flags);
4054
4055 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4056
4057 B.buildFMul(Res, Sel, Mul1, Flags);
4058
4059 MI.eraseFromParent();
4060 return true;
4061 }
4062
4063 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4064 // FIXME: Why do we handle this one but not other removed instructions?
4065 //
4066 // Reciprocal square root. The clamp prevents infinite results, clamping
4067 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
4068 // +-max_float.
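//
// In other words, this expands to roughly:
//   %rsq = llvm.amdgcn.rsq(%src)
//   %dst = fmax(fmin(%rsq, +max_float), -max_float)
// using the IEEE min/max variants when the function runs in IEEE mode.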
4069 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4070 MachineRegisterInfo &MRI,
4071 MachineIRBuilder &B) const {
4072 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4073 return true;
4074
4075 Register Dst = MI.getOperand(0).getReg();
4076 Register Src = MI.getOperand(2).getReg();
4077 auto Flags = MI.getFlags();
4078
4079 LLT Ty = MRI.getType(Dst);
4080
4081 const fltSemantics *FltSemantics;
4082 if (Ty == LLT::scalar(32))
4083 FltSemantics = &APFloat::IEEEsingle();
4084 else if (Ty == LLT::scalar(64))
4085 FltSemantics = &APFloat::IEEEdouble();
4086 else
4087 return false;
4088
4089 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4090 .addUse(Src)
4091 .setMIFlags(Flags);
4092
4093 // We don't need to concern ourselves with the snan handling difference, since
4094 // rsq has already quieted the value (or not); use the variant that will select directly.
4095 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4096 const bool UseIEEE = MFI->getMode().IEEE;
4097
4098 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4099 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4100 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4101
4102 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4103
4104 if (UseIEEE)
4105 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4106 else
4107 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4108 MI.eraseFromParent();
4109 return true;
4110 }
4111
4112 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4113 switch (IID) {
4114 case Intrinsic::amdgcn_ds_fadd:
4115 return AMDGPU::G_ATOMICRMW_FADD;
4116 case Intrinsic::amdgcn_ds_fmin:
4117 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4118 case Intrinsic::amdgcn_ds_fmax:
4119 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4120 default:
4121 llvm_unreachable("not a DS FP intrinsic");
4122 }
4123 }
4124
4125 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4126 MachineInstr &MI,
4127 Intrinsic::ID IID) const {
4128 GISelChangeObserver &Observer = Helper.Observer;
4129 Observer.changingInstr(MI);
4130
4131 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4132
4133 // The remaining operands were used to set fields in the MemOperand on
4134 // construction.
4135 for (int I = 6; I > 3; --I)
4136 MI.removeOperand(I);
4137
4138 MI.removeOperand(1); // Remove the intrinsic ID.
4139 Observer.changedInstr(MI);
4140 return true;
4141 }
4142
4143 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4144 MachineRegisterInfo &MRI,
4145 MachineIRBuilder &B) const {
4146 uint64_t Offset =
4147 ST.getTargetLowering()->getImplicitParameterOffset(
4148 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4149 LLT DstTy = MRI.getType(DstReg);
4150 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4151
4152 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4153 if (!loadInputValue(KernargPtrReg, B,
4154 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4155 return false;
4156
4157 // FIXME: This should be nuw
4158 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4159 return true;
4160 }
4161
4162 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4163 MachineRegisterInfo &MRI,
4164 MachineIRBuilder &B) const {
4165 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4166 if (!MFI->isEntryFunction()) {
4167 return legalizePreloadedArgIntrin(MI, MRI, B,
4168 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4169 }
4170
4171 Register DstReg = MI.getOperand(0).getReg();
4172 if (!getImplicitArgPtr(DstReg, MRI, B))
4173 return false;
4174
4175 MI.eraseFromParent();
4176 return true;
4177 }
4178
4179 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4180 MachineRegisterInfo &MRI,
4181 MachineIRBuilder &B) const {
4182 Function &F = B.getMF().getFunction();
4183 std::optional<uint32_t> KnownSize =
4184 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4185 if (KnownSize.has_value())
4186 B.buildConstant(DstReg, *KnownSize);
4187 return false;
4188 }
4189
4190 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4191 MachineRegisterInfo &MRI,
4192 MachineIRBuilder &B) const {
4193
4194 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4195 if (!MFI->isEntryFunction()) {
4196 return legalizePreloadedArgIntrin(MI, MRI, B,
4197 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4198 }
4199
4200 Register DstReg = MI.getOperand(0).getReg();
4201 if (!getLDSKernelId(DstReg, MRI, B))
4202 return false;
4203
4204 MI.eraseFromParent();
4205 return true;
4206 }
4207
4208 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4209 MachineRegisterInfo &MRI,
4210 MachineIRBuilder &B,
4211 unsigned AddrSpace) const {
4212 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4213 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4214 Register Hi32 = Unmerge.getReg(1);
4215
4216 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4217 MI.eraseFromParent();
4218 return true;
4219 }
4220
4221 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4222 // offset (the offset that is included in bounds checking and swizzling, to be
4223 // split between the instruction's voffset and immoffset fields) and soffset
4224 // (the offset that is excluded from bounds checking and swizzling, to go in
4225 // the instruction's soffset field). This function takes the first kind of
4226 // offset and figures out how to split it between voffset and immoffset.
4227 std::pair<Register, unsigned>
4228 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4229 Register OrigOffset) const {
4230 const unsigned MaxImm = 4095;
4231 Register BaseReg;
4232 unsigned ImmOffset;
4233 const LLT S32 = LLT::scalar(32);
4234 MachineRegisterInfo &MRI = *B.getMRI();
4235
4236 std::tie(BaseReg, ImmOffset) =
4237 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4238
4239 // If BaseReg is a pointer, convert it to int.
4240 if (MRI.getType(BaseReg).isPointer())
4241 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4242
4243 // If the immediate value is too big for the immoffset field, put the value
4244 // and -4096 into the immoffset field so that the value that is copied/added
4245 // for the voffset field is a multiple of 4096, and it stands more chance
4246 // of being CSEd with the copy/add for another similar load/store.
4247 // However, do not do that rounding down to a multiple of 4096 if that is a
4248 // negative number, as it appears to be illegal to have a negative offset
4249 // in the vgpr, even if adding the immediate offset makes it positive.
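  // Worked example (illustrative): an incoming ImmOffset of 8200 is split into
  // Overflow = 8200 & ~4095 = 8192 (added into the voffset register) and
  // ImmOffset = 8. If that rounding would make Overflow negative, the whole
  // amount goes to the voffset instead and ImmOffset becomes 0.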
4250 unsigned Overflow = ImmOffset & ~MaxImm;
4251 ImmOffset -= Overflow;
4252 if ((int32_t)Overflow < 0) {
4253 Overflow += ImmOffset;
4254 ImmOffset = 0;
4255 }
4256
4257 if (Overflow != 0) {
4258 if (!BaseReg) {
4259 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4260 } else {
4261 auto OverflowVal = B.buildConstant(S32, Overflow);
4262 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4263 }
4264 }
4265
4266 if (!BaseReg)
4267 BaseReg = B.buildConstant(S32, 0).getReg(0);
4268
4269 return std::pair(BaseReg, ImmOffset);
4270 }
4271
4272 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4273 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4274 Register VOffset, Register SOffset,
4275 unsigned ImmOffset, Register VIndex,
4276 MachineRegisterInfo &MRI) const {
4277 std::optional<ValueAndVReg> MaybeVOffsetVal =
4278 getIConstantVRegValWithLookThrough(VOffset, MRI);
4279 std::optional<ValueAndVReg> MaybeSOffsetVal =
4280 getIConstantVRegValWithLookThrough(SOffset, MRI);
4281 std::optional<ValueAndVReg> MaybeVIndexVal =
4282 getIConstantVRegValWithLookThrough(VIndex, MRI);
4283 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4284 // update the MMO with that offset. The stride is unknown so we can only do
4285 // this if VIndex is constant 0.
4286 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4287 MaybeVIndexVal->Value == 0) {
4288 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4289 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4290 MMO->setOffset(TotalOffset);
4291 } else {
4292 // We don't have a constant combined offset to use in the MMO. Give up.
4293 MMO->setValue((Value *)nullptr);
4294 }
4295 }
4296
4297 /// Handle register layout difference for f16 images for some subtargets.
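///
/// For example (descriptive sketch): on subtargets with unpacked D16 VMEM a
/// <2 x s16> payload is widened to <2 x s32> with one component per dword,
/// while on subtargets with the image-store D16 bug packed payloads are padded
/// out with undef to a full dword count before being bitcast.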
4298 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4299 MachineRegisterInfo &MRI,
4300 Register Reg,
4301 bool ImageStore) const {
4302 const LLT S16 = LLT::scalar(16);
4303 const LLT S32 = LLT::scalar(32);
4304 LLT StoreVT = MRI.getType(Reg);
4305 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4306
4307 if (ST.hasUnpackedD16VMem()) {
4308 auto Unmerge = B.buildUnmerge(S16, Reg);
4309
4310 SmallVector<Register, 4> WideRegs;
4311 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4312 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4313
4314 int NumElts = StoreVT.getNumElements();
4315
4316 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4317 .getReg(0);
4318 }
4319
4320 if (ImageStore && ST.hasImageStoreD16Bug()) {
4321 if (StoreVT.getNumElements() == 2) {
4322 SmallVector<Register, 4> PackedRegs;
4323 Reg = B.buildBitcast(S32, Reg).getReg(0);
4324 PackedRegs.push_back(Reg);
4325 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4326 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4327 .getReg(0);
4328 }
4329
4330 if (StoreVT.getNumElements() == 3) {
4331 SmallVector<Register, 4> PackedRegs;
4332 auto Unmerge = B.buildUnmerge(S16, Reg);
4333 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4334 PackedRegs.push_back(Unmerge.getReg(I));
4335 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4336 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4337 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4338 }
4339
4340 if (StoreVT.getNumElements() == 4) {
4341 SmallVector<Register, 4> PackedRegs;
4342 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4343 auto Unmerge = B.buildUnmerge(S32, Reg);
4344 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4345 PackedRegs.push_back(Unmerge.getReg(I));
4346 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4347 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4348 .getReg(0);
4349 }
4350
4351 llvm_unreachable("invalid data type");
4352 }
4353
4354 if (StoreVT == LLT::fixed_vector(3, S16)) {
4355 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4356 .getReg(0);
4357 }
4358 return Reg;
4359 }
4360
4361 Register AMDGPULegalizerInfo::fixStoreSourceType(
4362 MachineIRBuilder &B, Register VData, bool IsFormat) const {
4363 MachineRegisterInfo *MRI = B.getMRI();
4364 LLT Ty = MRI->getType(VData);
4365
4366 const LLT S16 = LLT::scalar(16);
4367
4368 // Fixup illegal register types for i8 stores.
4369 if (Ty == LLT::scalar(8) || Ty == S16) {
4370 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4371 return AnyExt;
4372 }
4373
4374 if (Ty.isVector()) {
4375 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4376 if (IsFormat)
4377 return handleD16VData(B, *MRI, VData);
4378 }
4379 }
4380
4381 return VData;
4382 }
4383
4384 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4385 MachineRegisterInfo &MRI,
4386 MachineIRBuilder &B,
4387 bool IsTyped,
4388 bool IsFormat) const {
4389 Register VData = MI.getOperand(1).getReg();
4390 LLT Ty = MRI.getType(VData);
4391 LLT EltTy = Ty.getScalarType();
4392 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4393 const LLT S32 = LLT::scalar(32);
4394
4395 VData = fixStoreSourceType(B, VData, IsFormat);
4396 Register RSrc = MI.getOperand(2).getReg();
4397
4398 MachineMemOperand *MMO = *MI.memoperands_begin();
4399 const int MemSize = MMO->getSize();
4400
4401 unsigned ImmOffset;
4402
4403 // The typed intrinsics add an immediate after the registers.
4404 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4405
4406 // The struct intrinsic variants add one additional operand over raw.
4407 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
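  // Operand layout, roughly (id, vdata, rsrc, [vindex,] voffset, soffset,
  // [format,] aux): the raw non-typed form has 6 operands, the struct form
  // inserts vindex before voffset, and the typed forms add a format immediate,
  // giving 7 (struct) or 8 (struct typed) operands when vindex is present.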
4408 Register VIndex;
4409 int OpOffset = 0;
4410 if (HasVIndex) {
4411 VIndex = MI.getOperand(3).getReg();
4412 OpOffset = 1;
4413 } else {
4414 VIndex = B.buildConstant(S32, 0).getReg(0);
4415 }
4416
4417 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4418 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4419
4420 unsigned Format = 0;
4421 if (IsTyped) {
4422 Format = MI.getOperand(5 + OpOffset).getImm();
4423 ++OpOffset;
4424 }
4425
4426 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4427
4428 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4429 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4430
4431 unsigned Opc;
4432 if (IsTyped) {
4433 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4434 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4435 } else if (IsFormat) {
4436 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4437 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4438 } else {
4439 switch (MemSize) {
4440 case 1:
4441 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4442 break;
4443 case 2:
4444 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4445 break;
4446 default:
4447 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4448 break;
4449 }
4450 }
4451
4452 auto MIB = B.buildInstr(Opc)
4453 .addUse(VData) // vdata
4454 .addUse(RSrc) // rsrc
4455 .addUse(VIndex) // vindex
4456 .addUse(VOffset) // voffset
4457 .addUse(SOffset) // soffset
4458 .addImm(ImmOffset); // offset(imm)
4459
4460 if (IsTyped)
4461 MIB.addImm(Format);
4462
4463 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4464 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4465 .addMemOperand(MMO);
4466
4467 MI.eraseFromParent();
4468 return true;
4469 }
4470
4471 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
4472 Register VIndex, Register VOffset, Register SOffset,
4473 unsigned ImmOffset, unsigned Format,
4474 unsigned AuxiliaryData, MachineMemOperand *MMO,
4475 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
4476 auto MIB = B.buildInstr(Opc)
4477 .addDef(LoadDstReg) // vdata
4478 .addUse(RSrc) // rsrc
4479 .addUse(VIndex) // vindex
4480 .addUse(VOffset) // voffset
4481 .addUse(SOffset) // soffset
4482 .addImm(ImmOffset); // offset(imm)
4483
4484 if (IsTyped)
4485 MIB.addImm(Format);
4486
4487 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4488 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4489 .addMemOperand(MMO);
4490 }
4491
4492 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4493 MachineRegisterInfo &MRI,
4494 MachineIRBuilder &B,
4495 bool IsFormat,
4496 bool IsTyped) const {
4497 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4498 MachineMemOperand *MMO = *MI.memoperands_begin();
4499 const LLT MemTy = MMO->getMemoryType();
4500 const LLT S32 = LLT::scalar(32);
4501
4502 Register Dst = MI.getOperand(0).getReg();
4503
4504 Register StatusDst;
4505 int OpOffset = 0;
4506 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
4507 bool IsTFE = MI.getNumExplicitDefs() == 2;
4508 if (IsTFE) {
4509 StatusDst = MI.getOperand(1).getReg();
4510 ++OpOffset;
4511 }
4512
4513 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
4514
4515 // The typed intrinsics add an immediate after the registers.
4516 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4517
4518 // The struct intrinsic variants add one additional operand over raw.
4519 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
4520 Register VIndex;
4521 if (HasVIndex) {
4522 VIndex = MI.getOperand(3 + OpOffset).getReg();
4523 ++OpOffset;
4524 } else {
4525 VIndex = B.buildConstant(S32, 0).getReg(0);
4526 }
4527
4528 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4529 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4530
4531 unsigned Format = 0;
4532 if (IsTyped) {
4533 Format = MI.getOperand(5 + OpOffset).getImm();
4534 ++OpOffset;
4535 }
4536
4537 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4538 unsigned ImmOffset;
4539
4540 LLT Ty = MRI.getType(Dst);
4541 LLT EltTy = Ty.getScalarType();
4542 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4543 const bool Unpacked = ST.hasUnpackedD16VMem();
4544
4545 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4546 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4547
4548 unsigned Opc;
4549
4550 // TODO: Support TFE for typed and narrow loads.
4551 if (IsTyped) {
4552 if (IsTFE)
4553 return false;
4554 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4555 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4556 } else if (IsFormat) {
4557 if (IsD16) {
4558 if (IsTFE)
4559 return false;
4560 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
4561 } else {
4562 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
4563 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4564 }
4565 } else {
4566 if (IsTFE)
4567 return false;
4568 switch (MemTy.getSizeInBits()) {
4569 case 8:
4570 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4571 break;
4572 case 16:
4573 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4574 break;
4575 default:
4576 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4577 break;
4578 }
4579 }
4580
4581 if (IsTFE) {
4582 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
4583 unsigned NumLoadDWords = NumValueDWords + 1;
4584 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
4585 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
4586 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4587 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4588 if (NumValueDWords == 1) {
4589 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
4590 } else {
4591 SmallVector<Register, 5> LoadElts;
4592 for (unsigned I = 0; I != NumValueDWords; ++I)
4593 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
4594 LoadElts.push_back(StatusDst);
4595 B.buildUnmerge(LoadElts, LoadDstReg);
4596 LoadElts.truncate(NumValueDWords);
4597 B.buildMergeLikeInstr(Dst, LoadElts);
4598 }
4599 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
4600 (IsD16 && !Ty.isVector())) {
4601 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4602 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4603 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4604 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4605 B.buildTrunc(Dst, LoadDstReg);
4606 } else if (Unpacked && IsD16 && Ty.isVector()) {
4607 LLT UnpackedTy = Ty.changeElementSize(32);
4608 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4609 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4610 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4611 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4612 // FIXME: G_TRUNC should work, but legalization currently fails
4613 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4614 SmallVector<Register, 4> Repack;
4615 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4616 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4617 B.buildMergeLikeInstr(Dst, Repack);
4618 } else {
4619 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
4620 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4621 }
4622
4623 MI.eraseFromParent();
4624 return true;
4625 }
4626
4627 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4628 MachineIRBuilder &B,
4629 bool IsInc) const {
4630 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4631 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4632 B.buildInstr(Opc)
4633 .addDef(MI.getOperand(0).getReg())
4634 .addUse(MI.getOperand(2).getReg())
4635 .addUse(MI.getOperand(3).getReg())
4636 .cloneMemRefs(MI);
4637 MI.eraseFromParent();
4638 return true;
4639 }
4640
4641 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4642 switch (IntrID) {
4643 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4644 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4645 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4646 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4647 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4648 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4649 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4650 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4651 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4652 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4653 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4654 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4655 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4656 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4657 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4658 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4659 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4660 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4661 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4662 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4663 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4664 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4665 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4666 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4667 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4668 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4669 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4670 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4671 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4672 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4673 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4674 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4675 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4676 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4677 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4678 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4679 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4680 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4681 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4682 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4683 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4684 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4685 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4686 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4687 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4688 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4689 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4690 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4691 default:
4692 llvm_unreachable("unhandled atomic opcode");
4693 }
4694 }
4695
4696 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4697 MachineIRBuilder &B,
4698 Intrinsic::ID IID) const {
4699 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4700 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4701 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4702
4703 Register Dst;
4704
4705 int OpOffset = 0;
4706 if (HasReturn) {
4707 // A few FP atomics do not support return values.
4708 Dst = MI.getOperand(0).getReg();
4709 } else {
4710 OpOffset = -1;
4711 }
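  // Without a return value there is no def, so all source operand indices
  // shift down by one.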
4712
4713 Register VData = MI.getOperand(2 + OpOffset).getReg();
4714 Register CmpVal;
4715
4716 if (IsCmpSwap) {
4717 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4718 ++OpOffset;
4719 }
4720
4721 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4722 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4723
4724 // The struct intrinsic variants add one additional operand over raw.
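  // That extra operand is the vindex, so the raw and struct forms can be told
  // apart by the total operand count.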
4725 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4726 Register VIndex;
4727 if (HasVIndex) {
4728 VIndex = MI.getOperand(4 + OpOffset).getReg();
4729 ++OpOffset;
4730 } else {
4731 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4732 }
4733
4734 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4735 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4736 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4737
4738 MachineMemOperand *MMO = *MI.memoperands_begin();
4739
4740 unsigned ImmOffset;
4741 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4742 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4743
4744 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4745
4746 if (HasReturn)
4747 MIB.addDef(Dst);
4748
4749 MIB.addUse(VData); // vdata
4750
4751 if (IsCmpSwap)
4752 MIB.addReg(CmpVal);
4753
4754 MIB.addUse(RSrc) // rsrc
4755 .addUse(VIndex) // vindex
4756 .addUse(VOffset) // voffset
4757 .addUse(SOffset) // soffset
4758 .addImm(ImmOffset) // offset(imm)
4759 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4760 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4761 .addMemOperand(MMO);
4762
4763 MI.eraseFromParent();
4764 return true;
4765 }
4766
4767 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4768 /// vector with s16 typed elements.
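/// For example, three s16 coordinates x, y, z are emitted as two dwords:
/// <2 x s16> {x, y} followed by <2 x s16> {z, undef}.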
4769 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4770 SmallVectorImpl<Register> &PackedAddrs,
4771 unsigned ArgOffset,
4772 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4773 bool IsA16, bool IsG16) {
4774 const LLT S16 = LLT::scalar(16);
4775 const LLT V2S16 = LLT::fixed_vector(2, 16);
4776 auto EndIdx = Intr->VAddrEnd;
4777
4778 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4779 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4780 if (!SrcOp.isReg())
4781 continue; // _L to _LZ may have eliminated this.
4782
4783 Register AddrReg = SrcOp.getReg();
4784
4785 if ((I < Intr->GradientStart) ||
4786 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4787 (I >= Intr->CoordStart && !IsA16)) {
4788 if ((I < Intr->GradientStart) && IsA16 &&
4789 (B.getMRI()->getType(AddrReg) == S16)) {
4790 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4791         // Special handling of bias when A16 is on. Bias is of type half but
4792         // occupies a full 32-bit dword.
4793 PackedAddrs.push_back(
4794 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4795 .getReg(0));
4796 } else {
4797 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4798 "Bias needs to be converted to 16 bit in A16 mode");
4799 // Handle any gradient or coordinate operands that should not be packed
4800 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4801 PackedAddrs.push_back(AddrReg);
4802 }
4803 } else {
4804 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4805 // derivatives dx/dh and dx/dv are packed with undef.
4806 if (((I + 1) >= EndIdx) ||
4807 ((Intr->NumGradients / 2) % 2 == 1 &&
4808 (I == static_cast<unsigned>(Intr->GradientStart +
4809 (Intr->NumGradients / 2) - 1) ||
4810 I == static_cast<unsigned>(Intr->GradientStart +
4811 Intr->NumGradients - 1))) ||
4812 // Check for _L to _LZ optimization
4813 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4814 PackedAddrs.push_back(
4815 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4816 .getReg(0));
4817 } else {
4818 PackedAddrs.push_back(
4819 B.buildBuildVector(
4820 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4821 .getReg(0));
4822 ++I;
4823 }
4824 }
4825 }
4826 }
4827
4828 /// Convert from separate vaddr components to a single vector address register,
4829 /// and replace the remaining operands with $noreg.
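/// For example, three separate s32 vaddr operands become a single <3 x s32>
/// build_vector in the first vaddr slot, with the remaining two operands set
/// to $noreg.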
4830 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4831 int DimIdx, int NumVAddrs) {
4832 const LLT S32 = LLT::scalar(32);
4833 (void)S32;
4834 SmallVector<Register, 8> AddrRegs;
4835 for (int I = 0; I != NumVAddrs; ++I) {
4836 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4837 if (SrcOp.isReg()) {
4838 AddrRegs.push_back(SrcOp.getReg());
4839 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4840 }
4841 }
4842
4843 int NumAddrRegs = AddrRegs.size();
4844 if (NumAddrRegs != 1) {
4845 auto VAddr =
4846 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4847 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4848 }
4849
4850 for (int I = 1; I != NumVAddrs; ++I) {
4851 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4852 if (SrcOp.isReg())
4853 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4854 }
4855 }
4856
4857 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4858 ///
4859 /// Depending on the subtarget, load/store with 16-bit element data need to be
4860 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4861 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4862 /// registers.
4863 ///
4864 /// We don't want to directly select image instructions just yet, but also want
4865 /// to expose all register repacking to the legalizer/combiners. We also don't
4866 /// want a selected instruction entering RegBankSelect. In order to avoid
4867 /// defining a multitude of intermediate image instructions, directly hack on
4868 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4869 /// padding now unnecessary arguments with $noreg.
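/// For example, with a16 addresses the s16 address operands are packed into
/// <2 x s16> registers and the now-unused trailing address operands are
/// replaced with $noreg.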
4870 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4871 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4872 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4873
4874 const MachineFunction &MF = *MI.getMF();
4875 const unsigned NumDefs = MI.getNumExplicitDefs();
4876 const unsigned ArgOffset = NumDefs + 1;
4877 bool IsTFE = NumDefs == 2;
4878 // We are only processing the operands of d16 image operations on subtargets
4879 // that use the unpacked register layout, or need to repack the TFE result.
4880
4881 // TODO: Do we need to guard against already legalized intrinsics?
4882 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4883 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4884
4885 MachineRegisterInfo *MRI = B.getMRI();
4886 const LLT S32 = LLT::scalar(32);
4887 const LLT S16 = LLT::scalar(16);
4888 const LLT V2S16 = LLT::fixed_vector(2, 16);
4889
4890 unsigned DMask = 0;
4891 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4892 LLT Ty = MRI->getType(VData);
4893
4894 // Check for 16 bit addresses and pack if true.
4895 LLT GradTy =
4896 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4897 LLT AddrTy =
4898 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4899 const bool IsG16 = GradTy == S16;
4900 const bool IsA16 = AddrTy == S16;
4901 const bool IsD16 = Ty.getScalarType() == S16;
4902
4903 int DMaskLanes = 0;
4904 if (!BaseOpcode->Atomic) {
4905 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4906 if (BaseOpcode->Gather4) {
4907 DMaskLanes = 4;
4908 } else if (DMask != 0) {
4909 DMaskLanes = llvm::popcount(DMask);
4910 } else if (!IsTFE && !BaseOpcode->Store) {
4911 // If dmask is 0, this is a no-op load. This can be eliminated.
4912 B.buildUndef(MI.getOperand(0));
4913 MI.eraseFromParent();
4914 return true;
4915 }
4916 }
4917
4918 Observer.changingInstr(MI);
4919 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4920
4921 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4922 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4923 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4924 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4925 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4926
4927 // Track that we legalized this
4928 MI.setDesc(B.getTII().get(NewOpcode));
4929
4930   // Expecting to get an error flag since TFE is on and dmask is 0. Force
4931   // dmask to be at least 1, otherwise the instruction will fail.
4932 if (IsTFE && DMask == 0) {
4933 DMask = 0x1;
4934 DMaskLanes = 1;
4935 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4936 }
4937
4938 if (BaseOpcode->Atomic) {
4939 Register VData0 = MI.getOperand(2).getReg();
4940 LLT Ty = MRI->getType(VData0);
4941
4942 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4943 if (Ty.isVector())
4944 return false;
4945
4946 if (BaseOpcode->AtomicX2) {
4947 Register VData1 = MI.getOperand(3).getReg();
4948 // The two values are packed in one register.
4949 LLT PackedTy = LLT::fixed_vector(2, Ty);
4950 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4951 MI.getOperand(2).setReg(Concat.getReg(0));
4952 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4953 }
4954 }
4955
4956 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4957
4958 // Rewrite the addressing register layout before doing anything else.
4959 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4960     // 16 bit gradients are supported, but are tied to the A16 control,
4961     // so both gradients and addresses must be 16 bit.
4962 return false;
4963 }
4964
4965 if (IsA16 && !ST.hasA16()) {
4966 // A16 not supported
4967 return false;
4968 }
4969
4970 if (IsA16 || IsG16) {
4971 if (Intr->NumVAddrs > 1) {
4972 SmallVector<Register, 4> PackedRegs;
4973
4974 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4975 IsG16);
4976
4977 // See also below in the non-a16 branch
4978 const bool UseNSA = ST.hasNSAEncoding() &&
4979 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
4980 PackedRegs.size() <= ST.getNSAMaxSize();
4981
4982 if (!UseNSA && PackedRegs.size() > 1) {
4983 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4984 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4985 PackedRegs[0] = Concat.getReg(0);
4986 PackedRegs.resize(1);
4987 }
4988
4989 const unsigned NumPacked = PackedRegs.size();
4990 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4991 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4992 if (!SrcOp.isReg()) {
4993 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4994 continue;
4995 }
4996
4997 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4998
4999 if (I - Intr->VAddrStart < NumPacked)
5000 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
5001 else
5002 SrcOp.setReg(AMDGPU::NoRegister);
5003 }
5004 }
5005 } else {
5006 // If the register allocator cannot place the address registers contiguously
5007 // without introducing moves, then using the non-sequential address encoding
5008 // is always preferable, since it saves VALU instructions and is usually a
5009 // wash in terms of code size or even better.
5010 //
5011 // However, we currently have no way of hinting to the register allocator
5012 // that MIMG addresses should be placed contiguously when it is possible to
5013 // do so, so force non-NSA for the common 2-address case as a heuristic.
5014 //
5015 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5016 // allocation when possible.
5017 //
5018 // TODO: we can actually allow partial NSA where the final register is a
5019 // contiguous set of the remaining addresses.
5020 // This could help where there are more addresses than supported.
5021 const bool UseNSA = ST.hasNSAEncoding() &&
5022 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
5023 CorrectedNumVAddrs <= ST.getNSAMaxSize();
5024
5025 if (!UseNSA && Intr->NumVAddrs > 1)
5026 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5027 Intr->NumVAddrs);
5028 }
5029
5030 int Flags = 0;
5031 if (IsA16)
5032 Flags |= 1;
5033 if (IsG16)
5034 Flags |= 2;
5035 MI.addOperand(MachineOperand::CreateImm(Flags));
5036
5037 if (BaseOpcode->Store) { // No TFE for stores?
5038 // TODO: Handle dmask trim
5039 if (!Ty.isVector() || !IsD16)
5040 return true;
5041
5042 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5043 if (RepackedReg != VData) {
5044 MI.getOperand(1).setReg(RepackedReg);
5045 }
5046
5047 return true;
5048 }
5049
5050 Register DstReg = MI.getOperand(0).getReg();
5051 const LLT EltTy = Ty.getScalarType();
5052 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5053
5054 // Confirm that the return type is large enough for the dmask specified
5055 if (NumElts < DMaskLanes)
5056 return false;
5057
5058 if (NumElts > 4 || DMaskLanes > 4)
5059 return false;
5060
5061 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5062 const LLT AdjustedTy =
5063 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5064
5065 // The raw dword aligned data component of the load. The only legal cases
5066 // where this matters should be when using the packed D16 format, for
5067   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
5068 LLT RoundedTy;
5069
5070 // S32 vector to cover all data, plus TFE result element.
5071 LLT TFETy;
5072
5073 // Register type to use for each loaded component. Will be S32 or V2S16.
5074 LLT RegTy;
5075
5076 if (IsD16 && ST.hasUnpackedD16VMem()) {
5077 RoundedTy =
5078 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5079 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5080 RegTy = S32;
5081 } else {
5082 unsigned EltSize = EltTy.getSizeInBits();
5083 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5084 unsigned RoundedSize = 32 * RoundedElts;
5085 RoundedTy = LLT::scalarOrVector(
5086 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5087 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5088 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5089 }
5090
5091 // The return type does not need adjustment.
5092 // TODO: Should we change s16 case to s32 or <2 x s16>?
5093 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5094 return true;
5095
5096 Register Dst1Reg;
5097
5098 // Insert after the instruction.
5099 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5100
5101 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5102 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5103 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5104 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5105
5106 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5107
5108 MI.getOperand(0).setReg(NewResultReg);
5109
5110 // In the IR, TFE is supposed to be used with a 2 element struct return
5111 // type. The instruction really returns these two values in one contiguous
5112 // register, with one additional dword beyond the loaded data. Rewrite the
5113 // return type to use a single register result.
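  // For example, a { <4 x s32>, s32 } TFE result is loaded as a single
  // <5 x s32> register and unmerged afterwards.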
5114
5115 if (IsTFE) {
5116 Dst1Reg = MI.getOperand(1).getReg();
5117 if (MRI->getType(Dst1Reg) != S32)
5118 return false;
5119
5120 // TODO: Make sure the TFE operand bit is set.
5121 MI.removeOperand(1);
5122
5123 // Handle the easy case that requires no repack instructions.
5124 if (Ty == S32) {
5125 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5126 return true;
5127 }
5128 }
5129
5130 // Now figure out how to copy the new result register back into the old
5131 // result.
5132 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5133
5134 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5135
5136 if (ResultNumRegs == 1) {
5137 assert(!IsTFE);
5138 ResultRegs[0] = NewResultReg;
5139 } else {
5140 // We have to repack into a new vector of some kind.
5141 for (int I = 0; I != NumDataRegs; ++I)
5142 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5143 B.buildUnmerge(ResultRegs, NewResultReg);
5144
5145 // Drop the final TFE element to get the data part. The TFE result is
5146 // directly written to the right place already.
5147 if (IsTFE)
5148 ResultRegs.resize(NumDataRegs);
5149 }
5150
5151   // For an s16 scalar result, we truncate from the s32 result regardless of
5152   // packed vs. unpacked.
5153 if (IsD16 && !Ty.isVector()) {
5154 B.buildTrunc(DstReg, ResultRegs[0]);
5155 return true;
5156 }
5157
5158 // Avoid a build/concat_vector of 1 entry.
5159 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5160 B.buildBitcast(DstReg, ResultRegs[0]);
5161 return true;
5162 }
5163
5164 assert(Ty.isVector());
5165
5166 if (IsD16) {
5167 // For packed D16 results with TFE enabled, all the data components are
5168 // S32. Cast back to the expected type.
5169 //
5170     // TODO: We don't really need to use s32 load elements. We would only need
5171     // one cast for the TFE result if a multiple of v2s16 was used.
5172 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5173 for (Register &Reg : ResultRegs)
5174 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5175 } else if (ST.hasUnpackedD16VMem()) {
5176 for (Register &Reg : ResultRegs)
5177 Reg = B.buildTrunc(S16, Reg).getReg(0);
5178 }
5179 }
5180
5181 auto padWithUndef = [&](LLT Ty, int NumElts) {
5182 if (NumElts == 0)
5183 return;
5184 Register Undef = B.buildUndef(Ty).getReg(0);
5185 for (int I = 0; I != NumElts; ++I)
5186 ResultRegs.push_back(Undef);
5187 };
5188
5189 // Pad out any elements eliminated due to the dmask.
5190 LLT ResTy = MRI->getType(ResultRegs[0]);
5191 if (!ResTy.isVector()) {
5192 padWithUndef(ResTy, NumElts - ResultRegs.size());
5193 B.buildBuildVector(DstReg, ResultRegs);
5194 return true;
5195 }
5196
5197 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5198 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5199
5200 // Deal with the one annoying legal case.
5201 const LLT V3S16 = LLT::fixed_vector(3, 16);
5202 if (Ty == V3S16) {
5203 if (IsTFE) {
5204 if (ResultRegs.size() == 1) {
5205 NewResultReg = ResultRegs[0];
5206 } else if (ResultRegs.size() == 2) {
5207 LLT V4S16 = LLT::fixed_vector(4, 16);
5208 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5209 } else {
5210 return false;
5211 }
5212 }
5213
5214 if (MRI->getType(DstReg).getNumElements() <
5215 MRI->getType(NewResultReg).getNumElements()) {
5216 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5217 } else {
5218 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5219 }
5220 return true;
5221 }
5222
5223 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5224 B.buildConcatVectors(DstReg, ResultRegs);
5225 return true;
5226 }
5227
5228 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5229 LegalizerHelper &Helper, MachineInstr &MI) const {
5230 MachineIRBuilder &B = Helper.MIRBuilder;
5231 GISelChangeObserver &Observer = Helper.Observer;
5232
5233 Register Dst = MI.getOperand(0).getReg();
5234 LLT Ty = B.getMRI()->getType(Dst);
5235 unsigned Size = Ty.getSizeInBits();
5236 MachineFunction &MF = B.getMF();
5237
5238 Observer.changingInstr(MI);
5239
5240 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5241 Ty = getBitcastRegisterType(Ty);
5242 Helper.bitcastDst(MI, Ty, 0);
5243 Dst = MI.getOperand(0).getReg();
5244 B.setInsertPt(B.getMBB(), MI);
5245 }
5246
5247 // FIXME: We don't really need this intermediate instruction. The intrinsic
5248 // should be fixed to have a memory operand. Since it's readnone, we're not
5249 // allowed to add one.
5250 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5251 MI.removeOperand(1); // Remove intrinsic ID
5252
5253 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5254 // TODO: Should this use datalayout alignment?
5255 const unsigned MemSize = (Size + 7) / 8;
5256 const Align MemAlign(4);
5257 MachineMemOperand *MMO = MF.getMachineMemOperand(
5258 MachinePointerInfo(),
5259 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5260 MachineMemOperand::MOInvariant,
5261 MemSize, MemAlign);
5262 MI.addMemOperand(MF, MMO);
5263
5264 // There are no 96-bit result scalar loads, but widening to 128-bit should
5265 // always be legal. We may need to restore this to a 96-bit result if it turns
5266 // out this needs to be converted to a vector load during RegBankSelect.
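  // For example, a 96-bit <3 x s32> result is widened here to <4 x s32>.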
5267 if (!isPowerOf2_32(Size)) {
5268 if (Ty.isVector())
5269 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5270 else
5271 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5272 }
5273
5274 Observer.changedInstr(MI);
5275 return true;
5276 }
5277
5278 // TODO: Move to selection
5279 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5280 MachineRegisterInfo &MRI,
5281 MachineIRBuilder &B) const {
5282 if (!ST.isTrapHandlerEnabled() ||
5283 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5284 return legalizeTrapEndpgm(MI, MRI, B);
5285
5286 if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5287 switch (*HsaAbiVer) {
5288 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5289 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5290 return legalizeTrapHsaQueuePtr(MI, MRI, B);
5291 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5292 case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5293 return ST.supportsGetDoorbellID() ?
5294 legalizeTrapHsa(MI, MRI, B) :
5295 legalizeTrapHsaQueuePtr(MI, MRI, B);
5296 }
5297 }
5298
5299 llvm_unreachable("Unknown trap handler");
5300 }
5301
5302 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5303 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5304 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5305 MI.eraseFromParent();
5306 return true;
5307 }
5308
5309 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5310 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5311 MachineFunction &MF = B.getMF();
5312 const LLT S64 = LLT::scalar(64);
5313
5314 Register SGPR01(AMDGPU::SGPR0_SGPR1);
5315 // For code object version 5, queue_ptr is passed through implicit kernarg.
5316 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5317 AMDGPUTargetLowering::ImplicitParameter Param =
5318 AMDGPUTargetLowering::QUEUE_PTR;
5319 uint64_t Offset =
5320 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5321
5322 Register KernargPtrReg = MRI.createGenericVirtualRegister(
5323 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5324
5325 if (!loadInputValue(KernargPtrReg, B,
5326 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5327 return false;
5328
5329 // TODO: can we be smarter about machine pointer info?
5330 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5331 MachineMemOperand *MMO = MF.getMachineMemOperand(
5332 PtrInfo,
5333 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5334 MachineMemOperand::MOInvariant,
5335 LLT::scalar(64), commonAlignment(Align(64), Offset));
5336
5337 // Pointer address
5338 Register LoadAddr = MRI.createGenericVirtualRegister(
5339 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5340 B.buildPtrAdd(LoadAddr, KernargPtrReg,
5341 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5342 // Load address
5343 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5344 B.buildCopy(SGPR01, Temp);
5345 B.buildInstr(AMDGPU::S_TRAP)
5346 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5347 .addReg(SGPR01, RegState::Implicit);
5348 MI.eraseFromParent();
5349 return true;
5350 }
5351
5352 // Pass queue pointer to trap handler as input, and insert trap instruction
5353 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5354 Register LiveIn =
5355 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5356 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5357 return false;
5358
5359 B.buildCopy(SGPR01, LiveIn);
5360 B.buildInstr(AMDGPU::S_TRAP)
5361 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5362 .addReg(SGPR01, RegState::Implicit);
5363
5364 MI.eraseFromParent();
5365 return true;
5366 }
5367
5368 bool AMDGPULegalizerInfo::legalizeTrapHsa(
5369 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5370 B.buildInstr(AMDGPU::S_TRAP)
5371 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5372 MI.eraseFromParent();
5373 return true;
5374 }
5375
5376 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5377 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5378   // If this is a non-HSA path or the trap handler is disabled, report a
5379   // warning accordingly.
5380 if (!ST.isTrapHandlerEnabled() ||
5381 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5382 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5383 "debugtrap handler not supported",
5384 MI.getDebugLoc(), DS_Warning);
5385 LLVMContext &Ctx = B.getMF().getFunction().getContext();
5386 Ctx.diagnose(NoTrap);
5387 } else {
5388 // Insert debug-trap instruction
5389 B.buildInstr(AMDGPU::S_TRAP)
5390 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5391 }
5392
5393 MI.eraseFromParent();
5394 return true;
5395 }
5396
5397 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5398 MachineIRBuilder &B) const {
5399 MachineRegisterInfo &MRI = *B.getMRI();
5400 const LLT S16 = LLT::scalar(16);
5401 const LLT S32 = LLT::scalar(32);
5402 const LLT V2S16 = LLT::fixed_vector(2, 16);
5403 const LLT V3S32 = LLT::fixed_vector(3, 32);
5404
5405 Register DstReg = MI.getOperand(0).getReg();
5406 Register NodePtr = MI.getOperand(2).getReg();
5407 Register RayExtent = MI.getOperand(3).getReg();
5408 Register RayOrigin = MI.getOperand(4).getReg();
5409 Register RayDir = MI.getOperand(5).getReg();
5410 Register RayInvDir = MI.getOperand(6).getReg();
5411 Register TDescr = MI.getOperand(7).getReg();
5412
5413 if (!ST.hasGFX10_AEncoding()) {
5414 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5415 "intrinsic not supported on subtarget",
5416 MI.getDebugLoc());
5417 B.getMF().getFunction().getContext().diagnose(BadIntrin);
5418 return false;
5419 }
5420
5421 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5422 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5423 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5424 const unsigned NumVDataDwords = 4;
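  // Address dwords: node_ptr (1, or 2 for 64-bit) + ray_extent (1) +
  // ray_origin (3) + ray_dir/ray_inv_dir (6, or 3 when packed as 16-bit).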
5425 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
5426 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
5427 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
5428 const unsigned BaseOpcodes[2][2] = {
5429 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5430 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5431 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5432 int Opcode;
5433 if (UseNSA) {
5434 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5435 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
5436 : AMDGPU::MIMGEncGfx10NSA,
5437 NumVDataDwords, NumVAddrDwords);
5438 } else {
5439 Opcode = AMDGPU::getMIMGOpcode(
5440 BaseOpcodes[Is64][IsA16],
5441 IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5442 NumVDataDwords, NumVAddrDwords);
5443 }
5444 assert(Opcode != -1);
5445
5446 SmallVector<Register, 12> Ops;
5447 if (UseNSA && IsGFX11Plus) {
5448 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
5449 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5450 auto Merged = B.buildMergeLikeInstr(
5451 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
5452 Ops.push_back(Merged.getReg(0));
5453 };
5454
5455 Ops.push_back(NodePtr);
5456 Ops.push_back(RayExtent);
5457 packLanes(RayOrigin);
5458
5459 if (IsA16) {
5460 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5461 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5462 auto MergedDir = B.buildMergeLikeInstr(
5463 V3S32,
5464 {B.buildBitcast(
5465 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
5466 UnmergeRayDir.getReg(0)}))
5467 .getReg(0),
5468 B.buildBitcast(
5469 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
5470 UnmergeRayDir.getReg(1)}))
5471 .getReg(0),
5472 B.buildBitcast(
5473 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
5474 UnmergeRayDir.getReg(2)}))
5475 .getReg(0)});
5476 Ops.push_back(MergedDir.getReg(0));
5477 } else {
5478 packLanes(RayDir);
5479 packLanes(RayInvDir);
5480 }
5481 } else {
5482 if (Is64) {
5483 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5484 Ops.push_back(Unmerge.getReg(0));
5485 Ops.push_back(Unmerge.getReg(1));
5486 } else {
5487 Ops.push_back(NodePtr);
5488 }
5489 Ops.push_back(RayExtent);
5490
5491 auto packLanes = [&Ops, &S32, &B](Register Src) {
5492 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5493 Ops.push_back(Unmerge.getReg(0));
5494 Ops.push_back(Unmerge.getReg(1));
5495 Ops.push_back(Unmerge.getReg(2));
5496 };
5497
5498 packLanes(RayOrigin);
5499 if (IsA16) {
5500 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5501 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5502 Register R1 = MRI.createGenericVirtualRegister(S32);
5503 Register R2 = MRI.createGenericVirtualRegister(S32);
5504 Register R3 = MRI.createGenericVirtualRegister(S32);
5505 B.buildMergeLikeInstr(R1,
5506 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5507 B.buildMergeLikeInstr(
5508 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5509 B.buildMergeLikeInstr(
5510 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5511 Ops.push_back(R1);
5512 Ops.push_back(R2);
5513 Ops.push_back(R3);
5514 } else {
5515 packLanes(RayDir);
5516 packLanes(RayInvDir);
5517 }
5518 }
5519
5520 if (!UseNSA) {
5521     // Build a single vector containing all the operands prepared so far.
5522 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5523 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
5524 Ops.clear();
5525 Ops.push_back(MergedOps);
5526 }
5527
5528 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5529 .addDef(DstReg)
5530 .addImm(Opcode);
5531
5532 for (Register R : Ops) {
5533 MIB.addUse(R);
5534 }
5535
5536 MIB.addUse(TDescr)
5537 .addImm(IsA16 ? 1 : 0)
5538 .cloneMemRefs(MI);
5539
5540 MI.eraseFromParent();
5541 return true;
5542 }
5543
5544 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5545 MachineIRBuilder &B) const {
5546 unsigned Opc;
5547 int RoundMode = MI.getOperand(2).getImm();
5548
5549 if (RoundMode == (int)RoundingMode::TowardPositive)
5550 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5551 else if (RoundMode == (int)RoundingMode::TowardNegative)
5552 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5553 else
5554 return false;
5555
5556 B.buildInstr(Opc)
5557 .addDef(MI.getOperand(0).getReg())
5558 .addUse(MI.getOperand(1).getReg());
5559
5560 MI.eraseFromParent();
5561
5562 return true;
5563 }
5564
5565 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5566 MachineInstr &MI) const {
5567 MachineIRBuilder &B = Helper.MIRBuilder;
5568 MachineRegisterInfo &MRI = *B.getMRI();
5569
5570   // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
5571 auto IntrID = MI.getIntrinsicID();
5572 switch (IntrID) {
5573 case Intrinsic::amdgcn_if:
5574 case Intrinsic::amdgcn_else: {
5575 MachineInstr *Br = nullptr;
5576 MachineBasicBlock *UncondBrTarget = nullptr;
5577 bool Negated = false;
5578 if (MachineInstr *BrCond =
5579 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5580 const SIRegisterInfo *TRI
5581 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5582
5583 Register Def = MI.getOperand(1).getReg();
5584 Register Use = MI.getOperand(3).getReg();
5585
5586 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5587
5588 if (Negated)
5589 std::swap(CondBrTarget, UncondBrTarget);
5590
5591 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5592 if (IntrID == Intrinsic::amdgcn_if) {
5593 B.buildInstr(AMDGPU::SI_IF)
5594 .addDef(Def)
5595 .addUse(Use)
5596 .addMBB(UncondBrTarget);
5597 } else {
5598 B.buildInstr(AMDGPU::SI_ELSE)
5599 .addDef(Def)
5600 .addUse(Use)
5601 .addMBB(UncondBrTarget);
5602 }
5603
5604 if (Br) {
5605 Br->getOperand(0).setMBB(CondBrTarget);
5606 } else {
5607 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5608 // since we're swapping branch targets it needs to be reinserted.
5609 // FIXME: IRTranslator should probably not do this
5610 B.buildBr(*CondBrTarget);
5611 }
5612
5613 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5614 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5615 MI.eraseFromParent();
5616 BrCond->eraseFromParent();
5617 return true;
5618 }
5619
5620 return false;
5621 }
5622 case Intrinsic::amdgcn_loop: {
5623 MachineInstr *Br = nullptr;
5624 MachineBasicBlock *UncondBrTarget = nullptr;
5625 bool Negated = false;
5626 if (MachineInstr *BrCond =
5627 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5628 const SIRegisterInfo *TRI
5629 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5630
5631 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5632 Register Reg = MI.getOperand(2).getReg();
5633
5634 if (Negated)
5635 std::swap(CondBrTarget, UncondBrTarget);
5636
5637 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5638 B.buildInstr(AMDGPU::SI_LOOP)
5639 .addUse(Reg)
5640 .addMBB(UncondBrTarget);
5641
5642 if (Br)
5643 Br->getOperand(0).setMBB(CondBrTarget);
5644 else
5645 B.buildBr(*CondBrTarget);
5646
5647 MI.eraseFromParent();
5648 BrCond->eraseFromParent();
5649 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5650 return true;
5651 }
5652
5653 return false;
5654 }
5655 case Intrinsic::amdgcn_kernarg_segment_ptr:
5656 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5657 // This only makes sense to call in a kernel, so just lower to null.
5658 B.buildConstant(MI.getOperand(0).getReg(), 0);
5659 MI.eraseFromParent();
5660 return true;
5661 }
5662
5663 return legalizePreloadedArgIntrin(
5664 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5665 case Intrinsic::amdgcn_implicitarg_ptr:
5666 return legalizeImplicitArgPtr(MI, MRI, B);
5667 case Intrinsic::amdgcn_workitem_id_x:
5668 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5669 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5670 case Intrinsic::amdgcn_workitem_id_y:
5671 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5672 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5673 case Intrinsic::amdgcn_workitem_id_z:
5674 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5675 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5676 case Intrinsic::amdgcn_workgroup_id_x:
5677 return legalizePreloadedArgIntrin(MI, MRI, B,
5678 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5679 case Intrinsic::amdgcn_workgroup_id_y:
5680 return legalizePreloadedArgIntrin(MI, MRI, B,
5681 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5682 case Intrinsic::amdgcn_workgroup_id_z:
5683 return legalizePreloadedArgIntrin(MI, MRI, B,
5684 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5685 case Intrinsic::amdgcn_lds_kernel_id:
5686 return legalizePreloadedArgIntrin(MI, MRI, B,
5687 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5688 case Intrinsic::amdgcn_dispatch_ptr:
5689 return legalizePreloadedArgIntrin(MI, MRI, B,
5690 AMDGPUFunctionArgInfo::DISPATCH_PTR);
5691 case Intrinsic::amdgcn_queue_ptr:
5692 return legalizePreloadedArgIntrin(MI, MRI, B,
5693 AMDGPUFunctionArgInfo::QUEUE_PTR);
5694 case Intrinsic::amdgcn_implicit_buffer_ptr:
5695 return legalizePreloadedArgIntrin(
5696 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5697 case Intrinsic::amdgcn_dispatch_id:
5698 return legalizePreloadedArgIntrin(MI, MRI, B,
5699 AMDGPUFunctionArgInfo::DISPATCH_ID);
5700 case Intrinsic::r600_read_ngroups_x:
5701 // TODO: Emit error for hsa
5702 return legalizeKernargMemParameter(MI, B,
5703 SI::KernelInputOffsets::NGROUPS_X);
5704 case Intrinsic::r600_read_ngroups_y:
5705 return legalizeKernargMemParameter(MI, B,
5706 SI::KernelInputOffsets::NGROUPS_Y);
5707 case Intrinsic::r600_read_ngroups_z:
5708 return legalizeKernargMemParameter(MI, B,
5709 SI::KernelInputOffsets::NGROUPS_Z);
5710 case Intrinsic::r600_read_local_size_x:
5711 // TODO: Could insert G_ASSERT_ZEXT from s16
5712 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5713 case Intrinsic::r600_read_local_size_y:
5714 // TODO: Could insert G_ASSERT_ZEXT from s16
5715 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
5716   case Intrinsic::r600_read_local_size_z:
5717     // TODO: Could insert G_ASSERT_ZEXT from s16
5718     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5719 case Intrinsic::r600_read_global_size_x:
5720 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5721 case Intrinsic::r600_read_global_size_y:
5722 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5723 case Intrinsic::r600_read_global_size_z:
5724 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5725 case Intrinsic::amdgcn_fdiv_fast:
5726 return legalizeFDIVFastIntrin(MI, MRI, B);
5727 case Intrinsic::amdgcn_is_shared:
5728 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5729 case Intrinsic::amdgcn_is_private:
5730 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5731 case Intrinsic::amdgcn_wavefrontsize: {
5732 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5733 MI.eraseFromParent();
5734 return true;
5735 }
5736 case Intrinsic::amdgcn_s_buffer_load:
5737 return legalizeSBufferLoad(Helper, MI);
5738 case Intrinsic::amdgcn_raw_buffer_store:
5739 case Intrinsic::amdgcn_struct_buffer_store:
5740 return legalizeBufferStore(MI, MRI, B, false, false);
5741 case Intrinsic::amdgcn_raw_buffer_store_format:
5742 case Intrinsic::amdgcn_struct_buffer_store_format:
5743 return legalizeBufferStore(MI, MRI, B, false, true);
5744 case Intrinsic::amdgcn_raw_tbuffer_store:
5745 case Intrinsic::amdgcn_struct_tbuffer_store:
5746 return legalizeBufferStore(MI, MRI, B, true, true);
5747 case Intrinsic::amdgcn_raw_buffer_load:
5748 case Intrinsic::amdgcn_struct_buffer_load:
5749 return legalizeBufferLoad(MI, MRI, B, false, false);
5750 case Intrinsic::amdgcn_raw_buffer_load_format:
5751 case Intrinsic::amdgcn_struct_buffer_load_format:
5752 return legalizeBufferLoad(MI, MRI, B, true, false);
5753 case Intrinsic::amdgcn_raw_tbuffer_load:
5754 case Intrinsic::amdgcn_struct_tbuffer_load:
5755 return legalizeBufferLoad(MI, MRI, B, true, true);
5756 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5757 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5758 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5759 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5760 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5761 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5762 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5763 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5764 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5765 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5766 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5767 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5768 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5769 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5770 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5771 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5772 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5773 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5774 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5775 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5776 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5777 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5778 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5779 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5780 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5781 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5782 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5783 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5784 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5785 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5786 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5787 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5788 return legalizeBufferAtomic(MI, B, IntrID);
5789 case Intrinsic::amdgcn_atomic_inc:
5790 return legalizeAtomicIncDec(MI, B, true);
5791 case Intrinsic::amdgcn_atomic_dec:
5792 return legalizeAtomicIncDec(MI, B, false);
5793 case Intrinsic::trap:
5794 return legalizeTrapIntrinsic(MI, MRI, B);
5795 case Intrinsic::debugtrap:
5796 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5797 case Intrinsic::amdgcn_rsq_clamp:
5798 return legalizeRsqClampIntrinsic(MI, MRI, B);
5799 case Intrinsic::amdgcn_ds_fadd:
5800 case Intrinsic::amdgcn_ds_fmin:
5801 case Intrinsic::amdgcn_ds_fmax:
5802 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5803 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5804 return legalizeBVHIntrinsic(MI, B);
5805 default: {
5806 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5807 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5808 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5809 return true;
5810 }
5811 }
5812
5813 return true;
5814 }
5815