1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "SIRegisterInfo.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ScopeExit.h"
26 #include "llvm/BinaryFormat/ELF.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
36
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 " rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
52
53 static constexpr unsigned MaxRegisterSize = 1024;
54
55 // Round the number of elements to the next power of two elements
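// e.g., <3 x s16> becomes <4 x s16>, and <5 x s32> becomes <8 x s32>.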
56 static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60 }
61
62 // Round the number of bits to the next power of two bits
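// e.g., s24 becomes s32, and s96 becomes s128.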
63 static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67 }
68
69 /// \returns true if this is an odd-sized vector that should be widened by
70 /// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
71 /// This excludes s1 vectors, which should always be scalarized.
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84 }
85
86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91 }
92
93 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99 }
100
101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108 }
109
110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
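// e.g., a 96-bit v3s32 needs 2 pieces, so the new type is v2s32.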
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120 }
121
122 // Increase the number of vector elements so that the total size reaches the
123 // next multiple of 32 bits.
124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
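// e.g., v3s8 (24 bits) widens to v4s8 (32 bits); v5s16 (80 bits) widens to v6s16 (96 bits).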
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138 }
139
140 // Increase the number of vector elements to reach the next legal RegClass.
141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
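// e.g., v7s64 (448 bits) has no matching SGPR class, so it is widened to v8s64 (512 bits).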
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160 }
161
162 static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167 }
168
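// e.g., a p8 resource becomes <4 x s32>, and <2 x p8> becomes <8 x s32>.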
169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174 }
175
176 static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186 }
187
188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193 }
194
195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202 };
203 }
204
205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210 }
211
212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217 }
218
219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224 }
225
226 static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228 }
229
230 static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233 }
234
235 static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240 }
241
242 static bool isRegisterType(LLT Ty) {
243 if (!isRegisterSize(Ty.getSizeInBits()))
244 return false;
245
246 if (Ty.isVector())
247 return isRegisterVectorType(Ty);
248
249 return true;
250 }
251
252 // Any combination of 32 or 64-bit elements up to the maximum register size, and
253 // multiples of v2s16.
254 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
255 return [=](const LegalityQuery &Query) {
256 return isRegisterType(Query.Types[TypeIdx]);
257 };
258 }
259
260 // RegisterType that doesn't have a corresponding RegClass.
261 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
262 return [=](const LegalityQuery &Query) {
263 LLT Ty = Query.Types[TypeIdx];
264 return isRegisterType(Ty) &&
265 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
266 };
267 }
268
269 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
270 return [=](const LegalityQuery &Query) {
271 const LLT QueryTy = Query.Types[TypeIdx];
272 if (!QueryTy.isVector())
273 return false;
274 const LLT EltTy = QueryTy.getElementType();
275 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
276 };
277 }
278
279 // If we have a truncating store or an extending load with a data size larger
280 // than 32-bits, we need to reduce to a 32-bit type.
281 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
282 return [=](const LegalityQuery &Query) {
283 const LLT Ty = Query.Types[TypeIdx];
284 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
285 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
286 };
287 }
288
289 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
290 // handle some operations by just promoting the register during
291 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
292 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
293 bool IsLoad, bool IsAtomic) {
294 switch (AS) {
295 case AMDGPUAS::PRIVATE_ADDRESS:
296 // FIXME: Private element size.
297 return ST.enableFlatScratch() ? 128 : 32;
298 case AMDGPUAS::LOCAL_ADDRESS:
299 return ST.useDS128() ? 128 : 64;
300 case AMDGPUAS::GLOBAL_ADDRESS:
301 case AMDGPUAS::CONSTANT_ADDRESS:
302 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
303 case AMDGPUAS::BUFFER_RESOURCE:
304 // Treat constant and global as identical. SMRD loads are sometimes usable for
305 // global loads (ideally constant address space should be eliminated)
306 // depending on the context. Legality cannot be context dependent, but
307 // RegBankSelect can split the load as necessary depending on the pointer
308 // register bank/uniformity and if the memory is invariant or not written in a
309 // kernel.
310 return IsLoad ? 512 : 128;
311 default:
312 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
313 // if they may alias scratch depending on the subtarget. This needs to be
314 // moved to custom handling to use addressMayBeAccessedAsPrivate
315 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
316 }
317 }
318
319 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
320 const LegalityQuery &Query) {
321 const LLT Ty = Query.Types[0];
322
323 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
324 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
325
326 unsigned RegSize = Ty.getSizeInBits();
327 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
328 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
329 unsigned AS = Query.Types[1].getAddressSpace();
330
331 // All of these need to be custom lowered to cast the pointer operand.
332 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
333 return false;
334
335 // Do not handle extending vector loads.
336 if (Ty.isVector() && MemSize != RegSize)
337 return false;
338
339 // TODO: We should be able to widen loads if the alignment is high enough, but
340 // we also need to modify the memory access size.
341 #if 0
342 // Accept widening loads based on alignment.
343 if (IsLoad && MemSize < Size)
344 MemSize = std::max(MemSize, Align);
345 #endif
346
347 // Only 1-byte and 2-byte to 32-bit extloads are valid.
348 if (MemSize != RegSize && RegSize != 32)
349 return false;
350
351 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
352 Query.MMODescrs[0].Ordering !=
353 AtomicOrdering::NotAtomic))
354 return false;
355
356 switch (MemSize) {
357 case 8:
358 case 16:
359 case 32:
360 case 64:
361 case 128:
362 break;
363 case 96:
364 if (!ST.hasDwordx3LoadStores())
365 return false;
366 break;
367 case 256:
368 case 512:
369 // These may contextually need to be broken down.
370 break;
371 default:
372 return false;
373 }
374
375 assert(RegSize >= MemSize);
376
377 if (AlignBits < MemSize) {
378 const SITargetLowering *TLI = ST.getTargetLowering();
379 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
380 Align(AlignBits / 8)))
381 return false;
382 }
383
384 return true;
385 }
386
387 // The newer buffer intrinsic forms take their resource arguments as
388 // pointers in address space 8, aka s128 values. However, in order to not break
389 // SelectionDAG, the underlying operations have to continue to take v4i32
390 // arguments. Therefore, we convert resource pointers - or vectors of them -
391 // to integer values here.
392 static bool hasBufferRsrcWorkaround(const LLT Ty) {
393 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
394 return true;
395 if (Ty.isVector()) {
396 const LLT ElemTy = Ty.getElementType();
397 return hasBufferRsrcWorkaround(ElemTy);
398 }
399 return false;
400 }
401
402 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
403 // work around this. Eventually it should ignore the type for loads and only care
404 // about the size. Return true in cases where we will work around this for now by
405 // bitcasting.
406 static bool loadStoreBitcastWorkaround(const LLT Ty) {
407 if (EnableNewLegality)
408 return false;
409
410 const unsigned Size = Ty.getSizeInBits();
411 if (Size <= 64)
412 return false;
413 // Address space 8 pointers get their own workaround.
414 if (hasBufferRsrcWorkaround(Ty))
415 return false;
416 if (!Ty.isVector())
417 return true;
418
419 LLT EltTy = Ty.getElementType();
420 if (EltTy.isPointer())
421 return true;
422
423 unsigned EltSize = EltTy.getSizeInBits();
424 return EltSize != 32 && EltSize != 64;
425 }
426
427 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
428 const LLT Ty = Query.Types[0];
429 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
430 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
431 }
432
433 /// Return true if a load or store of the type should be lowered with a bitcast
434 /// to a different type.
435 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
436 const LLT MemTy) {
437 const unsigned MemSizeInBits = MemTy.getSizeInBits();
438 const unsigned Size = Ty.getSizeInBits();
439 if (Size != MemSizeInBits)
440 return Size <= 32 && Ty.isVector();
441
442 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
443 return true;
444
445 // Don't try to handle bitcasting vector ext loads for now.
446 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
447 (Size <= 32 || isRegisterSize(Size)) &&
448 !isRegisterVectorElementType(Ty.getElementType());
449 }
450
451 /// Return true if we should legalize a load by widening an odd-sized memory
452 /// access up to the alignment. Note that in this case the memory access itself
453 /// changes, not the size of the result register.
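/// e.g., on subtargets without 96-bit memory operations, a 96-bit load with
/// 128-bit alignment may be widened to a 128-bit load.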
454 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
455 uint64_t AlignInBits, unsigned AddrSpace,
456 unsigned Opcode) {
457 unsigned SizeInBits = MemoryTy.getSizeInBits();
458 // We don't want to widen cases that are naturally legal.
459 if (isPowerOf2_32(SizeInBits))
460 return false;
461
462 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
463 // end up widening these for a scalar load during RegBankSelect, if we don't
464 // have 96-bit scalar loads.
465 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
466 return false;
467
468 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
469 return false;
470
471 // A load is known dereferenceable up to the alignment, so it's legal to widen
472 // to it.
473 //
474 // TODO: Could check dereferenceable for less aligned cases.
475 unsigned RoundedSize = NextPowerOf2(SizeInBits);
476 if (AlignInBits < RoundedSize)
477 return false;
478
479 // Do not widen if it would introduce a slow unaligned load.
480 const SITargetLowering *TLI = ST.getTargetLowering();
481 unsigned Fast = 0;
482 return TLI->allowsMisalignedMemoryAccessesImpl(
483 RoundedSize, AddrSpace, Align(AlignInBits / 8),
484 MachineMemOperand::MOLoad, &Fast) &&
485 Fast;
486 }
487
488 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
489 unsigned Opcode) {
490 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
491 return false;
492
493 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
494 Query.MMODescrs[0].AlignInBits,
495 Query.Types[1].getAddressSpace(), Opcode);
496 }
497
498 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
499 /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
500 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
501 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
502 MachineRegisterInfo &MRI, unsigned Idx) {
503 MachineOperand &MO = MI.getOperand(Idx);
504
505 const LLT PointerTy = MRI.getType(MO.getReg());
506
507 // Paranoidly prevent us from doing this multiple times.
508 if (!hasBufferRsrcWorkaround(PointerTy))
509 return PointerTy;
510
511 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
512 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
513 if (!PointerTy.isVector()) {
514 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
515 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
516 const LLT S32 = LLT::scalar(32);
517
518 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
519 std::array<Register, 4> VectorElems;
520 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
521 for (unsigned I = 0; I < NumParts; ++I)
522 VectorElems[I] =
523 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
524 B.buildMergeValues(MO, VectorElems);
525 MO.setReg(VectorReg);
526 return VectorTy;
527 }
528 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
529 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
530 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
531 B.buildIntToPtr(MO, Scalar);
532 MO.setReg(BitcastReg);
533
534 return VectorTy;
535 }
536
537 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
538 /// the form in which the value must be in order to be passed to the low-level
539 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
540 /// needed in order to account for the fact that we can't define a register
541 /// class for s128 without breaking SelectionDAG.
542 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
543 MachineRegisterInfo &MRI = *B.getMRI();
544 const LLT PointerTy = MRI.getType(Pointer);
545 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
546 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
547
548 if (!PointerTy.isVector()) {
549 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
550 SmallVector<Register, 4> PointerParts;
551 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
552 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
553 for (unsigned I = 0; I < NumParts; ++I)
554 PointerParts.push_back(Unmerged.getReg(I));
555 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
556 }
557 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
558 return B.buildBitcast(VectorTy, Scalar).getReg(0);
559 }
560
561 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
562 unsigned Idx) {
563 MachineOperand &MO = MI.getOperand(Idx);
564
565 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
566 // Paranoidly prevent us from doing this multiple times.
567 if (!hasBufferRsrcWorkaround(PointerTy))
568 return;
569 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
570 }
571
572 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
573 const GCNTargetMachine &TM)
574 : ST(ST_) {
575 using namespace TargetOpcode;
576
577 auto GetAddrSpacePtr = [&TM](unsigned AS) {
578 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
579 };
580
581 const LLT S1 = LLT::scalar(1);
582 const LLT S8 = LLT::scalar(8);
583 const LLT S16 = LLT::scalar(16);
584 const LLT S32 = LLT::scalar(32);
585 const LLT S64 = LLT::scalar(64);
586 const LLT S128 = LLT::scalar(128);
587 const LLT S256 = LLT::scalar(256);
588 const LLT S512 = LLT::scalar(512);
589 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
590
591 const LLT V2S8 = LLT::fixed_vector(2, 8);
592 const LLT V2S16 = LLT::fixed_vector(2, 16);
593 const LLT V4S16 = LLT::fixed_vector(4, 16);
594
595 const LLT V2S32 = LLT::fixed_vector(2, 32);
596 const LLT V3S32 = LLT::fixed_vector(3, 32);
597 const LLT V4S32 = LLT::fixed_vector(4, 32);
598 const LLT V5S32 = LLT::fixed_vector(5, 32);
599 const LLT V6S32 = LLT::fixed_vector(6, 32);
600 const LLT V7S32 = LLT::fixed_vector(7, 32);
601 const LLT V8S32 = LLT::fixed_vector(8, 32);
602 const LLT V9S32 = LLT::fixed_vector(9, 32);
603 const LLT V10S32 = LLT::fixed_vector(10, 32);
604 const LLT V11S32 = LLT::fixed_vector(11, 32);
605 const LLT V12S32 = LLT::fixed_vector(12, 32);
606 const LLT V13S32 = LLT::fixed_vector(13, 32);
607 const LLT V14S32 = LLT::fixed_vector(14, 32);
608 const LLT V15S32 = LLT::fixed_vector(15, 32);
609 const LLT V16S32 = LLT::fixed_vector(16, 32);
610 const LLT V32S32 = LLT::fixed_vector(32, 32);
611
612 const LLT V2S64 = LLT::fixed_vector(2, 64);
613 const LLT V3S64 = LLT::fixed_vector(3, 64);
614 const LLT V4S64 = LLT::fixed_vector(4, 64);
615 const LLT V5S64 = LLT::fixed_vector(5, 64);
616 const LLT V6S64 = LLT::fixed_vector(6, 64);
617 const LLT V7S64 = LLT::fixed_vector(7, 64);
618 const LLT V8S64 = LLT::fixed_vector(8, 64);
619 const LLT V16S64 = LLT::fixed_vector(16, 64);
620
621 std::initializer_list<LLT> AllS32Vectors =
622 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
623 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
624 std::initializer_list<LLT> AllS64Vectors =
625 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
626
627 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
628 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
629 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
630 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
631 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
632 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
633 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
634 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
635 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
636 const LLT BufferStridedPtr =
637 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
638
639 const LLT CodePtr = FlatPtr;
640
641 const std::initializer_list<LLT> AddrSpaces64 = {
642 GlobalPtr, ConstantPtr, FlatPtr
643 };
644
645 const std::initializer_list<LLT> AddrSpaces32 = {
646 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
647 };
648
649 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
650
651 const std::initializer_list<LLT> FPTypesBase = {
652 S32, S64
653 };
654
655 const std::initializer_list<LLT> FPTypes16 = {
656 S32, S64, S16
657 };
658
659 const std::initializer_list<LLT> FPTypesPK16 = {
660 S32, S64, S16, V2S16
661 };
662
663 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
664
665 // s1 for VCC branches, s32 for SCC branches.
666 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
667
668 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
669 // elements for v3s16
670 getActionDefinitionsBuilder(G_PHI)
671 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
672 .legalFor(AllS32Vectors)
673 .legalFor(AllS64Vectors)
674 .legalFor(AddrSpaces64)
675 .legalFor(AddrSpaces32)
676 .legalFor(AddrSpaces128)
677 .legalIf(isPointer(0))
678 .clampScalar(0, S16, S256)
679 .widenScalarToNextPow2(0, 32)
680 .clampMaxNumElements(0, S32, 16)
681 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682 .scalarize(0);
683
684 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
685 // Full set of gfx9 features.
686 if (ST.hasScalarAddSub64()) {
687 getActionDefinitionsBuilder({G_ADD, G_SUB})
688 .legalFor({S64, S32, S16, V2S16})
689 .clampMaxNumElementsStrict(0, S16, 2)
690 .scalarize(0)
691 .minScalar(0, S16)
692 .widenScalarToNextMultipleOf(0, 32)
693 .maxScalar(0, S32);
694 } else {
695 getActionDefinitionsBuilder({G_ADD, G_SUB})
696 .legalFor({S32, S16, V2S16})
697 .clampMaxNumElementsStrict(0, S16, 2)
698 .scalarize(0)
699 .minScalar(0, S16)
700 .widenScalarToNextMultipleOf(0, 32)
701 .maxScalar(0, S32);
702 }
703
704 if (ST.hasScalarSMulU64()) {
705 getActionDefinitionsBuilder(G_MUL)
706 .legalFor({S64, S32, S16, V2S16})
707 .clampMaxNumElementsStrict(0, S16, 2)
708 .scalarize(0)
709 .minScalar(0, S16)
710 .widenScalarToNextMultipleOf(0, 32)
711 .custom();
712 } else {
713 getActionDefinitionsBuilder(G_MUL)
714 .legalFor({S32, S16, V2S16})
715 .clampMaxNumElementsStrict(0, S16, 2)
716 .scalarize(0)
717 .minScalar(0, S16)
718 .widenScalarToNextMultipleOf(0, 32)
719 .custom();
720 }
721 assert(ST.hasMad64_32());
722
723 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
724 .legalFor({S32, S16, V2S16}) // Clamp modifier
725 .minScalarOrElt(0, S16)
726 .clampMaxNumElementsStrict(0, S16, 2)
727 .scalarize(0)
728 .widenScalarToNextPow2(0, 32)
729 .lower();
730 } else if (ST.has16BitInsts()) {
731 getActionDefinitionsBuilder({G_ADD, G_SUB})
732 .legalFor({S32, S16})
733 .minScalar(0, S16)
734 .widenScalarToNextMultipleOf(0, 32)
735 .maxScalar(0, S32)
736 .scalarize(0);
737
738 getActionDefinitionsBuilder(G_MUL)
739 .legalFor({S32, S16})
740 .scalarize(0)
741 .minScalar(0, S16)
742 .widenScalarToNextMultipleOf(0, 32)
743 .custom();
744 assert(ST.hasMad64_32());
745
746 // Technically the saturating operations require clamp bit support, but this
747 // was introduced at the same time as 16-bit operations.
748 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
749 .legalFor({S32, S16}) // Clamp modifier
750 .minScalar(0, S16)
751 .scalarize(0)
752 .widenScalarToNextPow2(0, 16)
753 .lower();
754
755 // We're just lowering this, but it helps get a better result to try to
756 // coerce to the desired type first.
757 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
758 .minScalar(0, S16)
759 .scalarize(0)
760 .lower();
761 } else {
762 getActionDefinitionsBuilder({G_ADD, G_SUB})
763 .legalFor({S32})
764 .widenScalarToNextMultipleOf(0, 32)
765 .clampScalar(0, S32, S32)
766 .scalarize(0);
767
768 auto &Mul = getActionDefinitionsBuilder(G_MUL)
769 .legalFor({S32})
770 .scalarize(0)
771 .minScalar(0, S32)
772 .widenScalarToNextMultipleOf(0, 32);
773
774 if (ST.hasMad64_32())
775 Mul.custom();
776 else
777 Mul.maxScalar(0, S32);
778
779 if (ST.hasIntClamp()) {
780 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
781 .legalFor({S32}) // Clamp modifier.
782 .scalarize(0)
783 .minScalarOrElt(0, S32)
784 .lower();
785 } else {
786 // Clamp bit support was added in VI, along with 16-bit operations.
787 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
788 .minScalar(0, S32)
789 .scalarize(0)
790 .lower();
791 }
792
793 // FIXME: DAG expansion gets better results. The widening uses the smaller
794 // range values and goes for the min/max lowering directly.
795 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
796 .minScalar(0, S32)
797 .scalarize(0)
798 .lower();
799 }
800
801 getActionDefinitionsBuilder(
802 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
803 .customFor({S32, S64})
804 .clampScalar(0, S32, S64)
805 .widenScalarToNextPow2(0, 32)
806 .scalarize(0);
807
808 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
809 .legalFor({S32})
810 .maxScalar(0, S32);
811
812 if (ST.hasVOP3PInsts()) {
813 Mulh
814 .clampMaxNumElements(0, S8, 2)
815 .lowerFor({V2S8});
816 }
817
818 Mulh
819 .scalarize(0)
820 .lower();
821
822 // Report legal for any types we can handle anywhere. For the cases only legal
823 // on the SALU, RegBankSelect will be able to re-legalize.
824 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
825 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
826 .clampScalar(0, S32, S64)
827 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
828 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
829 .widenScalarToNextPow2(0)
830 .scalarize(0);
831
832 getActionDefinitionsBuilder(
833 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
834 .legalFor({{S32, S1}, {S32, S32}})
835 .clampScalar(0, S32, S32)
836 .scalarize(0);
837
838 getActionDefinitionsBuilder(G_BITCAST)
839 // Don't worry about the size constraint.
840 .legalIf(all(isRegisterType(0), isRegisterType(1)))
841 .lower();
842
843
844 getActionDefinitionsBuilder(G_CONSTANT)
845 .legalFor({S1, S32, S64, S16, GlobalPtr,
846 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
847 .legalIf(isPointer(0))
848 .clampScalar(0, S32, S64)
849 .widenScalarToNextPow2(0);
850
851 getActionDefinitionsBuilder(G_FCONSTANT)
852 .legalFor({S32, S64, S16})
853 .clampScalar(0, S16, S64);
854
855 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
856 .legalIf(isRegisterType(0))
857 // s1 and s16 are special cases because they have legal operations on
858 // them, but don't really occupy registers in the normal way.
859 .legalFor({S1, S16})
860 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
861 .clampScalarOrElt(0, S32, MaxScalar)
862 .widenScalarToNextPow2(0, 32)
863 .clampMaxNumElements(0, S32, 16);
864
865 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
866
867 // If the amount is divergent, we have to do a wave reduction to get the
868 // maximum value, so this is expanded during RegBankSelect.
869 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
870 .legalFor({{PrivatePtr, S32}});
871
872 getActionDefinitionsBuilder(G_STACKSAVE)
873 .customFor({PrivatePtr});
874 getActionDefinitionsBuilder(G_STACKRESTORE)
875 .legalFor({PrivatePtr});
876
877 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
878 .customIf(typeIsNot(0, PrivatePtr));
879
880 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
881
882 auto &FPOpActions = getActionDefinitionsBuilder(
883 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
884 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
885 .legalFor({S32, S64});
886 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
887 .customFor({S32, S64});
888 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
889 .customFor({S32, S64});
890
891 if (ST.has16BitInsts()) {
892 if (ST.hasVOP3PInsts())
893 FPOpActions.legalFor({S16, V2S16});
894 else
895 FPOpActions.legalFor({S16});
896
897 TrigActions.customFor({S16});
898 FDIVActions.customFor({S16});
899 }
900
901 if (ST.hasPackedFP32Ops()) {
902 FPOpActions.legalFor({V2S32});
903 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
904 }
905
906 auto &MinNumMaxNum = getActionDefinitionsBuilder({
907 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
908
909 if (ST.hasVOP3PInsts()) {
910 MinNumMaxNum.customFor(FPTypesPK16)
911 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
912 .clampMaxNumElements(0, S16, 2)
913 .clampScalar(0, S16, S64)
914 .scalarize(0);
915 } else if (ST.has16BitInsts()) {
916 MinNumMaxNum.customFor(FPTypes16)
917 .clampScalar(0, S16, S64)
918 .scalarize(0);
919 } else {
920 MinNumMaxNum.customFor(FPTypesBase)
921 .clampScalar(0, S32, S64)
922 .scalarize(0);
923 }
924
925 if (ST.hasVOP3PInsts())
926 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
927
928 FPOpActions
929 .scalarize(0)
930 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
931
932 TrigActions
933 .scalarize(0)
934 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
935
936 FDIVActions
937 .scalarize(0)
938 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
939
940 getActionDefinitionsBuilder({G_FNEG, G_FABS})
941 .legalFor(FPTypesPK16)
942 .clampMaxNumElementsStrict(0, S16, 2)
943 .scalarize(0)
944 .clampScalar(0, S16, S64);
945
946 if (ST.has16BitInsts()) {
947 getActionDefinitionsBuilder(G_FSQRT)
948 .legalFor({S16})
949 .customFor({S32, S64})
950 .scalarize(0)
951 .unsupported();
952 getActionDefinitionsBuilder(G_FFLOOR)
953 .legalFor({S32, S64, S16})
954 .scalarize(0)
955 .clampScalar(0, S16, S64);
956
957 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
958 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
959 .scalarize(0)
960 .maxScalarIf(typeIs(0, S16), 1, S16)
961 .clampScalar(1, S32, S32)
962 .lower();
963
964 getActionDefinitionsBuilder(G_FFREXP)
965 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
966 .scalarize(0)
967 .lower();
968 } else {
969 getActionDefinitionsBuilder(G_FSQRT)
970 .customFor({S32, S64, S16})
971 .scalarize(0)
972 .unsupported();
973
974
975 if (ST.hasFractBug()) {
976 getActionDefinitionsBuilder(G_FFLOOR)
977 .customFor({S64})
978 .legalFor({S32, S64})
979 .scalarize(0)
980 .clampScalar(0, S32, S64);
981 } else {
982 getActionDefinitionsBuilder(G_FFLOOR)
983 .legalFor({S32, S64})
984 .scalarize(0)
985 .clampScalar(0, S32, S64);
986 }
987
988 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
989 .legalFor({{S32, S32}, {S64, S32}})
990 .scalarize(0)
991 .clampScalar(0, S32, S64)
992 .clampScalar(1, S32, S32)
993 .lower();
994
995 getActionDefinitionsBuilder(G_FFREXP)
996 .customFor({{S32, S32}, {S64, S32}})
997 .scalarize(0)
998 .minScalar(0, S32)
999 .clampScalar(1, S32, S32)
1000 .lower();
1001 }
1002
1003 getActionDefinitionsBuilder(G_FPTRUNC)
1004 .legalFor({{S32, S64}, {S16, S32}})
1005 .scalarize(0)
1006 .lower();
1007
1008 getActionDefinitionsBuilder(G_FPEXT)
1009 .legalFor({{S64, S32}, {S32, S16}})
1010 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1011 .scalarize(0);
1012
1013 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1014 if (ST.has16BitInsts()) {
1015 FSubActions
1016 // Use actual fsub instruction
1017 .legalFor({S32, S16})
1018 // Must use fadd + fneg
1019 .lowerFor({S64, V2S16});
1020 } else {
1021 FSubActions
1022 // Use actual fsub instruction
1023 .legalFor({S32})
1024 // Must use fadd + fneg
1025 .lowerFor({S64, S16, V2S16});
1026 }
1027
1028 FSubActions
1029 .scalarize(0)
1030 .clampScalar(0, S32, S64);
1031
1032 // Whether this is legal depends on the floating point mode for the function.
1033 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1034 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1035 FMad.customFor({S32, S16});
1036 else if (ST.hasMadMacF32Insts())
1037 FMad.customFor({S32});
1038 else if (ST.hasMadF16())
1039 FMad.customFor({S16});
1040 FMad.scalarize(0)
1041 .lower();
1042
1043 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1044 if (ST.has16BitInsts()) {
1045 FRem.customFor({S16, S32, S64});
1046 } else {
1047 FRem.minScalar(0, S32)
1048 .customFor({S32, S64});
1049 }
1050 FRem.scalarize(0);
1051
1052 // TODO: Do we need to clamp maximum bitwidth?
1053 getActionDefinitionsBuilder(G_TRUNC)
1054 .legalIf(isScalar(0))
1055 .legalFor({{V2S16, V2S32}})
1056 .clampMaxNumElements(0, S16, 2)
1057 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1058 // situations (like an invalid implicit use), we don't want to infinite loop
1059 // in the legalizer.
1060 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1061 .alwaysLegal();
1062
1063 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1064 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1065 {S32, S1}, {S64, S1}, {S16, S1}})
1066 .scalarize(0)
1067 .clampScalar(0, S32, S64)
1068 .widenScalarToNextPow2(1, 32);
1069
1070 // TODO: Split s1->s64 during regbankselect for VALU.
1071 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1072 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1073 .lowerIf(typeIs(1, S1))
1074 .customFor({{S32, S64}, {S64, S64}});
1075 if (ST.has16BitInsts())
1076 IToFP.legalFor({{S16, S16}});
1077 IToFP.clampScalar(1, S32, S64)
1078 .minScalar(0, S32)
1079 .scalarize(0)
1080 .widenScalarToNextPow2(1);
1081
1082 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1083 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1084 .customFor({{S64, S32}, {S64, S64}})
1085 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1086 if (ST.has16BitInsts())
1087 FPToI.legalFor({{S16, S16}});
1088 else
1089 FPToI.minScalar(1, S32);
1090
1091 FPToI.minScalar(0, S32)
1092 .widenScalarToNextPow2(0, 32)
1093 .scalarize(0)
1094 .lower();
1095
1096 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1097 .customFor({S16, S32})
1098 .scalarize(0)
1099 .lower();
1100
1101 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1102 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1103 .scalarize(0)
1104 .lower();
1105
1106 if (ST.has16BitInsts()) {
1107 getActionDefinitionsBuilder(
1108 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1109 .legalFor({S16, S32, S64})
1110 .clampScalar(0, S16, S64)
1111 .scalarize(0);
1112 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1113 getActionDefinitionsBuilder(
1114 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1115 .legalFor({S32, S64})
1116 .clampScalar(0, S32, S64)
1117 .scalarize(0);
1118 } else {
1119 getActionDefinitionsBuilder(
1120 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1121 .legalFor({S32})
1122 .customFor({S64})
1123 .clampScalar(0, S32, S64)
1124 .scalarize(0);
1125 }
1126
1127 getActionDefinitionsBuilder(G_PTR_ADD)
1128 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1129 .legalIf(all(isPointer(0), sameSize(0, 1)))
1130 .scalarize(0)
1131 .scalarSameSizeAs(1, 0);
1132
1133 getActionDefinitionsBuilder(G_PTRMASK)
1134 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1135 .scalarSameSizeAs(1, 0)
1136 .scalarize(0);
1137
1138 auto &CmpBuilder =
1139 getActionDefinitionsBuilder(G_ICMP)
1140 // The compare output type differs based on the register bank of the output,
1141 // so make both s1 and s32 legal.
1142 //
1143 // Scalar compares producing output in scc will be promoted to s32, as that
1144 // is the allocatable register type that will be needed for the copy from
1145 // scc. This will be promoted during RegBankSelect, and we assume something
1146 // before that won't try to use s32 result types.
1147 //
1148 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1149 // bank.
1150 .legalForCartesianProduct(
1151 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1152 .legalForCartesianProduct(
1153 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1154 if (ST.has16BitInsts()) {
1155 CmpBuilder.legalFor({{S1, S16}});
1156 }
1157
1158 CmpBuilder
1159 .widenScalarToNextPow2(1)
1160 .clampScalar(1, S32, S64)
1161 .scalarize(0)
1162 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1163
1164 auto &FCmpBuilder =
1165 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1166 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1167
1168 if (ST.hasSALUFloatInsts())
1169 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1170
1171 FCmpBuilder
1172 .widenScalarToNextPow2(1)
1173 .clampScalar(1, S32, S64)
1174 .scalarize(0);
1175
1176 // FIXME: fpow has a selection pattern that should move to custom lowering.
1177 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1178 if (ST.has16BitInsts())
1179 ExpOps.customFor({{S32}, {S16}});
1180 else
1181 ExpOps.customFor({S32});
1182 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1183 .scalarize(0);
1184
1185 getActionDefinitionsBuilder(G_FPOWI)
1186 .clampScalar(0, MinScalarFPTy, S32)
1187 .lower();
1188
1189 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1190 Log2Ops.customFor({S32});
1191 if (ST.has16BitInsts())
1192 Log2Ops.legalFor({S16});
1193 else
1194 Log2Ops.customFor({S16});
1195 Log2Ops.scalarize(0)
1196 .lower();
1197
1198 auto &LogOps =
1199 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1200 LogOps.customFor({S32, S16});
1201 LogOps.clampScalar(0, MinScalarFPTy, S32)
1202 .scalarize(0);
1203
1204 // The 64-bit versions produce 32-bit results, but only on the SALU.
1205 getActionDefinitionsBuilder(G_CTPOP)
1206 .legalFor({{S32, S32}, {S32, S64}})
1207 .clampScalar(0, S32, S32)
1208 .widenScalarToNextPow2(1, 32)
1209 .clampScalar(1, S32, S64)
1210 .scalarize(0)
1211 .widenScalarToNextPow2(0, 32);
1212
1213 // If no 16-bit instruction is available, lower into different instructions.
1214 if (ST.has16BitInsts())
1215 getActionDefinitionsBuilder(G_IS_FPCLASS)
1216 .legalForCartesianProduct({S1}, FPTypes16)
1217 .widenScalarToNextPow2(1)
1218 .scalarize(0)
1219 .lower();
1220 else
1221 getActionDefinitionsBuilder(G_IS_FPCLASS)
1222 .legalForCartesianProduct({S1}, FPTypesBase)
1223 .lowerFor({S1, S16})
1224 .widenScalarToNextPow2(1)
1225 .scalarize(0)
1226 .lower();
1227
1228 // The hardware instructions return a different result on 0 than the generic
1229 // instructions expect. The hardware produces -1, but these produce the
1230 // bitwidth.
1231 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1232 .scalarize(0)
1233 .clampScalar(0, S32, S32)
1234 .clampScalar(1, S32, S64)
1235 .widenScalarToNextPow2(0, 32)
1236 .widenScalarToNextPow2(1, 32)
1237 .custom();
1238
1239 // The 64-bit versions produce 32-bit results, but only on the SALU.
1240 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1241 .legalFor({{S32, S32}, {S32, S64}})
1242 .clampScalar(0, S32, S32)
1243 .clampScalar(1, S32, S64)
1244 .scalarize(0)
1245 .widenScalarToNextPow2(0, 32)
1246 .widenScalarToNextPow2(1, 32);
1247
1248 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1249 // RegBankSelect.
1250 getActionDefinitionsBuilder(G_BITREVERSE)
1251 .legalFor({S32, S64})
1252 .clampScalar(0, S32, S64)
1253 .scalarize(0)
1254 .widenScalarToNextPow2(0);
1255
1256 if (ST.has16BitInsts()) {
1257 getActionDefinitionsBuilder(G_BSWAP)
1258 .legalFor({S16, S32, V2S16})
1259 .clampMaxNumElementsStrict(0, S16, 2)
1260 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1261 // narrowScalar limitation.
1262 .widenScalarToNextPow2(0)
1263 .clampScalar(0, S16, S32)
1264 .scalarize(0);
1265
1266 if (ST.hasVOP3PInsts()) {
1267 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1268 .legalFor({S32, S16, V2S16})
1269 .clampMaxNumElements(0, S16, 2)
1270 .minScalar(0, S16)
1271 .widenScalarToNextPow2(0)
1272 .scalarize(0)
1273 .lower();
1274 } else {
1275 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1276 .legalFor({S32, S16})
1277 .widenScalarToNextPow2(0)
1278 .minScalar(0, S16)
1279 .scalarize(0)
1280 .lower();
1281 }
1282 } else {
1283 // TODO: Should have same legality without v_perm_b32
1284 getActionDefinitionsBuilder(G_BSWAP)
1285 .legalFor({S32})
1286 .lowerIf(scalarNarrowerThan(0, 32))
1287 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1288 // narrowScalar limitation.
1289 .widenScalarToNextPow2(0)
1290 .maxScalar(0, S32)
1291 .scalarize(0)
1292 .lower();
1293
1294 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1295 .legalFor({S32})
1296 .minScalar(0, S32)
1297 .widenScalarToNextPow2(0)
1298 .scalarize(0)
1299 .lower();
1300 }
1301
1302 getActionDefinitionsBuilder(G_INTTOPTR)
1303 // List the common cases
1304 .legalForCartesianProduct(AddrSpaces64, {S64})
1305 .legalForCartesianProduct(AddrSpaces32, {S32})
1306 .scalarize(0)
1307 // Accept any address space as long as the size matches
1308 .legalIf(sameSize(0, 1))
1309 .widenScalarIf(smallerThan(1, 0),
1310 [](const LegalityQuery &Query) {
1311 return std::pair(
1312 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1313 })
1314 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1315 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1316 });
1317
1318 getActionDefinitionsBuilder(G_PTRTOINT)
1319 // List the common cases
1320 .legalForCartesianProduct(AddrSpaces64, {S64})
1321 .legalForCartesianProduct(AddrSpaces32, {S32})
1322 .scalarize(0)
1323 // Accept any address space as long as the size matches
1324 .legalIf(sameSize(0, 1))
1325 .widenScalarIf(smallerThan(0, 1),
1326 [](const LegalityQuery &Query) {
1327 return std::pair(
1328 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1329 })
1330 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1331 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1332 });
1333
1334 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1335 .scalarize(0)
1336 .custom();
1337
1338 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1339 bool IsLoad) -> bool {
1340 const LLT DstTy = Query.Types[0];
1341
1342 // Split vector extloads.
1343 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1344
1345 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1346 return true;
1347
1348 const LLT PtrTy = Query.Types[1];
1349 unsigned AS = PtrTy.getAddressSpace();
1350 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1351 Query.MMODescrs[0].Ordering !=
1352 AtomicOrdering::NotAtomic))
1353 return true;
1354
1355 // Catch weird sized loads that don't evenly divide into the access sizes
1356 // TODO: May be able to widen depending on alignment etc.
1357 unsigned NumRegs = (MemSize + 31) / 32;
1358 if (NumRegs == 3) {
1359 if (!ST.hasDwordx3LoadStores())
1360 return true;
1361 } else {
1362 // If the alignment allows, these should have been widened.
1363 if (!isPowerOf2_32(NumRegs))
1364 return true;
1365 }
1366
1367 return false;
1368 };
1369
1370 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1371 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1372 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1373
1374 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1375 // LDS
1376 // TODO: Unsupported flat for SI.
1377
1378 for (unsigned Op : {G_LOAD, G_STORE}) {
1379 const bool IsStore = Op == G_STORE;
1380
1381 auto &Actions = getActionDefinitionsBuilder(Op);
1382 // Explicitly list some common cases.
1383 // TODO: Does this help compile time at all?
1384 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1385 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1386 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1387 {S64, GlobalPtr, S64, GlobalAlign32},
1388 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1389 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1390 {S32, GlobalPtr, S8, GlobalAlign8},
1391 {S32, GlobalPtr, S16, GlobalAlign16},
1392
1393 {S32, LocalPtr, S32, 32},
1394 {S64, LocalPtr, S64, 32},
1395 {V2S32, LocalPtr, V2S32, 32},
1396 {S32, LocalPtr, S8, 8},
1397 {S32, LocalPtr, S16, 16},
1398 {V2S16, LocalPtr, S32, 32},
1399
1400 {S32, PrivatePtr, S32, 32},
1401 {S32, PrivatePtr, S8, 8},
1402 {S32, PrivatePtr, S16, 16},
1403 {V2S16, PrivatePtr, S32, 32},
1404
1405 {S32, ConstantPtr, S32, GlobalAlign32},
1406 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1407 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1408 {S64, ConstantPtr, S64, GlobalAlign32},
1409 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1410 Actions.legalIf(
1411 [=](const LegalityQuery &Query) -> bool {
1412 return isLoadStoreLegal(ST, Query);
1413 });
1414
1415 // The custom pointers (fat pointers, buffer resources) don't work with load
1416 // and store at this level. Fat pointers should have been lowered to
1417 // intrinsics before the translation to MIR.
1418 Actions.unsupportedIf(
1419 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1420
1421 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1422 // ptrtoint. This is needed to account for the fact that we can't have i128
1423 // as a register class for SelectionDAG reasons.
1424 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1425 return hasBufferRsrcWorkaround(Query.Types[0]);
1426 });
1427
1428 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1429 // 64-bits.
1430 //
1431 // TODO: Should generalize bitcast action into coerce, which will also cover
1432 // inserting addrspacecasts.
1433 Actions.customIf(typeIs(1, Constant32Ptr));
1434
1435 // Turn any illegal element vectors into something easier to deal
1436 // with. These will ultimately produce 32-bit scalar shifts to extract the
1437 // parts anyway.
1438 //
1439 // For odd 16-bit element vectors, prefer to split those into pieces with
1440 // 16-bit vector parts.
1441 Actions.bitcastIf(
1442 [=](const LegalityQuery &Query) -> bool {
1443 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1444 Query.MMODescrs[0].MemoryTy);
1445 }, bitcastToRegisterType(0));
1446
1447 if (!IsStore) {
1448 // Widen suitably aligned loads by loading extra bytes. The standard
1449 // legalization actions can't properly express widening memory operands.
1450 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1451 return shouldWidenLoad(ST, Query, G_LOAD);
1452 });
1453 }
1454
1455 // FIXME: load/store narrowing should be moved to lower action
1456 Actions
1457 .narrowScalarIf(
1458 [=](const LegalityQuery &Query) -> bool {
1459 return !Query.Types[0].isVector() &&
1460 needToSplitMemOp(Query, Op == G_LOAD);
1461 },
1462 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1463 const LLT DstTy = Query.Types[0];
1464 const LLT PtrTy = Query.Types[1];
1465
1466 const unsigned DstSize = DstTy.getSizeInBits();
1467 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1468
1469 // Split extloads.
1470 if (DstSize > MemSize)
1471 return std::pair(0, LLT::scalar(MemSize));
1472
1473 unsigned MaxSize = maxSizeForAddrSpace(
1474 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1475 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1476 if (MemSize > MaxSize)
1477 return std::pair(0, LLT::scalar(MaxSize));
1478
1479 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1480 return std::pair(0, LLT::scalar(Align));
1481 })
1482 .fewerElementsIf(
1483 [=](const LegalityQuery &Query) -> bool {
1484 return Query.Types[0].isVector() &&
1485 needToSplitMemOp(Query, Op == G_LOAD);
1486 },
1487 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1488 const LLT DstTy = Query.Types[0];
1489 const LLT PtrTy = Query.Types[1];
1490
1491 LLT EltTy = DstTy.getElementType();
1492 unsigned MaxSize = maxSizeForAddrSpace(
1493 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1494 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1495
1496 // FIXME: Handle widened to power of 2 results better. This ends
1497 // up scalarizing.
1498 // FIXME: 3 element stores scalarized on SI
1499
1500 // Split if it's too large for the address space.
1501 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1502 if (MemSize > MaxSize) {
1503 unsigned NumElts = DstTy.getNumElements();
1504 unsigned EltSize = EltTy.getSizeInBits();
1505
1506 if (MaxSize % EltSize == 0) {
1507 return std::pair(
1508 0, LLT::scalarOrVector(
1509 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1510 }
1511
1512 unsigned NumPieces = MemSize / MaxSize;
1513
1514 // FIXME: Refine when odd breakdowns handled
1515 // The scalars will need to be re-legalized.
1516 if (NumPieces == 1 || NumPieces >= NumElts ||
1517 NumElts % NumPieces != 0)
1518 return std::pair(0, EltTy);
1519
1520 return std::pair(0,
1521 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1522 }
1523
1524 // FIXME: We could probably handle weird extending loads better.
1525 if (DstTy.getSizeInBits() > MemSize)
1526 return std::pair(0, EltTy);
1527
1528 unsigned EltSize = EltTy.getSizeInBits();
1529 unsigned DstSize = DstTy.getSizeInBits();
1530 if (!isPowerOf2_32(DstSize)) {
1531 // We're probably decomposing an odd sized store. Try to split
1532 // to the widest type. TODO: Account for alignment. As-is it
1533 // should be OK, since the new parts will be further legalized.
1534 unsigned FloorSize = llvm::bit_floor(DstSize);
1535 return std::pair(
1536 0, LLT::scalarOrVector(
1537 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1538 }
1539
1540 // May need relegalization for the scalars.
1541 return std::pair(0, EltTy);
1542 })
1543 .minScalar(0, S32)
1544 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1545 .widenScalarToNextPow2(0)
1546 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1547 .lower();
1548 }
1549
1550 // FIXME: Unaligned accesses not lowered.
1551 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1552 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1553 {S32, GlobalPtr, S16, 2 * 8},
1554 {S32, LocalPtr, S8, 8},
1555 {S32, LocalPtr, S16, 16},
1556 {S32, PrivatePtr, S8, 8},
1557 {S32, PrivatePtr, S16, 16},
1558 {S32, ConstantPtr, S8, 8},
1559 {S32, ConstantPtr, S16, 2 * 8}})
1560 .legalIf(
1561 [=](const LegalityQuery &Query) -> bool {
1562 return isLoadStoreLegal(ST, Query);
1563 });
1564
1565 if (ST.hasFlatAddressSpace()) {
1566 ExtLoads.legalForTypesWithMemDesc(
1567 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1568 }
1569
1570 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1571 // 64-bits.
1572 //
1573 // TODO: Should generalize bitcast action into coerce, which will also cover
1574 // inserting addrspacecasts.
1575 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1576
1577 ExtLoads.clampScalar(0, S32, S32)
1578 .widenScalarToNextPow2(0)
1579 .lower();
1580
1581 auto &Atomics = getActionDefinitionsBuilder(
1582 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1583 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1584 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1585 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1586 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1587 {S64, GlobalPtr}, {S64, LocalPtr},
1588 {S32, RegionPtr}, {S64, RegionPtr}});
1589 if (ST.hasFlatAddressSpace()) {
1590 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1591 }
1592
1593 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1594 if (ST.hasLDSFPAtomicAdd()) {
1595 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1596 if (ST.hasGFX90AInsts())
1597 Atomic.legalFor({{S64, LocalPtr}});
1598 if (ST.hasAtomicDsPkAdd16Insts())
1599 Atomic.legalFor({{V2S16, LocalPtr}});
1600 }
1601 if (ST.hasAtomicFaddInsts())
1602 Atomic.legalFor({{S32, GlobalPtr}});
1603 if (ST.hasFlatAtomicFaddF32Inst())
1604 Atomic.legalFor({{S32, FlatPtr}});
1605
1606 if (ST.hasGFX90AInsts()) {
1607     // These are legal with some caveats, and should have undergone expansion in
1608     // the IR in most situations.
1609 // TODO: Move atomic expansion into legalizer
1610 Atomic.legalFor({
1611 {S32, GlobalPtr},
1612 {S64, GlobalPtr},
1613 {S64, FlatPtr}
1614 });
1615 }
1616
1617   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1618   // demarshalling.
1619 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1620 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1621 {S32, FlatPtr}, {S64, FlatPtr}})
1622 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1623 {S32, RegionPtr}, {S64, RegionPtr}});
1624 // TODO: Pointer types, any 32-bit or 64-bit vector
1625
1626 // Condition should be s32 for scalar, s1 for vector.
1627 getActionDefinitionsBuilder(G_SELECT)
1628 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1629 LocalPtr, FlatPtr, PrivatePtr,
1630 LLT::fixed_vector(2, LocalPtr),
1631 LLT::fixed_vector(2, PrivatePtr)},
1632 {S1, S32})
1633 .clampScalar(0, S16, S64)
1634 .scalarize(1)
1635 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1636 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1637 .clampMaxNumElements(0, S32, 2)
1638 .clampMaxNumElements(0, LocalPtr, 2)
1639 .clampMaxNumElements(0, PrivatePtr, 2)
1640 .scalarize(0)
1641 .widenScalarToNextPow2(0)
1642 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1643
1644 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1645 // be more flexible with the shift amount type.
1646 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1647 .legalFor({{S32, S32}, {S64, S32}});
1648 if (ST.has16BitInsts()) {
1649 if (ST.hasVOP3PInsts()) {
1650 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1651 .clampMaxNumElements(0, S16, 2);
1652 } else
1653 Shifts.legalFor({{S16, S16}});
1654
1655 // TODO: Support 16-bit shift amounts for all types
1656 Shifts.widenScalarIf(
1657 [=](const LegalityQuery &Query) {
1658 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1659 // 32-bit amount.
1660 const LLT ValTy = Query.Types[0];
1661 const LLT AmountTy = Query.Types[1];
1662 return ValTy.getSizeInBits() <= 16 &&
1663 AmountTy.getSizeInBits() < 16;
1664 }, changeTo(1, S16));
1665 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1666 Shifts.clampScalar(1, S32, S32);
1667 Shifts.widenScalarToNextPow2(0, 16);
1668 Shifts.clampScalar(0, S16, S64);
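    // e.g. an s16 shift by an s8 amount becomes s16 shifted by s16 (the
    // widenScalarIf rule above), while s32/s64 shifts keep an s32 amount via
    // the clampScalar on operand 1.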
1669
1670 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1671 .minScalar(0, S16)
1672 .scalarize(0)
1673 .lower();
1674 } else {
1675 // Make sure we legalize the shift amount type first, as the general
1676 // expansion for the shifted type will produce much worse code if it hasn't
1677 // been truncated already.
1678 Shifts.clampScalar(1, S32, S32);
1679 Shifts.widenScalarToNextPow2(0, 32);
1680 Shifts.clampScalar(0, S32, S64);
1681
1682 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1683 .minScalar(0, S32)
1684 .scalarize(0)
1685 .lower();
1686 }
1687 Shifts.scalarize(0);
1688
1689 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1690 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1691 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1692 unsigned IdxTypeIdx = 2;
1693
1694 getActionDefinitionsBuilder(Op)
1695 .customIf([=](const LegalityQuery &Query) {
1696 const LLT EltTy = Query.Types[EltTypeIdx];
1697 const LLT VecTy = Query.Types[VecTypeIdx];
1698 const LLT IdxTy = Query.Types[IdxTypeIdx];
1699 const unsigned EltSize = EltTy.getSizeInBits();
1700 const bool isLegalVecType =
1701 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1702 // Address space 8 pointers are 128-bit wide values, but the logic
1703 // below will try to bitcast them to 2N x s64, which will fail.
1704         // Therefore, as an intermediate step, handle these by ptrtoint-ing the
1705         // vector and scalar arguments (and inttoptr-ing the extraction result)
1706         // so that we produce a vector-of-integers operation that can be handled
1707         // by the logic below.
1708 if (EltTy.isPointer() && EltSize > 64)
1709 return true;
1710 return (EltSize == 32 || EltSize == 64) &&
1711 VecTy.getSizeInBits() % 32 == 0 &&
1712 VecTy.getSizeInBits() <= MaxRegisterSize &&
1713 IdxTy.getSizeInBits() == 32 &&
1714 isLegalVecType;
1715 })
1716 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1717 bitcastToVectorElement32(VecTypeIdx))
1718 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1719 .bitcastIf(
1720 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1721 [=](const LegalityQuery &Query) {
1722 // For > 64-bit element types, try to turn this into a 64-bit
1723 // element vector since we may be able to do better indexing
1724 // if this is scalar. If not, fall back to 32.
1725 const LLT EltTy = Query.Types[EltTypeIdx];
1726 const LLT VecTy = Query.Types[VecTypeIdx];
1727 const unsigned DstEltSize = EltTy.getSizeInBits();
1728 const unsigned VecSize = VecTy.getSizeInBits();
1729
1730 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1731 return std::pair(
1732 VecTypeIdx,
1733 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1734 })
1735 .clampScalar(EltTypeIdx, S32, S64)
1736 .clampScalar(VecTypeIdx, S32, S64)
1737 .clampScalar(IdxTypeIdx, S32, S32)
1738 .clampMaxNumElements(VecTypeIdx, S32, 32)
1739 // TODO: Clamp elements for 64-bit vectors?
1740 .moreElementsIf(
1741 isIllegalRegisterType(VecTypeIdx),
1742 moreElementsToNextExistingRegClass(VecTypeIdx))
1743 // It should only be necessary with variable indexes.
1744 // As a last resort, lower to the stack
1745 .lower();
1746 }
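  // Illustrative examples for the bitcast rules above: an extract from
  // <8 x s16> (128 bits, sub-32-bit elements) is retyped as <4 x s32>, while
  // an extract from <4 x s128> is first retyped as <8 x s64> since 128 is a
  // multiple of 64.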
1747
1748 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1749 .unsupportedIf([=](const LegalityQuery &Query) {
1750 const LLT &EltTy = Query.Types[1].getElementType();
1751 return Query.Types[0] != EltTy;
1752 });
1753
1754 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1755 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1756 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1757
1758 // FIXME: Doesn't handle extract of illegal sizes.
1759 getActionDefinitionsBuilder(Op)
1760 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1761 .lowerIf([=](const LegalityQuery &Query) {
1762       // Sub-vector (or single element) insert and extract.
1763 // TODO: verify immediate offset here since lower only works with
1764 // whole elements.
1765 const LLT BigTy = Query.Types[BigTyIdx];
1766 return BigTy.isVector();
1767 })
1768 // FIXME: Multiples of 16 should not be legal.
1769 .legalIf([=](const LegalityQuery &Query) {
1770 const LLT BigTy = Query.Types[BigTyIdx];
1771 const LLT LitTy = Query.Types[LitTyIdx];
1772 return (BigTy.getSizeInBits() % 32 == 0) &&
1773 (LitTy.getSizeInBits() % 16 == 0);
1774 })
1775 .widenScalarIf(
1776 [=](const LegalityQuery &Query) {
1777 const LLT BigTy = Query.Types[BigTyIdx];
1778 return (BigTy.getScalarSizeInBits() < 16);
1779 },
1780 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1781 .widenScalarIf(
1782 [=](const LegalityQuery &Query) {
1783 const LLT LitTy = Query.Types[LitTyIdx];
1784 return (LitTy.getScalarSizeInBits() < 16);
1785 },
1786 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1787 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1788 .widenScalarToNextPow2(BigTyIdx, 32);
1789
1790 }
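  // Note on the legality rule above: e.g. G_EXTRACT of an s16 from an s64 or
  // from <4 x s16> is currently accepted (64 % 32 == 0 and 16 % 16 == 0),
  // which is looser than intended per the FIXME.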
1791
1792 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1793 .legalForCartesianProduct(AllS32Vectors, {S32})
1794 .legalForCartesianProduct(AllS64Vectors, {S64})
1795 .clampNumElements(0, V16S32, V32S32)
1796 .clampNumElements(0, V2S64, V16S64)
1797 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1798 .moreElementsIf(
1799 isIllegalRegisterType(0),
1800 moreElementsToNextExistingRegClass(0));
1801
1802 if (ST.hasScalarPackInsts()) {
1803 BuildVector
1804 // FIXME: Should probably widen s1 vectors straight to s32
1805 .minScalarOrElt(0, S16)
1806 .minScalar(1, S16);
1807
1808 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1809 .legalFor({V2S16, S32})
1810 .lower();
1811 } else {
1812 BuildVector.customFor({V2S16, S16});
1813 BuildVector.minScalarOrElt(0, S32);
1814
1815 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1816 .customFor({V2S16, S32})
1817 .lower();
1818 }
1819
1820 BuildVector.legalIf(isRegisterType(0));
1821
1822 // FIXME: Clamp maximum size
1823 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1824 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1825 .clampMaxNumElements(0, S32, 32)
1826 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1827 .clampMaxNumElements(0, S16, 64);
1828
1829 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1830
1831 // Merge/Unmerge
1832 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1833 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1834 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1835
1836 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1837 const LLT Ty = Query.Types[TypeIdx];
1838 if (Ty.isVector()) {
1839 const LLT &EltTy = Ty.getElementType();
1840 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1841 return true;
1842 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1843 return true;
1844 }
1845 return false;
1846 };
1847
1848 auto &Builder = getActionDefinitionsBuilder(Op)
1849 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1850 .lowerFor({{S16, V2S16}})
1851 .lowerIf([=](const LegalityQuery &Query) {
1852 const LLT BigTy = Query.Types[BigTyIdx];
1853 return BigTy.getSizeInBits() == 32;
1854 })
1855 // Try to widen to s16 first for small types.
1856 // TODO: Only do this on targets with legal s16 shifts
1857 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1858 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1859 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1860 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1861 elementTypeIs(1, S16)),
1862 changeTo(1, V2S16))
1863 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1864 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1865 // valid.
1866 .clampScalar(LitTyIdx, S32, S512)
1867 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1868 // Break up vectors with weird elements into scalars
1869 .fewerElementsIf(
1870 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1871 scalarize(0))
1872 .fewerElementsIf(
1873 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1874 scalarize(1))
1875 .clampScalar(BigTyIdx, S32, MaxScalar);
1876
1877 if (Op == G_MERGE_VALUES) {
1878 Builder.widenScalarIf(
1879 // TODO: Use 16-bit shifts if legal for 8-bit values?
1880 [=](const LegalityQuery &Query) {
1881 const LLT Ty = Query.Types[LitTyIdx];
1882 return Ty.getSizeInBits() < 32;
1883 },
1884 changeTo(LitTyIdx, S32));
1885 }
1886
1887 Builder.widenScalarIf(
1888 [=](const LegalityQuery &Query) {
1889 const LLT Ty = Query.Types[BigTyIdx];
1890 return Ty.getSizeInBits() % 16 != 0;
1891 },
1892 [=](const LegalityQuery &Query) {
1893 // Pick the next power of 2, or a multiple of 64 over 128.
1894 // Whichever is smaller.
1895 const LLT &Ty = Query.Types[BigTyIdx];
1896 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1897 if (NewSizeInBits >= 256) {
1898 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1899 if (RoundedTo < NewSizeInBits)
1900 NewSizeInBits = RoundedTo;
1901 }
1902 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1903 })
1904 // Any vectors left are the wrong size. Scalarize them.
1905 .scalarize(0)
1906 .scalarize(1);
1907 }
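  // Worked example for the size mutation above: an 88-bit big type widens to
  // the next power of 2, 128; a 360-bit type would round to 512, but since
  // that is >= 256 it is instead rounded up to the next multiple of 64, 384.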
1908
1909 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1910 // RegBankSelect.
1911 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1912 .legalFor({{S32}, {S64}});
1913
1914 if (ST.hasVOP3PInsts()) {
1915 SextInReg.lowerFor({{V2S16}})
1916 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1917 // get more vector shift opportunities, since we'll get those when
1918 // expanded.
1919 .clampMaxNumElementsStrict(0, S16, 2);
1920 } else if (ST.has16BitInsts()) {
1921 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1922 } else {
1923 // Prefer to promote to s32 before lowering if we don't have 16-bit
1924     // shifts. This avoids a lot of intermediate truncate and extend operations.
1925 SextInReg.lowerFor({{S32}, {S64}});
1926 }
1927
1928 SextInReg
1929 .scalarize(0)
1930 .clampScalar(0, S32, S64)
1931 .lower();
1932
1933 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1934 .scalarize(0)
1935 .lower();
1936
1937   // TODO: Only try to form v2s16 with legal packed instructions.
1938 getActionDefinitionsBuilder(G_FSHR)
1939 .legalFor({{S32, S32}})
1940 .lowerFor({{V2S16, V2S16}})
1941 .clampMaxNumElementsStrict(0, S16, 2)
1942 .scalarize(0)
1943 .lower();
1944
1945 if (ST.hasVOP3PInsts()) {
1946 getActionDefinitionsBuilder(G_FSHL)
1947 .lowerFor({{V2S16, V2S16}})
1948 .clampMaxNumElementsStrict(0, S16, 2)
1949 .scalarize(0)
1950 .lower();
1951 } else {
1952 getActionDefinitionsBuilder(G_FSHL)
1953 .scalarize(0)
1954 .lower();
1955 }
1956
1957 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1958 .legalFor({S64});
1959
1960 getActionDefinitionsBuilder(G_FENCE)
1961 .alwaysLegal();
1962
1963 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1964 .scalarize(0)
1965 .minScalar(0, S32)
1966 .lower();
1967
1968 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1969 .legalFor({{S32, S32}, {S64, S32}})
1970 .clampScalar(1, S32, S32)
1971 .clampScalar(0, S32, S64)
1972 .widenScalarToNextPow2(0)
1973 .scalarize(0);
1974
1975 getActionDefinitionsBuilder(
1976 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
1977 G_FCOPYSIGN,
1978
1979 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
1980 G_READ_REGISTER, G_WRITE_REGISTER,
1981
1982 G_SADDO, G_SSUBO})
1983 .lower();
1984
1985 if (ST.hasIEEEMinMax()) {
1986 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
1987 .legalFor(FPTypesPK16)
1988 .clampMaxNumElements(0, S16, 2)
1989 .scalarize(0);
1990 } else {
1991 // TODO: Implement
1992 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
1993 }
1994
1995 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1996 .lower();
1997
1998 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1999 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2000 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2001 .unsupported();
2002
2003 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2004
2005 getLegacyLegalizerInfo().computeTables();
2006 verify(*ST.getInstrInfo());
2007 }
2008
2009 bool AMDGPULegalizerInfo::legalizeCustom(
2010 LegalizerHelper &Helper, MachineInstr &MI,
2011 LostDebugLocObserver &LocObserver) const {
2012 MachineIRBuilder &B = Helper.MIRBuilder;
2013 MachineRegisterInfo &MRI = *B.getMRI();
2014
2015 switch (MI.getOpcode()) {
2016 case TargetOpcode::G_ADDRSPACE_CAST:
2017 return legalizeAddrSpaceCast(MI, MRI, B);
2018 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2019 return legalizeFroundeven(MI, MRI, B);
2020 case TargetOpcode::G_FCEIL:
2021 return legalizeFceil(MI, MRI, B);
2022 case TargetOpcode::G_FREM:
2023 return legalizeFrem(MI, MRI, B);
2024 case TargetOpcode::G_INTRINSIC_TRUNC:
2025 return legalizeIntrinsicTrunc(MI, MRI, B);
2026 case TargetOpcode::G_SITOFP:
2027 return legalizeITOFP(MI, MRI, B, true);
2028 case TargetOpcode::G_UITOFP:
2029 return legalizeITOFP(MI, MRI, B, false);
2030 case TargetOpcode::G_FPTOSI:
2031 return legalizeFPTOI(MI, MRI, B, true);
2032 case TargetOpcode::G_FPTOUI:
2033 return legalizeFPTOI(MI, MRI, B, false);
2034 case TargetOpcode::G_FMINNUM:
2035 case TargetOpcode::G_FMAXNUM:
2036 case TargetOpcode::G_FMINNUM_IEEE:
2037 case TargetOpcode::G_FMAXNUM_IEEE:
2038 return legalizeMinNumMaxNum(Helper, MI);
2039 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2040 return legalizeExtractVectorElt(MI, MRI, B);
2041 case TargetOpcode::G_INSERT_VECTOR_ELT:
2042 return legalizeInsertVectorElt(MI, MRI, B);
2043 case TargetOpcode::G_FSIN:
2044 case TargetOpcode::G_FCOS:
2045 return legalizeSinCos(MI, MRI, B);
2046 case TargetOpcode::G_GLOBAL_VALUE:
2047 return legalizeGlobalValue(MI, MRI, B);
2048 case TargetOpcode::G_LOAD:
2049 case TargetOpcode::G_SEXTLOAD:
2050 case TargetOpcode::G_ZEXTLOAD:
2051 return legalizeLoad(Helper, MI);
2052 case TargetOpcode::G_STORE:
2053 return legalizeStore(Helper, MI);
2054 case TargetOpcode::G_FMAD:
2055 return legalizeFMad(MI, MRI, B);
2056 case TargetOpcode::G_FDIV:
2057 return legalizeFDIV(MI, MRI, B);
2058 case TargetOpcode::G_FFREXP:
2059 return legalizeFFREXP(MI, MRI, B);
2060 case TargetOpcode::G_FSQRT:
2061 return legalizeFSQRT(MI, MRI, B);
2062 case TargetOpcode::G_UDIV:
2063 case TargetOpcode::G_UREM:
2064 case TargetOpcode::G_UDIVREM:
2065 return legalizeUnsignedDIV_REM(MI, MRI, B);
2066 case TargetOpcode::G_SDIV:
2067 case TargetOpcode::G_SREM:
2068 case TargetOpcode::G_SDIVREM:
2069 return legalizeSignedDIV_REM(MI, MRI, B);
2070 case TargetOpcode::G_ATOMIC_CMPXCHG:
2071 return legalizeAtomicCmpXChg(MI, MRI, B);
2072 case TargetOpcode::G_FLOG2:
2073 return legalizeFlog2(MI, B);
2074 case TargetOpcode::G_FLOG:
2075 case TargetOpcode::G_FLOG10:
2076 return legalizeFlogCommon(MI, B);
2077 case TargetOpcode::G_FEXP2:
2078 return legalizeFExp2(MI, B);
2079 case TargetOpcode::G_FEXP:
2080 case TargetOpcode::G_FEXP10:
2081 return legalizeFExp(MI, B);
2082 case TargetOpcode::G_FPOW:
2083 return legalizeFPow(MI, B);
2084 case TargetOpcode::G_FFLOOR:
2085 return legalizeFFloor(MI, MRI, B);
2086 case TargetOpcode::G_BUILD_VECTOR:
2087 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2088 return legalizeBuildVector(MI, MRI, B);
2089 case TargetOpcode::G_MUL:
2090 return legalizeMul(Helper, MI);
2091 case TargetOpcode::G_CTLZ:
2092 case TargetOpcode::G_CTTZ:
2093 return legalizeCTLZ_CTTZ(MI, MRI, B);
2094 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2095 return legalizeFPTruncRound(MI, B);
2096 case TargetOpcode::G_STACKSAVE:
2097 return legalizeStackSave(MI, B);
2098 default:
2099 return false;
2100 }
2101
2102 llvm_unreachable("expected switch to return");
2103 }
2104
2105 Register AMDGPULegalizerInfo::getSegmentAperture(
2106 unsigned AS,
2107 MachineRegisterInfo &MRI,
2108 MachineIRBuilder &B) const {
2109 MachineFunction &MF = B.getMF();
2110 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2111 const LLT S32 = LLT::scalar(32);
2112 const LLT S64 = LLT::scalar(64);
2113
2114 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2115
2116 if (ST.hasApertureRegs()) {
2117 // Note: this register is somewhat broken. When used as a 32-bit operand,
2118 // it only returns zeroes. The real value is in the upper 32 bits.
2119     // Thus, we must extract the high 32 bits.
2120 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2121 ? AMDGPU::SRC_SHARED_BASE
2122 : AMDGPU::SRC_PRIVATE_BASE;
2123 // FIXME: It would be more natural to emit a COPY here, but then copy
2124 // coalescing would kick in and it would think it's okay to use the "HI"
2125 // subregister (instead of extracting the HI 32 bits) which is an artificial
2126 // (unusable) register.
2127 // Register TableGen definitions would need an overhaul to get rid of the
2128 // artificial "HI" aperture registers and prevent this kind of issue from
2129 // happening.
2130 Register Dst = MRI.createGenericVirtualRegister(S64);
2131 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2132 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2133 return B.buildUnmerge(S32, Dst).getReg(1);
2134 }
2135
2136 // TODO: can we be smarter about machine pointer info?
2137 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2138 Register LoadAddr = MRI.createGenericVirtualRegister(
2139 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2140 // For code object version 5, private_base and shared_base are passed through
2141 // implicit kernargs.
2142 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
2143 AMDGPU::AMDHSA_COV5) {
2144 AMDGPUTargetLowering::ImplicitParameter Param =
2145 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2146 : AMDGPUTargetLowering::PRIVATE_BASE;
2147 uint64_t Offset =
2148 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2149
2150 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2151 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2152
2153 if (!loadInputValue(KernargPtrReg, B,
2154 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2155 return Register();
2156
2157 MachineMemOperand *MMO = MF.getMachineMemOperand(
2158 PtrInfo,
2159 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2160 MachineMemOperand::MOInvariant,
2161 LLT::scalar(32), commonAlignment(Align(64), Offset));
2162
2163 // Pointer address
2164 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2165 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2166 // Load address
2167 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2168 }
2169
2170 Register QueuePtr = MRI.createGenericVirtualRegister(
2171 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2172
2173 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2174 return Register();
2175
2176 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2177 // private_segment_aperture_base_hi.
2178 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2179
2180 MachineMemOperand *MMO = MF.getMachineMemOperand(
2181 PtrInfo,
2182 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2183 MachineMemOperand::MOInvariant,
2184 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2185
2186 B.buildPtrAdd(LoadAddr, QueuePtr,
2187 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2188 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2189 }
2190
2191 /// Return true if the value is a known valid address, such that a null check is
2192 /// not necessary.
2193 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2194 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2195 MachineInstr *Def = MRI.getVRegDef(Val);
2196 switch (Def->getOpcode()) {
2197 case AMDGPU::G_FRAME_INDEX:
2198 case AMDGPU::G_GLOBAL_VALUE:
2199 case AMDGPU::G_BLOCK_ADDR:
2200 return true;
2201 case AMDGPU::G_CONSTANT: {
2202 const ConstantInt *CI = Def->getOperand(1).getCImm();
2203 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2204 }
2205 default:
2206 return false;
2207 }
2208
2209 return false;
2210 }
2211
2212 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2213 MachineInstr &MI, MachineRegisterInfo &MRI,
2214 MachineIRBuilder &B) const {
2215 MachineFunction &MF = B.getMF();
2216
2217 const LLT S32 = LLT::scalar(32);
2218 Register Dst = MI.getOperand(0).getReg();
2219 Register Src = MI.getOperand(1).getReg();
2220
2221 LLT DstTy = MRI.getType(Dst);
2222 LLT SrcTy = MRI.getType(Src);
2223 unsigned DestAS = DstTy.getAddressSpace();
2224 unsigned SrcAS = SrcTy.getAddressSpace();
2225
2226 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2227 // vector element.
2228 assert(!DstTy.isVector());
2229
2230 const AMDGPUTargetMachine &TM
2231 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2232
2233 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2234 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2235 return true;
2236 }
2237
2238 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2239 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2240 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2241 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2242 // Extract low 32-bits of the pointer.
2243 B.buildExtract(Dst, Src, 0);
2244 MI.eraseFromParent();
2245 return true;
2246 }
2247
2248 unsigned NullVal = TM.getNullPointerValue(DestAS);
2249
2250 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2251 auto FlatNull = B.buildConstant(SrcTy, 0);
2252
2253 // Extract low 32-bits of the pointer.
2254 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2255
2256 auto CmpRes =
2257 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2258 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2259
2260 MI.eraseFromParent();
2261 return true;
2262 }
2263
2264 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2265 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2266 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2267 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2268 if (!ApertureReg.isValid())
2269 return false;
2270
2271 // Coerce the type of the low half of the result so we can use merge_values.
2272 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2273
2274 // TODO: Should we allow mismatched types but matching sizes in merges to
2275 // avoid the ptrtoint?
2276 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2277
2278 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2279 B.buildCopy(Dst, BuildPtr);
2280 MI.eraseFromParent();
2281 return true;
2282 }
2283
2284 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2285 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2286
2287 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2288 SegmentNull.getReg(0));
2289
2290 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2291
2292 MI.eraseFromParent();
2293 return true;
2294 }
2295
2296 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2297 SrcTy.getSizeInBits() == 64) {
2298 // Truncate.
2299 B.buildExtract(Dst, Src, 0);
2300 MI.eraseFromParent();
2301 return true;
2302 }
2303
2304 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2305 DstTy.getSizeInBits() == 64) {
2306 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2307 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2308 auto PtrLo = B.buildPtrToInt(S32, Src);
2309 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2310 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2311 MI.eraseFromParent();
2312 return true;
2313 }
2314
2315 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2316 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2317
2318 LLVMContext &Ctx = MF.getFunction().getContext();
2319 Ctx.diagnose(InvalidAddrSpaceCast);
2320 B.buildUndef(Dst);
2321 MI.eraseFromParent();
2322 return true;
2323 }
2324
2325 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2326 MachineRegisterInfo &MRI,
2327 MachineIRBuilder &B) const {
2328 Register Src = MI.getOperand(1).getReg();
2329 LLT Ty = MRI.getType(Src);
2330 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2331
2332 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2333 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
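  // Sketch of the trick (assumes round-to-nearest-even): for |Src| < 2^52,
  // (Src + copysign(2^52, Src)) - copysign(2^52, Src) rounds Src to the
  // nearest integer, since adding 2^52 leaves no fraction bits in the sum.
  // Values with |Src| > 0x1.fffffffffffffp+51 are already integers and are
  // passed through unchanged by the final select.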
2334
2335 auto C1 = B.buildFConstant(Ty, C1Val);
2336 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2337
2338 // TODO: Should this propagate fast-math-flags?
2339 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2340 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2341
2342 auto C2 = B.buildFConstant(Ty, C2Val);
2343 auto Fabs = B.buildFAbs(Ty, Src);
2344
2345 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2346 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2347 MI.eraseFromParent();
2348 return true;
2349 }
2350
2351 bool AMDGPULegalizerInfo::legalizeFceil(
2352 MachineInstr &MI, MachineRegisterInfo &MRI,
2353 MachineIRBuilder &B) const {
2354
2355 const LLT S1 = LLT::scalar(1);
2356 const LLT S64 = LLT::scalar(64);
2357
2358 Register Src = MI.getOperand(1).getReg();
2359 assert(MRI.getType(Src) == S64);
2360
2361 // result = trunc(src)
2362 // if (src > 0.0 && src != result)
2363 // result += 1.0
2364
2365 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2366
2367 const auto Zero = B.buildFConstant(S64, 0.0);
2368 const auto One = B.buildFConstant(S64, 1.0);
2369 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2370 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2371 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2372 auto Add = B.buildSelect(S64, And, One, Zero);
2373
2374 // TODO: Should this propagate fast-math-flags?
2375 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2376 MI.eraseFromParent();
2377 return true;
2378 }
2379
2380 bool AMDGPULegalizerInfo::legalizeFrem(
2381 MachineInstr &MI, MachineRegisterInfo &MRI,
2382 MachineIRBuilder &B) const {
2383 Register DstReg = MI.getOperand(0).getReg();
2384 Register Src0Reg = MI.getOperand(1).getReg();
2385 Register Src1Reg = MI.getOperand(2).getReg();
2386 auto Flags = MI.getFlags();
2387 LLT Ty = MRI.getType(DstReg);
2388
2389 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2390 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2391 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2392 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2393 MI.eraseFromParent();
2394 return true;
2395 }
2396
2397 static MachineInstrBuilder extractF64Exponent(Register Hi,
2398 MachineIRBuilder &B) {
2399 const unsigned FractBits = 52;
2400 const unsigned ExpBits = 11;
2401 LLT S32 = LLT::scalar(32);
2402
2403 auto Const0 = B.buildConstant(S32, FractBits - 32);
2404 auto Const1 = B.buildConstant(S32, ExpBits);
2405
2406 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2407 .addUse(Hi)
2408 .addUse(Const0.getReg(0))
2409 .addUse(Const1.getReg(0));
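  // ubfe(Hi, 20, 11) extracts bits [30:20] of the high word, i.e. bits
  // [62:52] of the original f64, which is the biased exponent field; the
  // subtraction below removes the IEEE-754 bias of 1023.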
2410
2411 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2412 }
2413
2414 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2415 MachineInstr &MI, MachineRegisterInfo &MRI,
2416 MachineIRBuilder &B) const {
2417 const LLT S1 = LLT::scalar(1);
2418 const LLT S32 = LLT::scalar(32);
2419 const LLT S64 = LLT::scalar(64);
2420
2421 Register Src = MI.getOperand(1).getReg();
2422 assert(MRI.getType(Src) == S64);
2423
2424 // TODO: Should this use extract since the low half is unused?
2425 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2426 Register Hi = Unmerge.getReg(1);
2427
2428 // Extract the upper half, since this is where we will find the sign and
2429 // exponent.
2430 auto Exp = extractF64Exponent(Hi, B);
2431
2432 const unsigned FractBits = 52;
2433
2434 // Extract the sign bit.
2435 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2436 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2437
2438 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2439
2440 const auto Zero32 = B.buildConstant(S32, 0);
2441
2442 // Extend back to 64-bits.
2443 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2444
2445 auto Shr = B.buildAShr(S64, FractMask, Exp);
2446 auto Not = B.buildNot(S64, Shr);
2447 auto Tmp0 = B.buildAnd(S64, Src, Not);
2448 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2449
2450 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2451 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2452
2453 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2454 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2455 MI.eraseFromParent();
2456 return true;
2457 }
2458
2459 bool AMDGPULegalizerInfo::legalizeITOFP(
2460 MachineInstr &MI, MachineRegisterInfo &MRI,
2461 MachineIRBuilder &B, bool Signed) const {
2462
2463 Register Dst = MI.getOperand(0).getReg();
2464 Register Src = MI.getOperand(1).getReg();
2465
2466 const LLT S64 = LLT::scalar(64);
2467 const LLT S32 = LLT::scalar(32);
2468
2469 assert(MRI.getType(Src) == S64);
2470
2471 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2472 auto ThirtyTwo = B.buildConstant(S32, 32);
2473
2474 if (MRI.getType(Dst) == S64) {
2475 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2476 : B.buildUITOFP(S64, Unmerge.getReg(1));
2477
2478 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2479 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2480
2481 // TODO: Should this propagate fast-math-flags?
2482 B.buildFAdd(Dst, LdExp, CvtLo);
2483 MI.eraseFromParent();
2484 return true;
2485 }
2486
2487 assert(MRI.getType(Dst) == S32);
2488
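  // s64 -> f32 sketch: shift the source left so its leading bit reaches bit
  // 63, convert the high 32 bits, then compensate with ldexp(FVal, 32 - ShAmt).
  // The low word is folded into a sticky bit (the umin/or below) so the
  // conversion still rounds correctly on the discarded bits.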
2489 auto One = B.buildConstant(S32, 1);
2490
2491 MachineInstrBuilder ShAmt;
2492 if (Signed) {
2493 auto ThirtyOne = B.buildConstant(S32, 31);
2494 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2495 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2496 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2497 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2498 .addUse(Unmerge.getReg(1));
2499 auto LS2 = B.buildSub(S32, LS, One);
2500 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2501 } else
2502 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2503 auto Norm = B.buildShl(S64, Src, ShAmt);
2504 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2505 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2506 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2507 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2508 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2509 B.buildFLdexp(Dst, FVal, Scale);
2510 MI.eraseFromParent();
2511 return true;
2512 }
2513
2514 // TODO: Copied from DAG implementation. Verify logic and document how this
2515 // actually works.
2516 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2517 MachineRegisterInfo &MRI,
2518 MachineIRBuilder &B,
2519 bool Signed) const {
2520
2521 Register Dst = MI.getOperand(0).getReg();
2522 Register Src = MI.getOperand(1).getReg();
2523
2524 const LLT S64 = LLT::scalar(64);
2525 const LLT S32 = LLT::scalar(32);
2526
2527 const LLT SrcLT = MRI.getType(Src);
2528 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2529
2530 unsigned Flags = MI.getFlags();
2531
2532 // The basic idea of converting a floating point number into a pair of 32-bit
2533 // integers is illustrated as follows:
2534 //
2535 // tf := trunc(val);
2536 // hif := floor(tf * 2^-32);
2537 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2538 // hi := fptoi(hif);
2539 // lo := fptoi(lof);
2540 //
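  // Illustrative example: for val = 2^40 + 5, tf = 2^40 + 5,
  // hif = floor(tf * 2^-32) = 256 and lof = tf - 256 * 2^32 = 5, so
  // hi = 256 and lo = 5 reassemble the original value.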
2541 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2542 MachineInstrBuilder Sign;
2543 if (Signed && SrcLT == S32) {
2544     // However, a 32-bit floating point number has only a 23-bit mantissa, which
2545     // is not enough to hold all the significant bits of `lof` if val is
2546     // negative. To avoid the loss of precision, we need to take the absolute
2547     // value after truncating and flip the result back based on the original
2548     // signedness.
2549 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2550 Trunc = B.buildFAbs(S32, Trunc, Flags);
2551 }
2552 MachineInstrBuilder K0, K1;
2553 if (SrcLT == S64) {
2554 K0 = B.buildFConstant(
2555 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2556 K1 = B.buildFConstant(
2557 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2558 } else {
2559 K0 = B.buildFConstant(
2560 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2561 K1 = B.buildFConstant(
2562 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2563 }
2564
2565 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2566 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2567 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2568
2569 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2570 : B.buildFPTOUI(S32, FloorMul);
2571 auto Lo = B.buildFPTOUI(S32, Fma);
2572
2573 if (Signed && SrcLT == S32) {
2574 // Flip the result based on the signedness, which is either all 0s or 1s.
2575 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2576 // r := xor({lo, hi}, sign) - sign;
2577 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2578 Sign);
2579 } else
2580 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2581 MI.eraseFromParent();
2582
2583 return true;
2584 }
2585
2586 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2587 MachineInstr &MI) const {
2588 MachineFunction &MF = Helper.MIRBuilder.getMF();
2589 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2590
2591 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2592 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2593
2594 // With ieee_mode disabled, the instructions have the correct behavior
2595 // already for G_FMINNUM/G_FMAXNUM
2596 if (!MFI->getMode().IEEE)
2597 return !IsIEEEOp;
2598
2599 if (IsIEEEOp)
2600 return true;
2601
2602 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2603 }
2604
2605 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2606 MachineInstr &MI, MachineRegisterInfo &MRI,
2607 MachineIRBuilder &B) const {
2608 // TODO: Should move some of this into LegalizerHelper.
2609
2610 // TODO: Promote dynamic indexing of s16 to s32
2611
2612 Register Dst = MI.getOperand(0).getReg();
2613 Register Vec = MI.getOperand(1).getReg();
2614
2615 LLT VecTy = MRI.getType(Vec);
2616 LLT EltTy = VecTy.getElementType();
2617 assert(EltTy == MRI.getType(Dst));
2618
2619 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2620   // but we can't go directly to that logic because you can't bitcast a vector
2621 // of pointers to a vector of integers. Therefore, introduce an intermediate
2622 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2623 // drive the legalization forward.
2624 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2625 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2626 LLT IntVecTy = VecTy.changeElementType(IntTy);
2627
2628 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2629 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2630 B.buildIntToPtr(Dst, IntElt);
2631
2632 MI.eraseFromParent();
2633 return true;
2634 }
2635
2636 // FIXME: Artifact combiner probably should have replaced the truncated
2637 // constant before this, so we shouldn't need
2638 // getIConstantVRegValWithLookThrough.
2639 std::optional<ValueAndVReg> MaybeIdxVal =
2640 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2641 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2642 return true;
2643 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2644
2645 if (IdxVal < VecTy.getNumElements()) {
2646 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2647 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2648 } else {
2649 B.buildUndef(Dst);
2650 }
2651
2652 MI.eraseFromParent();
2653 return true;
2654 }
2655
2656 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2657 MachineInstr &MI, MachineRegisterInfo &MRI,
2658 MachineIRBuilder &B) const {
2659 // TODO: Should move some of this into LegalizerHelper.
2660
2661 // TODO: Promote dynamic indexing of s16 to s32
2662
2663 Register Dst = MI.getOperand(0).getReg();
2664 Register Vec = MI.getOperand(1).getReg();
2665 Register Ins = MI.getOperand(2).getReg();
2666
2667 LLT VecTy = MRI.getType(Vec);
2668 LLT EltTy = VecTy.getElementType();
2669 assert(EltTy == MRI.getType(Ins));
2670
2671 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2672   // but we can't go directly to that logic because you can't bitcast a vector
2673 // of pointers to a vector of integers. Therefore, make the pointer vector
2674 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2675 // new value, and then inttoptr the result vector back. This will then allow
2676 // the rest of legalization to take over.
2677 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2678 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2679 LLT IntVecTy = VecTy.changeElementType(IntTy);
2680
2681 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2682 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2683 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2684 MI.getOperand(3));
2685 B.buildIntToPtr(Dst, IntVecDest);
2686 MI.eraseFromParent();
2687 return true;
2688 }
2689
2690 // FIXME: Artifact combiner probably should have replaced the truncated
2691 // constant before this, so we shouldn't need
2692 // getIConstantVRegValWithLookThrough.
2693 std::optional<ValueAndVReg> MaybeIdxVal =
2694 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2695 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2696 return true;
2697
2698 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2699
2700 unsigned NumElts = VecTy.getNumElements();
2701 if (IdxVal < NumElts) {
2702 SmallVector<Register, 8> SrcRegs;
2703 for (unsigned i = 0; i < NumElts; ++i)
2704 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2705 B.buildUnmerge(SrcRegs, Vec);
2706
2707 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2708 B.buildMergeLikeInstr(Dst, SrcRegs);
2709 } else {
2710 B.buildUndef(Dst);
2711 }
2712
2713 MI.eraseFromParent();
2714 return true;
2715 }
2716
2717 bool AMDGPULegalizerInfo::legalizeSinCos(
2718 MachineInstr &MI, MachineRegisterInfo &MRI,
2719 MachineIRBuilder &B) const {
2720
2721 Register DstReg = MI.getOperand(0).getReg();
2722 Register SrcReg = MI.getOperand(1).getReg();
2723 LLT Ty = MRI.getType(DstReg);
2724 unsigned Flags = MI.getFlags();
2725
2726 Register TrigVal;
2727 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
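  // The AMDGPU sin/cos intrinsics take an argument pre-scaled so that one
  // full period maps onto 1.0, hence the multiply by 1/(2*pi). On subtargets
  // with a reduced valid input range, fract() of the scaled value is taken
  // first to fold the argument into that range.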
2728 if (ST.hasTrigReducedRange()) {
2729 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2730 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2731 .addUse(MulVal.getReg(0))
2732 .setMIFlags(Flags)
2733 .getReg(0);
2734 } else
2735 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2736
2737 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2738 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2739 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2740 .addUse(TrigVal)
2741 .setMIFlags(Flags);
2742 MI.eraseFromParent();
2743 return true;
2744 }
2745
2746 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2747 MachineIRBuilder &B,
2748 const GlobalValue *GV,
2749 int64_t Offset,
2750 unsigned GAFlags) const {
2751 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2752 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2753 // to the following code sequence:
2754 //
2755 // For constant address space:
2756 // s_getpc_b64 s[0:1]
2757 // s_add_u32 s0, s0, $symbol
2758 // s_addc_u32 s1, s1, 0
2759 //
2760 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2761 // a fixup or relocation is emitted to replace $symbol with a literal
2762 // constant, which is a pc-relative offset from the encoding of the $symbol
2763 // operand to the global variable.
2764 //
2765 // For global address space:
2766 // s_getpc_b64 s[0:1]
2767 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2768 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2769 //
2770 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2771 // fixups or relocations are emitted to replace $symbol@*@lo and
2772 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2773 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2774 // operand to the global variable.
2775
2776 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2777
2778 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2779 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2780
2781 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2782 .addDef(PCReg);
2783
2784 MIB.addGlobalAddress(GV, Offset, GAFlags);
2785 if (GAFlags == SIInstrInfo::MO_NONE)
2786 MIB.addImm(0);
2787 else
2788 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2789
2790 if (!B.getMRI()->getRegClassOrNull(PCReg))
2791 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2792
2793 if (PtrTy.getSizeInBits() == 32)
2794 B.buildExtract(DstReg, PCReg, 0);
2795 return true;
2796 }
2797
2798 // Emit an ABS32_LO / ABS32_HI relocation stub.
2799 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2800 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2801 MachineRegisterInfo &MRI) const {
2802 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2803
2804 LLT S32 = LLT::scalar(32);
2805
2806   // Use the destination directly if and only if we only store the lower
2807   // address part and no register class has been set.
2808 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2809 ? DstReg
2810 : MRI.createGenericVirtualRegister(S32);
2811
2812 if (!MRI.getRegClassOrNull(AddrLo))
2813 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2814
2815 // Write the lower half.
2816 B.buildInstr(AMDGPU::S_MOV_B32)
2817 .addDef(AddrLo)
2818 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2819
2820 // If required, write the upper half as well.
2821 if (RequiresHighHalf) {
2822 assert(PtrTy.getSizeInBits() == 64 &&
2823 "Must provide a 64-bit pointer type!");
2824
2825 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2826 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2827
2828 B.buildInstr(AMDGPU::S_MOV_B32)
2829 .addDef(AddrHi)
2830 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2831
2832 // Use the destination directly, if and only if we don't have a register
2833 // class being set.
2834 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2835 ? DstReg
2836 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2837
2838 if (!MRI.getRegClassOrNull(AddrDst))
2839 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2840
2841 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2842
2843 // If we created a new register for the destination, cast the result into
2844 // the final output.
2845 if (AddrDst != DstReg)
2846 B.buildCast(DstReg, AddrDst);
2847 } else if (AddrLo != DstReg) {
2848 // If we created a new register for the destination, cast the result into
2849 // the final output.
2850 B.buildCast(DstReg, AddrLo);
2851 }
2852 }
2853
2854 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2855 MachineInstr &MI, MachineRegisterInfo &MRI,
2856 MachineIRBuilder &B) const {
2857 Register DstReg = MI.getOperand(0).getReg();
2858 LLT Ty = MRI.getType(DstReg);
2859 unsigned AS = Ty.getAddressSpace();
2860
2861 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2862 MachineFunction &MF = B.getMF();
2863 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2864
2865 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2866 if (!MFI->isModuleEntryFunction() &&
2867 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2868 const Function &Fn = MF.getFunction();
2869 DiagnosticInfoUnsupported BadLDSDecl(
2870 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2871 DS_Warning);
2872 Fn.getContext().diagnose(BadLDSDecl);
2873
2874 // We currently don't have a way to correctly allocate LDS objects that
2875 // aren't directly associated with a kernel. We do force inlining of
2876 // functions that use local objects. However, if these dead functions are
2877 // not eliminated, we don't want a compile time error. Just emit a warning
2878 // and a trap, since there should be no callable path here.
2879 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
2880 B.buildUndef(DstReg);
2881 MI.eraseFromParent();
2882 return true;
2883 }
2884
2885 // TODO: We could emit code to handle the initialization somewhere.
2886 // We ignore the initializer for now and legalize it to allow selection.
2887     // The initializer will be rejected during assembly emission anyway.
2888 const SITargetLowering *TLI = ST.getTargetLowering();
2889 if (!TLI->shouldUseLDSConstAddress(GV)) {
2890 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2891 return true; // Leave in place;
2892 }
2893
2894 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2895 Type *Ty = GV->getValueType();
2896 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2897 // zero-sized type in other languages to declare the dynamic shared
2898       // memory whose size is not known at compile time. They will be
2899 // allocated by the runtime and placed directly after the static
2900 // allocated ones. They all share the same offset.
2901 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2902 // Adjust alignment for that dynamic shared memory array.
2903 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2904 LLT S32 = LLT::scalar(32);
2905 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2906 B.buildIntToPtr(DstReg, Sz);
2907 MI.eraseFromParent();
2908 return true;
2909 }
2910 }
2911
2912 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2913 *cast<GlobalVariable>(GV)));
2914 MI.eraseFromParent();
2915 return true;
2916 }
2917
2918 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2919 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
2920 MI.eraseFromParent();
2921 return true;
2922 }
2923
2924 const SITargetLowering *TLI = ST.getTargetLowering();
2925
2926 if (TLI->shouldEmitFixup(GV)) {
2927 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2928 MI.eraseFromParent();
2929 return true;
2930 }
2931
2932 if (TLI->shouldEmitPCReloc(GV)) {
2933 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2934 MI.eraseFromParent();
2935 return true;
2936 }
2937
2938 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2939 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2940
2941 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2942 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2943 MachinePointerInfo::getGOT(MF),
2944 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2945 MachineMemOperand::MOInvariant,
2946 LoadTy, Align(8));
2947
2948 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2949
2950 if (Ty.getSizeInBits() == 32) {
2951 // Truncate if this is a 32-bit constant address.
2952 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2953 B.buildExtract(DstReg, Load, 0);
2954 } else
2955 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2956
2957 MI.eraseFromParent();
2958 return true;
2959 }
2960
2961 static LLT widenToNextPowerOf2(LLT Ty) {
2962 if (Ty.isVector())
2963 return Ty.changeElementCount(
2964 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2965 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2966 }
2967
2968 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2969 MachineInstr &MI) const {
2970 MachineIRBuilder &B = Helper.MIRBuilder;
2971 MachineRegisterInfo &MRI = *B.getMRI();
2972 GISelChangeObserver &Observer = Helper.Observer;
2973
2974 Register PtrReg = MI.getOperand(1).getReg();
2975 LLT PtrTy = MRI.getType(PtrReg);
2976 unsigned AddrSpace = PtrTy.getAddressSpace();
2977
2978 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2979 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2980 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2981 Observer.changingInstr(MI);
2982 MI.getOperand(1).setReg(Cast.getReg(0));
2983 Observer.changedInstr(MI);
2984 return true;
2985 }
2986
2987 if (MI.getOpcode() != AMDGPU::G_LOAD)
2988 return false;
2989
2990 Register ValReg = MI.getOperand(0).getReg();
2991 LLT ValTy = MRI.getType(ValReg);
2992
2993 if (hasBufferRsrcWorkaround(ValTy)) {
2994 Observer.changingInstr(MI);
2995 castBufferRsrcFromV4I32(MI, B, MRI, 0);
2996 Observer.changedInstr(MI);
2997 return true;
2998 }
2999
3000 MachineMemOperand *MMO = *MI.memoperands_begin();
3001 const unsigned ValSize = ValTy.getSizeInBits();
3002 const LLT MemTy = MMO->getMemoryType();
3003 const Align MemAlign = MMO->getAlign();
3004 const unsigned MemSize = MemTy.getSizeInBits();
3005 const uint64_t AlignInBits = 8 * MemAlign.value();
3006
3007 // Widen non-power-of-2 loads to the alignment if needed
3008 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3009 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
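    // e.g. a <3 x s32> load has MemSize = 96 and WideMemSize = 128; either
    // the extending-load case below only needs a wider MMO, or the value is
    // loaded as <4 x s32> and the original <3 x s32> is extracted from it.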
3010
3011 // This was already the correct extending load result type, so just adjust
3012 // the memory type.
3013 if (WideMemSize == ValSize) {
3014 MachineFunction &MF = B.getMF();
3015
3016 MachineMemOperand *WideMMO =
3017 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3018 Observer.changingInstr(MI);
3019 MI.setMemRefs(MF, {WideMMO});
3020 Observer.changedInstr(MI);
3021 return true;
3022 }
3023
3024     // Don't bother handling this edge case, which should probably never be produced.
3025 if (ValSize > WideMemSize)
3026 return false;
3027
3028 LLT WideTy = widenToNextPowerOf2(ValTy);
3029
3030 Register WideLoad;
3031 if (!WideTy.isVector()) {
3032 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3033 B.buildTrunc(ValReg, WideLoad).getReg(0);
3034 } else {
3035 // Extract the subvector.
3036
3037 if (isRegisterType(ValTy)) {
3038 // If this a case where G_EXTRACT is legal, use it.
3039 // (e.g. <3 x s32> -> <4 x s32>)
3040 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3041 B.buildExtract(ValReg, WideLoad, 0);
3042 } else {
3043 // For cases where the widened type isn't a nice register value, unmerge
3044 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3045 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3046 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3047 }
3048 }
3049
3050 MI.eraseFromParent();
3051 return true;
3052 }
3053
3054 return false;
3055 }
3056
3057 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3058 MachineInstr &MI) const {
3059 MachineIRBuilder &B = Helper.MIRBuilder;
3060 MachineRegisterInfo &MRI = *B.getMRI();
3061 GISelChangeObserver &Observer = Helper.Observer;
3062
3063 Register DataReg = MI.getOperand(0).getReg();
3064 LLT DataTy = MRI.getType(DataReg);
3065
3066 if (hasBufferRsrcWorkaround(DataTy)) {
3067 Observer.changingInstr(MI);
3068 castBufferRsrcArgToV4I32(MI, B, 0);
3069 Observer.changedInstr(MI);
3070 return true;
3071 }
3072 return false;
3073 }
3074
3075 bool AMDGPULegalizerInfo::legalizeFMad(
3076 MachineInstr &MI, MachineRegisterInfo &MRI,
3077 MachineIRBuilder &B) const {
3078 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3079 assert(Ty.isScalar());
3080
3081 MachineFunction &MF = B.getMF();
3082 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3083
3084 // TODO: Always legal with future ftz flag.
3085 // FIXME: Do we need just output?
3086 if (Ty == LLT::float32() &&
3087 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3088 return true;
3089 if (Ty == LLT::float16() &&
3090 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3091 return true;
3092
3093 MachineIRBuilder HelperBuilder(MI);
3094 GISelObserverWrapper DummyObserver;
3095 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3096 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3097 }
3098
3099 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3100 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3101 Register DstReg = MI.getOperand(0).getReg();
3102 Register PtrReg = MI.getOperand(1).getReg();
3103 Register CmpVal = MI.getOperand(2).getReg();
3104 Register NewVal = MI.getOperand(3).getReg();
3105
3106 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3107 "this should not have been custom lowered");
3108
3109 LLT ValTy = MRI.getType(CmpVal);
3110 LLT VecTy = LLT::fixed_vector(2, ValTy);
3111
3112 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3113
3114 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3115 .addDef(DstReg)
3116 .addUse(PtrReg)
3117 .addUse(PackedVal)
3118 .setMemRefs(MI.memoperands());
3119
3120 MI.eraseFromParent();
3121 return true;
3122 }
3123
3124 /// Return true if it's known that \p Src can never be an f32 denormal value.
3125 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3126 Register Src) {
3127 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3128 switch (DefMI->getOpcode()) {
3129 case TargetOpcode::G_INTRINSIC: {
3130 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3131 case Intrinsic::amdgcn_frexp_mant:
3132 return true;
3133 default:
3134 break;
3135 }
3136
3137 break;
3138 }
3139 case TargetOpcode::G_FFREXP: {
3140 if (DefMI->getOperand(0).getReg() == Src)
3141 return true;
3142 break;
3143 }
3144 case TargetOpcode::G_FPEXT: {
3145 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3146 }
3147 default:
3148 return false;
3149 }
3150
3151 return false;
3152 }
3153
3154 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3155 if (Flags & MachineInstr::FmAfn)
3156 return true;
3157 const auto &Options = MF.getTarget().Options;
3158 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3159 }
3160
3161 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3162 unsigned Flags) {
3163 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3164 MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3165 DenormalMode::PreserveSign;
3166 }
3167
3168 std::pair<Register, Register>
3169 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3170 unsigned Flags) const {
3171 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3172 return {};
3173
3174 const LLT F32 = LLT::scalar(32);
3175 auto SmallestNormal = B.buildFConstant(
3176 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3177 auto IsLtSmallestNormal =
3178 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3179
3180 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3181 auto One = B.buildFConstant(F32, 1.0);
3182 auto ScaleFactor =
3183 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3184 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3185
3186 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3187 }
3188
3189 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3190 MachineIRBuilder &B) const {
3191 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3192 // If we have to handle denormals, scale up the input and adjust the result.
3193
3194 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3195 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
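// Note: since log2(x * 2^32) == log2(x) + 32, subtracting the selected 32.0
// offset from the hardware log of the scaled input recovers log2(x) for
// denormal inputs.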
3196
3197 Register Dst = MI.getOperand(0).getReg();
3198 Register Src = MI.getOperand(1).getReg();
3199 LLT Ty = B.getMRI()->getType(Dst);
3200 unsigned Flags = MI.getFlags();
3201
3202 if (Ty == LLT::scalar(16)) {
3203 const LLT F32 = LLT::scalar(32);
3204 // Nothing in half is a denormal when promoted to f32.
3205 auto Ext = B.buildFPExt(F32, Src, Flags);
3206 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3207 .addUse(Ext.getReg(0))
3208 .setMIFlags(Flags);
3209 B.buildFPTrunc(Dst, Log2, Flags);
3210 MI.eraseFromParent();
3211 return true;
3212 }
3213
3214 assert(Ty == LLT::scalar(32));
3215
3216 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3217 if (!ScaledInput) {
3218 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3219 .addUse(Src)
3220 .setMIFlags(Flags);
3221 MI.eraseFromParent();
3222 return true;
3223 }
3224
3225 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3226 .addUse(ScaledInput)
3227 .setMIFlags(Flags);
3228
3229 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3230 auto Zero = B.buildFConstant(Ty, 0.0);
3231 auto ResultOffset =
3232 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3233 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3234
3235 MI.eraseFromParent();
3236 return true;
3237 }
3238
3239 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3240 Register Z, unsigned Flags) {
3241 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3242 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3243 }
3244
3245 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3246 MachineIRBuilder &B) const {
3247 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3248 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3249
3250 MachineRegisterInfo &MRI = *B.getMRI();
3251 Register Dst = MI.getOperand(0).getReg();
3252 Register X = MI.getOperand(1).getReg();
3253 unsigned Flags = MI.getFlags();
3254 const LLT Ty = MRI.getType(X);
3255 MachineFunction &MF = B.getMF();
3256
3257 const LLT F32 = LLT::scalar(32);
3258 const LLT F16 = LLT::scalar(16);
3259
3260 const AMDGPUTargetMachine &TM =
3261 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3262
3263 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3264 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3265 if (Ty == F16 && !ST.has16BitInsts()) {
3266 Register LogVal = MRI.createGenericVirtualRegister(F32);
3267 auto PromoteSrc = B.buildFPExt(F32, X);
3268 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3269 B.buildFPTrunc(Dst, LogVal);
3270 } else {
3271 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3272 }
3273
3274 MI.eraseFromParent();
3275 return true;
3276 }
3277
3278 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3279 if (ScaledInput)
3280 X = ScaledInput;
3281
3282 auto Y =
3283 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3284
3285 Register R;
3286 if (ST.hasFastFMAF32()) {
3287 // c+cc are ln(2)/ln(10) to more than 49 bits
3288 const float c_log10 = 0x1.344134p-2f;
3289 const float cc_log10 = 0x1.09f79ep-26f;
3290
3291 // c + cc is ln(2) to more than 49 bits
3292 const float c_log = 0x1.62e42ep-1f;
3293 const float cc_log = 0x1.efa39ep-25f;
3294
3295 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3296 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3297
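// Compute Y*(C + CC) with error compensation: R holds the head product Y*C,
// and the two FMAs recover its rounding error plus the tail product Y*CC
// before folding both back into R.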
3298 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3299 auto NegR = B.buildFNeg(Ty, R, Flags);
3300 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3301 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3302 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3303 } else {
3304 // ch+ct is ln(2)/ln(10) to more than 36 bits
3305 const float ch_log10 = 0x1.344000p-2f;
3306 const float ct_log10 = 0x1.3509f6p-18f;
3307
3308 // ch + ct is ln(2) to more than 36 bits
3309 const float ch_log = 0x1.62e000p-1f;
3310 const float ct_log = 0x1.0bfbe8p-15f;
3311
3312 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3313 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3314
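// Without fast FMA, split Y into a high part YH (low 12 mantissa bits
// cleared) and a tail YT, then accumulate the partial products YT*CT, YH*CT,
// YT*CH and YH*CH from least to most significant.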
3315 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3316 auto YH = B.buildAnd(Ty, Y, MaskConst);
3317 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3318 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3319
3320 Register Mad0 =
3321 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3322 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3323 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3324 }
3325
3326 const bool IsFiniteOnly =
3327 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3328 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3329
3330 if (!IsFiniteOnly) {
3331 // Expand isfinite(x) => fabs(x) < inf
3332 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3333 auto Fabs = B.buildFAbs(Ty, Y);
3334 auto IsFinite =
3335 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3336 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3337 }
3338
3339 if (ScaledInput) {
3340 auto Zero = B.buildFConstant(Ty, 0.0);
3341 auto ShiftK =
3342 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3343 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3344 B.buildFSub(Dst, R, Shift, Flags);
3345 } else {
3346 B.buildCopy(Dst, R);
3347 }
3348
3349 MI.eraseFromParent();
3350 return true;
3351 }
3352
3353 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3354 Register Src, bool IsLog10,
3355 unsigned Flags) const {
3356 const double Log2BaseInverted =
3357 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
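// log_b(x) = log2(x) * (ln(2) / ln(b)); for the natural log the factor is
// just ln(2).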
3358
3359 LLT Ty = B.getMRI()->getType(Dst);
3360
3361 if (Ty == LLT::scalar(32)) {
3362 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3363 if (ScaledInput) {
3364 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3365 .addUse(Src)
3366 .setMIFlags(Flags);
3367 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3368 auto Zero = B.buildFConstant(Ty, 0.0);
3369 auto ResultOffset =
3370 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3371 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3372
3373 if (ST.hasFastFMAF32())
3374 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3375 else {
3376 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3377 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3378 }
3379
3380 return true;
3381 }
3382 }
3383
3384 auto Log2Operand = Ty == LLT::scalar(16)
3385 ? B.buildFLog2(Ty, Src, Flags)
3386 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3387 .addUse(Src)
3388 .setMIFlags(Flags);
3389 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3390 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3391 return true;
3392 }
3393
3394 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3395 MachineIRBuilder &B) const {
3396 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3397 // If we have to handle denormals, scale up the input and adjust the result.
3398
3399 Register Dst = MI.getOperand(0).getReg();
3400 Register Src = MI.getOperand(1).getReg();
3401 unsigned Flags = MI.getFlags();
3402 LLT Ty = B.getMRI()->getType(Dst);
3403 const LLT F16 = LLT::scalar(16);
3404 const LLT F32 = LLT::scalar(32);
3405
3406 if (Ty == F16) {
3407 // Nothing in half is a denormal when promoted to f32.
3408 auto Ext = B.buildFPExt(F32, Src, Flags);
3409 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3410 .addUse(Ext.getReg(0))
3411 .setMIFlags(Flags);
3412 B.buildFPTrunc(Dst, Log2, Flags);
3413 MI.eraseFromParent();
3414 return true;
3415 }
3416
3417 assert(Ty == F32);
3418
3419 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3420 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3421 .addUse(Src)
3422 .setMIFlags(Flags);
3423 MI.eraseFromParent();
3424 return true;
3425 }
3426
3427 // bool needs_scaling = x < -0x1.f80000p+6f;
3428 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
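// Since exp2(x + 64) == exp2(x) * 2^64, multiplying the result by 2^-64
// undoes the offset; the offset keeps the hardware exp's result out of the
// denormal range for very negative inputs.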
3429
3430 // -nextafter(128.0, -1)
3431 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3432 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3433 RangeCheckConst, Flags);
3434
3435 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3436 auto Zero = B.buildFConstant(Ty, 0.0);
3437 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3438 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3439
3440 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3441 .addUse(AddInput.getReg(0))
3442 .setMIFlags(Flags);
3443
3444 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3445 auto One = B.buildFConstant(Ty, 1.0);
3446 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3447 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3448 MI.eraseFromParent();
3449 return true;
3450 }
3451
3452 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3453 Register X, unsigned Flags) const {
3454 LLT Ty = B.getMRI()->getType(Dst);
3455 LLT F32 = LLT::scalar(32);
3456
3457 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3458 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3459 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3460
3461 if (Ty == F32) {
3462 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3463 .addUse(Mul.getReg(0))
3464 .setMIFlags(Flags);
3465 } else {
3466 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3467 }
3468
3469 return true;
3470 }
3471
3472 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3473 auto NeedsScaling =
3474 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3475 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3476 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3477 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3478
3479 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3480 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3481
3482 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3483 .addUse(ExpInput.getReg(0))
3484 .setMIFlags(Flags);
3485
3486 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3487 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3488 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3489 return true;
3490 }
3491
3492 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3493 MachineIRBuilder &B) const {
3494 Register Dst = MI.getOperand(0).getReg();
3495 Register X = MI.getOperand(1).getReg();
3496 const unsigned Flags = MI.getFlags();
3497 MachineFunction &MF = B.getMF();
3498 MachineRegisterInfo &MRI = *B.getMRI();
3499 LLT Ty = MRI.getType(Dst);
3500 const LLT F16 = LLT::scalar(16);
3501 const LLT F32 = LLT::scalar(32);
3502 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3503
3504 if (Ty == F16) {
3505 // v_exp_f16 (fmul x, log2e)
3506 if (allowApproxFunc(MF, Flags)) {
3507 // TODO: Does this really require fast?
3508 legalizeFExpUnsafe(B, Dst, X, Flags);
3509 MI.eraseFromParent();
3510 return true;
3511 }
3512
3513 // exp(f16 x) ->
3514 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3515
3516 // Nothing in half is a denormal when promoted to f32.
3517 auto Ext = B.buildFPExt(F32, X, Flags);
3518 Register Lowered = MRI.createGenericVirtualRegister(F32);
3519 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3520 B.buildFPTrunc(Dst, Lowered, Flags);
3521 MI.eraseFromParent();
3522 return true;
3523 }
3524
3525 assert(Ty == F32);
3526
3527 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3528 // library behavior. Also, is known-not-daz source sufficient?
3529 if (allowApproxFunc(MF, Flags)) {
3530 legalizeFExpUnsafe(B, Dst, X, Flags);
3531 MI.eraseFromParent();
3532 return true;
3533 }
3534
3535 // Algorithm:
3536 //
3537 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3538 //
3539 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3540 // n = 64*m + j, 0 <= j < 64
3541 //
3542 // e^x = 2^((64*m + j + f)/64)
3543 // = (2^m) * (2^(j/64)) * 2^(f/64)
3544 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3545 //
3546 // f = x*(64/ln(2)) - n
3547 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3548 //
3549 // e^x = (2^m) * (2^(j/64)) * e^r
3550 //
3551 // (2^(j/64)) is precomputed
3552 //
3553 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3554 // e^r = 1 + q
3555 //
3556 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3557 //
3558 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
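//
// The sequence below follows this outline, but instead of a 2^(j/64) table it
// computes x*log2(e) (or x*log2(10) for exp10) in extended precision as
// PH + PL, takes n = roundeven(PH), evaluates the fractional part with the
// hardware exp2, and applies the 2^n scaling with ldexp.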
3559 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3560 Register PH, PL;
3561
3562 if (ST.hasFastFMAF32()) {
3563 const float c_exp = numbers::log2ef;
3564 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3565 const float c_exp10 = 0x1.a934f0p+1f;
3566 const float cc_exp10 = 0x1.2f346ep-24f;
3567
3568 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3569 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3570 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3571 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3572
3573 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3574 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3575 } else {
3576 const float ch_exp = 0x1.714000p+0f;
3577 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3578
3579 const float ch_exp10 = 0x1.a92000p+1f;
3580 const float cl_exp10 = 0x1.4f0978p-11f;
3581
3582 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3583 auto XH = B.buildAnd(Ty, X, MaskConst);
3584 auto XL = B.buildFSub(Ty, X, XH, Flags);
3585
3586 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3587 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3588
3589 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3590 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3591
3592 Register Mad0 =
3593 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3594 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3595 }
3596
3597 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3598
3599 // It is unsafe to contract this fsub into the PH multiply.
3600 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3601 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3602 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3603
3604 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3605 .addUse(A.getReg(0))
3606 .setMIFlags(Flags);
3607 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3608
3609 auto UnderflowCheckConst =
3610 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3611 auto Zero = B.buildFConstant(Ty, 0.0);
3612 auto Underflow =
3613 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3614
3615 R = B.buildSelect(Ty, Underflow, Zero, R);
3616
3617 const auto &Options = MF.getTarget().Options;
3618
3619 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3620 auto OverflowCheckConst =
3621 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3622
3623 auto Overflow =
3624 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3625 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3626 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3627 }
3628
3629 B.buildCopy(Dst, R);
3630 MI.eraseFromParent();
3631 return true;
3632 }
3633
3634 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3635 MachineIRBuilder &B) const {
3636 Register Dst = MI.getOperand(0).getReg();
3637 Register Src0 = MI.getOperand(1).getReg();
3638 Register Src1 = MI.getOperand(2).getReg();
3639 unsigned Flags = MI.getFlags();
3640 LLT Ty = B.getMRI()->getType(Dst);
3641 const LLT F16 = LLT::float16();
3642 const LLT F32 = LLT::float32();
3643
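// pow(x, y) is expanded as exp2(y * log2(x)). fmul_legacy treats a zero
// operand times anything (including inf/nan) as zero, so cases like
// pow(1.0, y) with non-finite y still produce 1.0.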
3644 if (Ty == F32) {
3645 auto Log = B.buildFLog2(F32, Src0, Flags);
3646 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3647 .addUse(Log.getReg(0))
3648 .addUse(Src1)
3649 .setMIFlags(Flags);
3650 B.buildFExp2(Dst, Mul, Flags);
3651 } else if (Ty == F16) {
3652 // There's no f16 fmul_legacy, so we need to convert for it.
3653 auto Log = B.buildFLog2(F16, Src0, Flags);
3654 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3655 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3656 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3657 .addUse(Ext0.getReg(0))
3658 .addUse(Ext1.getReg(0))
3659 .setMIFlags(Flags);
3660 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3661 } else
3662 return false;
3663
3664 MI.eraseFromParent();
3665 return true;
3666 }
3667
3668 // Find a source register, ignoring any possible source modifiers.
3669 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3670 Register ModSrc = OrigSrc;
3671 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3672 ModSrc = SrcFNeg->getOperand(1).getReg();
3673 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3674 ModSrc = SrcFAbs->getOperand(1).getReg();
3675 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3676 ModSrc = SrcFAbs->getOperand(1).getReg();
3677 return ModSrc;
3678 }
3679
3680 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3681 MachineRegisterInfo &MRI,
3682 MachineIRBuilder &B) const {
3683
3684 const LLT S1 = LLT::scalar(1);
3685 const LLT F64 = LLT::float64();
3686 Register Dst = MI.getOperand(0).getReg();
3687 Register OrigSrc = MI.getOperand(1).getReg();
3688 unsigned Flags = MI.getFlags();
3689 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3690 "this should not have been custom lowered");
3691
3692 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3693 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3694 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3695 // V_FRACT bug is:
3696 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3697 //
3698 // Convert floor(x) to (x - fract(x))
3699
3700 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3701 .addUse(OrigSrc)
3702 .setMIFlags(Flags);
3703
3704 // Give source modifier matching some assistance before obscuring a foldable
3705 // pattern.
3706
3707 // TODO: We can avoid the neg on the fract? The input sign to fract
3708 // shouldn't matter?
3709 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3710
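// 0x3fefffffffffffff is the largest double strictly less than 1.0, i.e. the
// clamp value used by the V_FRACT workaround described above.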
3711 auto Const =
3712 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3713
3714 Register Min = MRI.createGenericVirtualRegister(F64);
3715
3716 // We don't need to concern ourselves with the snan handling difference, so
3717 // use the one which will directly select.
3718 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3719 if (MFI->getMode().IEEE)
3720 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3721 else
3722 B.buildFMinNum(Min, Fract, Const, Flags);
3723
3724 Register CorrectedFract = Min;
3725 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3726 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3727 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3728 }
3729
3730 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3731 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3732
3733 MI.eraseFromParent();
3734 return true;
3735 }
3736
3737 // Turn an illegal packed v2s16 build vector into bit operations.
3738 // TODO: This should probably be a bitcast action in LegalizerHelper.
3739 bool AMDGPULegalizerInfo::legalizeBuildVector(
3740 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3741 Register Dst = MI.getOperand(0).getReg();
3742 const LLT S32 = LLT::scalar(32);
3743 const LLT S16 = LLT::scalar(16);
3744 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3745
3746 Register Src0 = MI.getOperand(1).getReg();
3747 Register Src1 = MI.getOperand(2).getReg();
3748
3749 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3750 assert(MRI.getType(Src0) == S32);
3751 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3752 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3753 }
3754
3755 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3756 B.buildBitcast(Dst, Merge);
3757
3758 MI.eraseFromParent();
3759 return true;
3760 }
3761
3762 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3763 //
3764 // Source and accumulation registers must all be 32-bits.
3765 //
3766 // TODO: When the multiply is uniform, we should produce a code sequence
3767 // that is better suited to instruction selection on the SALU. Instead of
3768 // the outer loop going over parts of the result, the outer loop should go
3769 // over parts of one of the factors. This should result in instruction
3770 // selection that makes full use of S_ADDC_U32 instructions.
3771 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3772 MutableArrayRef<Register> Accum,
3773 ArrayRef<Register> Src0,
3774 ArrayRef<Register> Src1,
3775 bool UsePartialMad64_32,
3776 bool SeparateOddAlignedProducts) const {
3777 // Use (possibly empty) vectors of S1 registers to represent the set of
3778 // carries from one pair of positions to the next.
3779 using Carry = SmallVector<Register, 2>;
3780
3781 MachineIRBuilder &B = Helper.MIRBuilder;
3782 GISelKnownBits &KB = *Helper.getKnownBits();
3783
3784 const LLT S1 = LLT::scalar(1);
3785 const LLT S32 = LLT::scalar(32);
3786 const LLT S64 = LLT::scalar(64);
3787
3788 Register Zero32;
3789 Register Zero64;
3790
3791 auto getZero32 = [&]() -> Register {
3792 if (!Zero32)
3793 Zero32 = B.buildConstant(S32, 0).getReg(0);
3794 return Zero32;
3795 };
3796 auto getZero64 = [&]() -> Register {
3797 if (!Zero64)
3798 Zero64 = B.buildConstant(S64, 0).getReg(0);
3799 return Zero64;
3800 };
3801
3802 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3803 for (unsigned i = 0; i < Src0.size(); ++i) {
3804 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3805 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3806 }
3807
3808 // Merge the given carries into the 32-bit LocalAccum, which is modified
3809 // in-place.
3810 //
3811 // Returns the carry-out, which is a single S1 register or null.
3812 auto mergeCarry =
3813 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3814 if (CarryIn.empty())
3815 return Register();
3816
3817 bool HaveCarryOut = true;
3818 Register CarryAccum;
3819 if (CarryIn.size() == 1) {
3820 if (!LocalAccum) {
3821 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3822 return Register();
3823 }
3824
3825 CarryAccum = getZero32();
3826 } else {
3827 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3828 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3829 CarryAccum =
3830 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3831 .getReg(0);
3832 }
3833
3834 if (!LocalAccum) {
3835 LocalAccum = getZero32();
3836 HaveCarryOut = false;
3837 }
3838 }
3839
3840 auto Add =
3841 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3842 LocalAccum = Add.getReg(0);
3843 return HaveCarryOut ? Add.getReg(1) : Register();
3844 };
3845
3846 // Build a multiply-add chain to compute
3847 //
3848 // LocalAccum + (partial products at DstIndex)
3849 // + (opportunistic subset of CarryIn)
3850 //
3851 // LocalAccum is an array of one or two 32-bit registers that are updated
3852 // in-place. The incoming registers may be null.
3853 //
3854 // In some edge cases, carry-ins can be consumed "for free". In that case,
3855 // the consumed carry bits are removed from CarryIn in-place.
3856 auto buildMadChain =
3857 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3858 -> Carry {
3859 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3860 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3861
3862 Carry CarryOut;
3863 unsigned j0 = 0;
3864
3865 // Use plain 32-bit multiplication for the most significant part of the
3866 // result by default.
3867 if (LocalAccum.size() == 1 &&
3868 (!UsePartialMad64_32 || !CarryIn.empty())) {
3869 do {
3870 // Skip multiplication if one of the operands is 0
3871 unsigned j1 = DstIndex - j0;
3872 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3873 ++j0;
3874 continue;
3875 }
3876 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3877 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3878 LocalAccum[0] = Mul.getReg(0);
3879 } else {
3880 if (CarryIn.empty()) {
3881 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3882 } else {
3883 LocalAccum[0] =
3884 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3885 .getReg(0);
3886 CarryIn.pop_back();
3887 }
3888 }
3889 ++j0;
3890 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3891 }
3892
3893 // Build full 64-bit multiplies.
3894 if (j0 <= DstIndex) {
3895 bool HaveSmallAccum = false;
3896 Register Tmp;
3897
3898 if (LocalAccum[0]) {
3899 if (LocalAccum.size() == 1) {
3900 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3901 HaveSmallAccum = true;
3902 } else if (LocalAccum[1]) {
3903 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3904 HaveSmallAccum = false;
3905 } else {
3906 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3907 HaveSmallAccum = true;
3908 }
3909 } else {
3910 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3911 Tmp = getZero64();
3912 HaveSmallAccum = true;
3913 }
3914
3915 do {
3916 unsigned j1 = DstIndex - j0;
3917 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3918 ++j0;
3919 continue;
3920 }
3921 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3922 {Src0[j0], Src1[j1], Tmp});
3923 Tmp = Mad.getReg(0);
3924 if (!HaveSmallAccum)
3925 CarryOut.push_back(Mad.getReg(1));
3926 HaveSmallAccum = false;
3927
3928 ++j0;
3929 } while (j0 <= DstIndex);
3930
3931 auto Unmerge = B.buildUnmerge(S32, Tmp);
3932 LocalAccum[0] = Unmerge.getReg(0);
3933 if (LocalAccum.size() > 1)
3934 LocalAccum[1] = Unmerge.getReg(1);
3935 }
3936
3937 return CarryOut;
3938 };
3939
3940 // Outer multiply loop, iterating over destination parts from least
3941 // significant to most significant parts.
3942 //
3943 // The columns of the following diagram correspond to the destination parts
3944 // affected by one iteration of the outer loop (ignoring boundary
3945 // conditions).
3946 //
3947 // Dest index relative to 2 * i: 1 0 -1
3948 // ------
3949 // Carries from previous iteration: e o
3950 // Even-aligned partial product sum: E E .
3951 // Odd-aligned partial product sum: O O
3952 //
3953 // 'o' is OddCarry, 'e' is EvenCarry.
3954 // EE and OO are computed from partial products via buildMadChain and use
3955 // accumulation where possible and appropriate.
3956 //
3957 Register SeparateOddCarry;
3958 Carry EvenCarry;
3959 Carry OddCarry;
3960
3961 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3962 Carry OddCarryIn = std::move(OddCarry);
3963 Carry EvenCarryIn = std::move(EvenCarry);
3964 OddCarry.clear();
3965 EvenCarry.clear();
3966
3967 // Partial products at offset 2 * i.
3968 if (2 * i < Accum.size()) {
3969 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3970 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3971 }
3972
3973 // Partial products at offset 2 * i - 1.
3974 if (i > 0) {
3975 if (!SeparateOddAlignedProducts) {
3976 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3977 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3978 } else {
3979 bool IsHighest = 2 * i >= Accum.size();
3980 Register SeparateOddOut[2];
3981 auto LocalAccum = MutableArrayRef(SeparateOddOut)
3982 .take_front(IsHighest ? 1 : 2);
3983 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3984
3985 MachineInstr *Lo;
3986
3987 if (i == 1) {
3988 if (!IsHighest)
3989 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3990 else
3991 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3992 } else {
3993 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3994 SeparateOddCarry);
3995 }
3996 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3997
3998 if (!IsHighest) {
3999 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4000 Lo->getOperand(1).getReg());
4001 Accum[2 * i] = Hi.getReg(0);
4002 SeparateOddCarry = Hi.getReg(1);
4003 }
4004 }
4005 }
4006
4007 // Add in the carries from the previous iteration
4008 if (i > 0) {
4009 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4010 EvenCarryIn.push_back(CarryOut);
4011
4012 if (2 * i < Accum.size()) {
4013 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4014 OddCarry.push_back(CarryOut);
4015 }
4016 }
4017 }
4018 }
4019
4020 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4021 //
4022 // TODO: If the multiply is followed by an addition, we should attempt to
4023 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4024 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4025 MachineInstr &MI) const {
4026 assert(ST.hasMad64_32());
4027 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4028
4029 MachineIRBuilder &B = Helper.MIRBuilder;
4030 MachineRegisterInfo &MRI = *B.getMRI();
4031
4032 Register DstReg = MI.getOperand(0).getReg();
4033 Register Src0 = MI.getOperand(1).getReg();
4034 Register Src1 = MI.getOperand(2).getReg();
4035
4036 LLT Ty = MRI.getType(DstReg);
4037 assert(Ty.isScalar());
4038
4039 unsigned Size = Ty.getSizeInBits();
4040 unsigned NumParts = Size / 32;
4041 assert((Size % 32) == 0);
4042 assert(NumParts >= 2);
4043
4044 // Whether to use MAD_64_32 for partial products whose high half is
4045 // discarded. This avoids some ADD instructions but risks false dependency
4046 // stalls on some subtargets in some cases.
4047 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4048
4049 // Whether to compute odd-aligned partial products separately. This is
4050 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4051 // in an even-aligned VGPR.
4052 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4053
4054 LLT S32 = LLT::scalar(32);
4055 SmallVector<Register, 2> Src0Parts, Src1Parts;
4056 for (unsigned i = 0; i < NumParts; ++i) {
4057 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4058 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4059 }
4060 B.buildUnmerge(Src0Parts, Src0);
4061 B.buildUnmerge(Src1Parts, Src1);
4062
4063 SmallVector<Register, 2> AccumRegs(NumParts);
4064 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4065 SeparateOddAlignedProducts);
4066
4067 B.buildMergeLikeInstr(DstReg, AccumRegs);
4068 MI.eraseFromParent();
4069 return true;
4070 }
4071
4072 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4073 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4074 // case with a single min instruction instead of a compare+select.
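// The FFBH/FFBL pseudos return -1 (all ones) for a zero input, so clamping the
// result with umin(result, source bit width) yields the bit width for zero,
// matching the generic ctlz/cttz semantics.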
4075 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4076 MachineRegisterInfo &MRI,
4077 MachineIRBuilder &B) const {
4078 Register Dst = MI.getOperand(0).getReg();
4079 Register Src = MI.getOperand(1).getReg();
4080 LLT DstTy = MRI.getType(Dst);
4081 LLT SrcTy = MRI.getType(Src);
4082
4083 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4084 ? AMDGPU::G_AMDGPU_FFBH_U32
4085 : AMDGPU::G_AMDGPU_FFBL_B32;
4086 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4087 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4088
4089 MI.eraseFromParent();
4090 return true;
4091 }
4092
4093 // Check that this is a G_XOR x, -1
4094 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4095 if (MI.getOpcode() != TargetOpcode::G_XOR)
4096 return false;
4097 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4098 return ConstVal && *ConstVal == -1;
4099 }
4100
4101 // Return the use branch instruction, otherwise null if the usage is invalid.
4102 static MachineInstr *
4103 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4104 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4105 Register CondDef = MI.getOperand(0).getReg();
4106 if (!MRI.hasOneNonDBGUse(CondDef))
4107 return nullptr;
4108
4109 MachineBasicBlock *Parent = MI.getParent();
4110 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4111
4112 if (isNot(MRI, *UseMI)) {
4113 Register NegatedCond = UseMI->getOperand(0).getReg();
4114 if (!MRI.hasOneNonDBGUse(NegatedCond))
4115 return nullptr;
4116
4117 // We're deleting the def of this value, so we need to remove it.
4118 eraseInstr(*UseMI, MRI);
4119
4120 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4121 Negated = true;
4122 }
4123
4124 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4125 return nullptr;
4126
4127 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4128 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4129 if (Next == Parent->end()) {
4130 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4131 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4132 return nullptr;
4133 UncondBrTarget = &*NextMBB;
4134 } else {
4135 if (Next->getOpcode() != AMDGPU::G_BR)
4136 return nullptr;
4137 Br = &*Next;
4138 UncondBrTarget = Br->getOperand(0).getMBB();
4139 }
4140
4141 return UseMI;
4142 }
4143
4144 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4145 const ArgDescriptor *Arg,
4146 const TargetRegisterClass *ArgRC,
4147 LLT ArgTy) const {
4148 MCRegister SrcReg = Arg->getRegister();
4149 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4150 assert(DstReg.isVirtual() && "Virtual register expected");
4151
4152 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4153 *ArgRC, B.getDebugLoc(), ArgTy);
4154 if (Arg->isMasked()) {
4155 // TODO: Should we try to emit this once in the entry block?
4156 const LLT S32 = LLT::scalar(32);
4157 const unsigned Mask = Arg->getMask();
4158 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4159
4160 Register AndMaskSrc = LiveIn;
4161
4162 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4163 // 0.
4164 if (Shift != 0) {
4165 auto ShiftAmt = B.buildConstant(S32, Shift);
4166 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4167 }
4168
4169 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4170 } else {
4171 B.buildCopy(DstReg, LiveIn);
4172 }
4173
4174 return true;
4175 }
4176
4177 bool AMDGPULegalizerInfo::loadInputValue(
4178 Register DstReg, MachineIRBuilder &B,
4179 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4180 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4181 const ArgDescriptor *Arg = nullptr;
4182 const TargetRegisterClass *ArgRC;
4183 LLT ArgTy;
4184
4185 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4186 const ArgDescriptor WorkGroupIDX =
4187 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4188 // If GridZ is not programmed in an entry function then the hardware will set
4189 // it to all zeros, so there is no need to mask the GridY value in the low
4190 // order bits.
4191 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4192 AMDGPU::TTMP7,
4193 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4194 const ArgDescriptor WorkGroupIDZ =
4195 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4196 if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
4197 switch (ArgType) {
4198 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4199 Arg = &WorkGroupIDX;
4200 ArgRC = &AMDGPU::SReg_32RegClass;
4201 ArgTy = LLT::scalar(32);
4202 break;
4203 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4204 Arg = &WorkGroupIDY;
4205 ArgRC = &AMDGPU::SReg_32RegClass;
4206 ArgTy = LLT::scalar(32);
4207 break;
4208 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4209 Arg = &WorkGroupIDZ;
4210 ArgRC = &AMDGPU::SReg_32RegClass;
4211 ArgTy = LLT::scalar(32);
4212 break;
4213 default:
4214 break;
4215 }
4216 }
4217
4218 if (!Arg)
4219 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4220
4221 if (!Arg) {
4222 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4223 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4224 // case the pointer argument may be missing and we use null.
4225 B.buildConstant(DstReg, 0);
4226 return true;
4227 }
4228
4229 // It's undefined behavior if a function marked with the amdgpu-no-*
4230 // attributes uses the corresponding intrinsic.
4231 B.buildUndef(DstReg);
4232 return true;
4233 }
4234
4235 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4236 return false; // TODO: Handle these
4237 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4238 }
4239
4240 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4241 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4242 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4243 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4244 return false;
4245
4246 MI.eraseFromParent();
4247 return true;
4248 }
4249
4250 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4251 int64_t C) {
4252 B.buildConstant(MI.getOperand(0).getReg(), C);
4253 MI.eraseFromParent();
4254 return true;
4255 }
4256
4257 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4258 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4259 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4260 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4261 if (MaxID == 0)
4262 return replaceWithConstant(B, MI, 0);
4263
4264 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4265 const ArgDescriptor *Arg;
4266 const TargetRegisterClass *ArgRC;
4267 LLT ArgTy;
4268 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4269
4270 Register DstReg = MI.getOperand(0).getReg();
4271 if (!Arg) {
4272 // It's undefined behavior if a function marked with the amdgpu-no-*
4273 // attributes uses the corresponding intrinsic.
4274 B.buildUndef(DstReg);
4275 MI.eraseFromParent();
4276 return true;
4277 }
4278
4279 if (Arg->isMasked()) {
4280 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4281 // masking operations anyway.
4282 //
4283 // TODO: We could assert the top bit is 0 for the source copy.
4284 if (!loadInputValue(DstReg, B, ArgType))
4285 return false;
4286 } else {
4287 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4288 if (!loadInputValue(TmpReg, B, ArgType))
4289 return false;
4290 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4291 }
4292
4293 MI.eraseFromParent();
4294 return true;
4295 }
4296
4297 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4298 int64_t Offset) const {
4299 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4300 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4301
4302 // TODO: If we passed in the base kernel offset we could have a better
4303 // alignment than 4, but we don't really need it.
4304 if (!loadInputValue(KernArgReg, B,
4305 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4306 llvm_unreachable("failed to find kernarg segment ptr");
4307
4308 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4309 // TODO: Should get nuw
4310 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4311 }
4312
4313 /// Legalize a value that's loaded from kernel arguments. This is only used by
4314 /// legacy intrinsics.
4315 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4316 MachineIRBuilder &B,
4317 uint64_t Offset,
4318 Align Alignment) const {
4319 Register DstReg = MI.getOperand(0).getReg();
4320
4321 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4322 "unexpected kernarg parameter type");
4323
4324 Register Ptr = getKernargParameterPtr(B, Offset);
4325 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4326 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4327 MachineMemOperand::MODereferenceable |
4328 MachineMemOperand::MOInvariant);
4329 MI.eraseFromParent();
4330 return true;
4331 }
4332
4333 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4334 MachineRegisterInfo &MRI,
4335 MachineIRBuilder &B) const {
4336 Register Dst = MI.getOperand(0).getReg();
4337 LLT DstTy = MRI.getType(Dst);
4338 LLT S16 = LLT::scalar(16);
4339 LLT S32 = LLT::scalar(32);
4340 LLT S64 = LLT::scalar(64);
4341
4342 if (DstTy == S16)
4343 return legalizeFDIV16(MI, MRI, B);
4344 if (DstTy == S32)
4345 return legalizeFDIV32(MI, MRI, B);
4346 if (DstTy == S64)
4347 return legalizeFDIV64(MI, MRI, B);
4348
4349 return false;
4350 }
4351
4352 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4353 Register DstDivReg,
4354 Register DstRemReg,
4355 Register X,
4356 Register Y) const {
4357 const LLT S1 = LLT::scalar(1);
4358 const LLT S32 = LLT::scalar(32);
4359
4360 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4361 // algorithm used here.
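  //
  // In short: form a reciprocal estimate Z ~= 2^32 / Y by converting 1/Y back
  // to an integer after scaling by (just under) 2^32, refine it with one
  // Newton-Raphson style step, and then correct the quotient/remainder
  // estimates with at most two conditional adjustments.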
4362
4363 // Initial estimate of inv(y).
4364 auto FloatY = B.buildUITOFP(S32, Y);
4365 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4366 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4367 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4368 auto Z = B.buildFPTOUI(S32, ScaledY);
4369
4370 // One round of UNR.
4371 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4372 auto NegYZ = B.buildMul(S32, NegY, Z);
4373 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4374
4375 // Quotient/remainder estimate.
4376 auto Q = B.buildUMulH(S32, X, Z);
4377 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4378
4379 // First quotient/remainder refinement.
4380 auto One = B.buildConstant(S32, 1);
4381 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4382 if (DstDivReg)
4383 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4384 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4385
4386 // Second quotient/remainder refinement.
4387 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4388 if (DstDivReg)
4389 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4390
4391 if (DstRemReg)
4392 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4393 }
4394
4395 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4396 //
4397 // Return lo, hi of result
4398 //
4399 // %cvt.lo = G_UITOFP Val.lo
4400 // %cvt.hi = G_UITOFP Val.hi
4401 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4402 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4403 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4404 // %mul2 = G_FMUL %mul1, 2**(-32)
4405 // %trunc = G_INTRINSIC_TRUNC %mul2
4406 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4407 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4408 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4409 Register Val) {
4410 const LLT S32 = LLT::scalar(32);
4411 auto Unmerge = B.buildUnmerge(S32, Val);
4412
4413 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4414 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4415
4416 auto Mad = B.buildFMAD(
4417 S32, CvtHi, // 2**32
4418 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4419
4420 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4421 auto Mul1 = B.buildFMul(
4422 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4423
4424 // 2**(-32)
4425 auto Mul2 = B.buildFMul(
4426 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4427 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4428
4429 // -(2**32)
4430 auto Mad2 = B.buildFMAD(
4431 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4432 Mul1);
4433
4434 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4435 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4436
4437 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4438 }
4439
4440 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4441 Register DstDivReg,
4442 Register DstRemReg,
4443 Register Numer,
4444 Register Denom) const {
4445 const LLT S32 = LLT::scalar(32);
4446 const LLT S64 = LLT::scalar(64);
4447 const LLT S1 = LLT::scalar(1);
4448 Register RcpLo, RcpHi;
4449
4450 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4451
4452 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4453
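// Refine the 64-bit reciprocal estimate with two Newton-Raphson style
// iterations (each effectively computes Rcp += umulh(Rcp, -Denom * Rcp)),
// then form the quotient estimate as umulh(Numer, Rcp) and correct it below.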
4454 auto Zero64 = B.buildConstant(S64, 0);
4455 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4456
4457 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4458 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4459
4460 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4461 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4462 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4463
4464 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4465 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4466 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4467
4468 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4469 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4470 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4471 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4472 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4473
4474 auto Zero32 = B.buildConstant(S32, 0);
4475 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4476 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4477 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4478
4479 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4480 Register NumerLo = UnmergeNumer.getReg(0);
4481 Register NumerHi = UnmergeNumer.getReg(1);
4482
4483 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4484 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4485 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4486 Register Mul3_Lo = UnmergeMul3.getReg(0);
4487 Register Mul3_Hi = UnmergeMul3.getReg(1);
4488 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4489 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4490 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4491 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4492
4493 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4494 Register DenomLo = UnmergeDenom.getReg(0);
4495 Register DenomHi = UnmergeDenom.getReg(1);
4496
4497 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4498 auto C1 = B.buildSExt(S32, CmpHi);
4499
4500 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4501 auto C2 = B.buildSExt(S32, CmpLo);
4502
4503 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4504 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4505
4506 // TODO: Here and below portions of the code can be enclosed into if/endif.
4507 // Currently control flow is unconditional and we have 4 selects after
4508 // potential endif to substitute PHIs.
4509
4510 // if C3 != 0 ...
4511 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4512 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4513 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4514 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4515
4516 auto One64 = B.buildConstant(S64, 1);
4517 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4518
4519 auto C4 =
4520 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4521 auto C5 =
4522 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4523 auto C6 = B.buildSelect(
4524 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4525
4526 // if (C6 != 0)
4527 auto Add4 = B.buildAdd(S64, Add3, One64);
4528 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4529
4530 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4531 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4532 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4533
4534 // endif C6
4535 // endif C3
4536
4537 if (DstDivReg) {
4538 auto Sel1 = B.buildSelect(
4539 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4540 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4541 Sel1, MulHi3);
4542 }
4543
4544 if (DstRemReg) {
4545 auto Sel2 = B.buildSelect(
4546 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4547 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4548 Sel2, Sub1);
4549 }
4550 }
4551
4552 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4553 MachineRegisterInfo &MRI,
4554 MachineIRBuilder &B) const {
4555 Register DstDivReg, DstRemReg;
4556 switch (MI.getOpcode()) {
4557 default:
4558 llvm_unreachable("Unexpected opcode!");
4559 case AMDGPU::G_UDIV: {
4560 DstDivReg = MI.getOperand(0).getReg();
4561 break;
4562 }
4563 case AMDGPU::G_UREM: {
4564 DstRemReg = MI.getOperand(0).getReg();
4565 break;
4566 }
4567 case AMDGPU::G_UDIVREM: {
4568 DstDivReg = MI.getOperand(0).getReg();
4569 DstRemReg = MI.getOperand(1).getReg();
4570 break;
4571 }
4572 }
4573
4574 const LLT S64 = LLT::scalar(64);
4575 const LLT S32 = LLT::scalar(32);
4576 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4577 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4578 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4579 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4580
4581 if (Ty == S32)
4582 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4583 else if (Ty == S64)
4584 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4585 else
4586 return false;
4587
4588 MI.eraseFromParent();
4589 return true;
4590 }
4591
4592 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4593 MachineRegisterInfo &MRI,
4594 MachineIRBuilder &B) const {
4595 const LLT S64 = LLT::scalar(64);
4596 const LLT S32 = LLT::scalar(32);
4597
4598 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4599 if (Ty != S32 && Ty != S64)
4600 return false;
4601
4602 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4603 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4604 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4605
4606 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4607 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4608 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4609
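  // Take absolute values using |x| = (x + sign) ^ sign, where
  // sign = x >> (bits - 1) is 0 or -1. For example, with x = -5 (s32):
  // sign = -1, x + sign = -6, and -6 ^ -1 = 5.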
4610 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4611 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4612
4613 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4614 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4615
4616 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4617 switch (MI.getOpcode()) {
4618 default:
4619 llvm_unreachable("Unexpected opcode!");
4620 case AMDGPU::G_SDIV: {
4621 DstDivReg = MI.getOperand(0).getReg();
4622 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4623 break;
4624 }
4625 case AMDGPU::G_SREM: {
4626 DstRemReg = MI.getOperand(0).getReg();
4627 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4628 break;
4629 }
4630 case AMDGPU::G_SDIVREM: {
4631 DstDivReg = MI.getOperand(0).getReg();
4632 DstRemReg = MI.getOperand(1).getReg();
4633 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4634 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4635 break;
4636 }
4637 }
4638
4639 if (Ty == S32)
4640 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4641 else
4642 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4643
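  // Restore the signs: the quotient is negative iff exactly one operand was
  // negative (Sign = LHSign ^ RHSign), and the remainder takes the sign of
  // the dividend. The negation is applied as (v ^ Sign) - Sign, the inverse
  // of the absolute-value transform above.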
4644 if (DstDivReg) {
4645 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4646 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4647 B.buildSub(DstDivReg, SignXor, Sign);
4648 }
4649
4650 if (DstRemReg) {
4651 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4652 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4653 B.buildSub(DstRemReg, SignXor, Sign);
4654 }
4655
4656 MI.eraseFromParent();
4657 return true;
4658 }
4659
4660 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4661 MachineRegisterInfo &MRI,
4662 MachineIRBuilder &B) const {
4663 Register Res = MI.getOperand(0).getReg();
4664 Register LHS = MI.getOperand(1).getReg();
4665 Register RHS = MI.getOperand(2).getReg();
4666 uint16_t Flags = MI.getFlags();
4667 LLT ResTy = MRI.getType(Res);
4668
4669 const MachineFunction &MF = B.getMF();
4670 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4671 MF.getTarget().Options.UnsafeFPMath;
4672
4673 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4674 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4675 return false;
4676
4677 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4678     // the CI documentation, have a worst case error of 1 ulp.
4679 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4680 // use it as long as we aren't trying to use denormals.
4681 //
4682     // v_rcp_f16 and v_rsq_f16 DO support denormals, with a worst case error of 0.51 ulp.
4683
4684 // 1 / x -> RCP(x)
4685 if (CLHS->isExactlyValue(1.0)) {
4686 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4687 .addUse(RHS)
4688 .setMIFlags(Flags);
4689
4690 MI.eraseFromParent();
4691 return true;
4692 }
4693
4694 // -1 / x -> RCP( FNEG(x) )
4695 if (CLHS->isExactlyValue(-1.0)) {
4696 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4697 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4698 .addUse(FNeg.getReg(0))
4699 .setMIFlags(Flags);
4700
4701 MI.eraseFromParent();
4702 return true;
4703 }
4704 }
4705
4706 // For f16 require afn or arcp.
4707 // For f32 require afn.
4708 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4709 !MI.getFlag(MachineInstr::FmArcp)))
4710 return false;
4711
4712 // x / y -> x * (1.0 / y)
4713 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4714 .addUse(RHS)
4715 .setMIFlags(Flags);
4716 B.buildFMul(Res, LHS, RCP, Flags);
4717
4718 MI.eraseFromParent();
4719 return true;
4720 }
4721
4722 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4723 MachineRegisterInfo &MRI,
4724 MachineIRBuilder &B) const {
4725 Register Res = MI.getOperand(0).getReg();
4726 Register X = MI.getOperand(1).getReg();
4727 Register Y = MI.getOperand(2).getReg();
4728 uint16_t Flags = MI.getFlags();
4729 LLT ResTy = MRI.getType(Res);
4730
4731 const MachineFunction &MF = B.getMF();
4732 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4733 MI.getFlag(MachineInstr::FmAfn);
4734
4735 if (!AllowInaccurateRcp)
4736 return false;
4737
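  // Refine the hardware rcp estimate with two Newton-Raphson steps
  // (r <- r + r * (1 - y*r), i.e. r <- r * (2 - y*r)), multiply by x, and
  // apply one final FMA correction of the residual x - y*ret.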
4738 auto NegY = B.buildFNeg(ResTy, Y);
4739 auto One = B.buildFConstant(ResTy, 1.0);
4740
4741 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4742 .addUse(Y)
4743 .setMIFlags(Flags);
4744
4745 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4746 R = B.buildFMA(ResTy, Tmp0, R, R);
4747
4748 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4749 R = B.buildFMA(ResTy, Tmp1, R, R);
4750
4751 auto Ret = B.buildFMul(ResTy, X, R);
4752 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4753
4754 B.buildFMA(Res, Tmp2, R, Ret);
4755 MI.eraseFromParent();
4756 return true;
4757 }
4758
4759 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4760 MachineRegisterInfo &MRI,
4761 MachineIRBuilder &B) const {
4762 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4763 return true;
4764
4765 Register Res = MI.getOperand(0).getReg();
4766 Register LHS = MI.getOperand(1).getReg();
4767 Register RHS = MI.getOperand(2).getReg();
4768
4769 uint16_t Flags = MI.getFlags();
4770
4771 LLT S16 = LLT::scalar(16);
4772 LLT S32 = LLT::scalar(32);
4773
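  // Compute the quotient at f32 precision (fpext + rcp + mul + fptrunc) and
  // let div_fixup handle the special-case inputs on the original f16
  // operands.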
4774 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4775 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4776
4777 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4778 .addUse(RHSExt.getReg(0))
4779 .setMIFlags(Flags);
4780
4781 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4782 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4783
4784 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4785 .addUse(RDst.getReg(0))
4786 .addUse(RHS)
4787 .addUse(LHS)
4788 .setMIFlags(Flags);
4789
4790 MI.eraseFromParent();
4791 return true;
4792 }
4793
4794 static const unsigned SPDenormModeBitField =
4795 AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
4796 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
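// This encodes hwreg(HW_REG_MODE, 4, 2): the two FP32 denormal-control bits of
// the MODE register, selected by offset 4 and width 2 (stored as width - 1).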
4797
4798 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4799 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4800 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4801 const GCNSubtarget &ST,
4802 SIModeRegisterDefaults Mode) {
4803 // Set SP denorm mode to this value.
4804 unsigned SPDenormMode =
4805 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4806
4807 if (ST.hasDenormModeInst()) {
4808 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4809 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4810
4811 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4812 B.buildInstr(AMDGPU::S_DENORM_MODE)
4813 .addImm(NewDenormModeValue);
4814
4815 } else {
4816 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4817 .addImm(SPDenormMode)
4818 .addImm(SPDenormModeBitField);
4819 }
4820 }
4821
4822 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4823 MachineRegisterInfo &MRI,
4824 MachineIRBuilder &B) const {
4825 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4826 return true;
4827
4828 Register Res = MI.getOperand(0).getReg();
4829 Register LHS = MI.getOperand(1).getReg();
4830 Register RHS = MI.getOperand(2).getReg();
4831 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4832 SIModeRegisterDefaults Mode = MFI->getMode();
4833
4834 uint16_t Flags = MI.getFlags();
4835
4836 LLT S32 = LLT::scalar(32);
4837 LLT S1 = LLT::scalar(1);
4838
4839 auto One = B.buildFConstant(S32, 1.0f);
4840
4841 auto DenominatorScaled =
4842 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4843 .addUse(LHS)
4844 .addUse(RHS)
4845 .addImm(0)
4846 .setMIFlags(Flags);
4847 auto NumeratorScaled =
4848 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4849 .addUse(LHS)
4850 .addUse(RHS)
4851 .addImm(1)
4852 .setMIFlags(Flags);
4853
4854 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4855 .addUse(DenominatorScaled.getReg(0))
4856 .setMIFlags(Flags);
4857 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4858
4859 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4860 const bool HasDynamicDenormals =
4861 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4862 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4863
4864 Register SavedSPDenormMode;
4865 if (!PreservesDenormals) {
4866 if (HasDynamicDenormals) {
4867 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4868 B.buildInstr(AMDGPU::S_GETREG_B32)
4869 .addDef(SavedSPDenormMode)
4870 .addImm(SPDenormModeBitField);
4871 }
4872 toggleSPDenormMode(true, B, ST, Mode);
4873 }
4874
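  // Newton-Raphson refinement on the scaled operands: Fma0/Fma1 refine the
  // reciprocal estimate, Mul/Fma2/Fma3 refine the quotient, and Fma4 is the
  // final residual fed to div_fmas below; div_fixup then handles the special
  // cases and undoes the div_scale scaling.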
4875 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4876 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4877 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4878 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4879 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4880 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4881
4882 if (!PreservesDenormals) {
4883 if (HasDynamicDenormals) {
4884 assert(SavedSPDenormMode);
4885 B.buildInstr(AMDGPU::S_SETREG_B32)
4886 .addReg(SavedSPDenormMode)
4887 .addImm(SPDenormModeBitField);
4888 } else
4889 toggleSPDenormMode(false, B, ST, Mode);
4890 }
4891
4892 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4893 .addUse(Fma4.getReg(0))
4894 .addUse(Fma1.getReg(0))
4895 .addUse(Fma3.getReg(0))
4896 .addUse(NumeratorScaled.getReg(1))
4897 .setMIFlags(Flags);
4898
4899 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4900 .addUse(Fmas.getReg(0))
4901 .addUse(RHS)
4902 .addUse(LHS)
4903 .setMIFlags(Flags);
4904
4905 MI.eraseFromParent();
4906 return true;
4907 }
4908
4909 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4910 MachineRegisterInfo &MRI,
4911 MachineIRBuilder &B) const {
4912 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4913 return true;
4914
4915 Register Res = MI.getOperand(0).getReg();
4916 Register LHS = MI.getOperand(1).getReg();
4917 Register RHS = MI.getOperand(2).getReg();
4918
4919 uint16_t Flags = MI.getFlags();
4920
4921 LLT S64 = LLT::scalar(64);
4922 LLT S1 = LLT::scalar(1);
4923
4924 auto One = B.buildFConstant(S64, 1.0);
4925
4926 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4927 .addUse(LHS)
4928 .addUse(RHS)
4929 .addImm(0)
4930 .setMIFlags(Flags);
4931
4932 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4933
4934 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4935 .addUse(DivScale0.getReg(0))
4936 .setMIFlags(Flags);
4937
4938 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4939 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4940 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4941
4942 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4943 .addUse(LHS)
4944 .addUse(RHS)
4945 .addImm(1)
4946 .setMIFlags(Flags);
4947
4948 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4949 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4950 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4951
4952 Register Scale;
4953 if (!ST.hasUsableDivScaleConditionOutput()) {
4954     // Work around a hardware bug on SI where the condition output from div_scale
4955 // is not usable.
4956
4957 LLT S32 = LLT::scalar(32);
4958
4959 auto NumUnmerge = B.buildUnmerge(S32, LHS);
4960 auto DenUnmerge = B.buildUnmerge(S32, RHS);
4961 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4962 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4963
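    // Recompute the condition div_scale would have produced: compare the high
    // halves of the original operands against the corresponding div_scale
    // results and combine the two comparisons with xor.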
4964 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4965 Scale1Unmerge.getReg(1));
4966 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4967 Scale0Unmerge.getReg(1));
4968 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4969 } else {
4970 Scale = DivScale1.getReg(1);
4971 }
4972
4973 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
4974 .addUse(Fma4.getReg(0))
4975 .addUse(Fma3.getReg(0))
4976 .addUse(Mul.getReg(0))
4977 .addUse(Scale)
4978 .setMIFlags(Flags);
4979
4980 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
4981 .addUse(Fmas.getReg(0))
4982 .addUse(RHS)
4983 .addUse(LHS)
4984 .setMIFlags(Flags);
4985
4986 MI.eraseFromParent();
4987 return true;
4988 }
4989
4990 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
4991 MachineRegisterInfo &MRI,
4992 MachineIRBuilder &B) const {
4993 Register Res0 = MI.getOperand(0).getReg();
4994 Register Res1 = MI.getOperand(1).getReg();
4995 Register Val = MI.getOperand(2).getReg();
4996 uint16_t Flags = MI.getFlags();
4997
4998 LLT Ty = MRI.getType(Res0);
4999 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5000
5001 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5002 .addUse(Val)
5003 .setMIFlags(Flags);
5004 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5005 .addUse(Val)
5006 .setMIFlags(Flags);
5007
5008 if (ST.hasFractBug()) {
5009 auto Fabs = B.buildFAbs(Ty, Val);
5010 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5011 auto IsFinite =
5012 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5013 auto Zero = B.buildConstant(InstrExpTy, 0);
5014 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5015 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5016 }
5017
5018 B.buildCopy(Res0, Mant);
5019 B.buildSExtOrTrunc(Res1, Exp);
5020
5021 MI.eraseFromParent();
5022 return true;
5023 }
5024
5025 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5026 MachineRegisterInfo &MRI,
5027 MachineIRBuilder &B) const {
5028 Register Res = MI.getOperand(0).getReg();
5029 Register LHS = MI.getOperand(2).getReg();
5030 Register RHS = MI.getOperand(3).getReg();
5031 uint16_t Flags = MI.getFlags();
5032
5033 LLT S32 = LLT::scalar(32);
5034 LLT S1 = LLT::scalar(1);
5035
5036 auto Abs = B.buildFAbs(S32, RHS, Flags);
5037 const APFloat C0Val(1.0f);
5038
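  // If |RHS| exceeds 2^96, pre-scale the denominator by 2^-32 so that its
  // reciprocal does not land in the denormal range; the same scale factor is
  // applied to the final product, leaving the overall quotient unchanged.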
5039 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5040 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5041 auto C2 = B.buildFConstant(S32, 1.0f);
5042
5043 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5044 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5045
5046 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5047
5048 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5049 .addUse(Mul0.getReg(0))
5050 .setMIFlags(Flags);
5051
5052 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5053
5054 B.buildFMul(Res, Sel, Mul1, Flags);
5055
5056 MI.eraseFromParent();
5057 return true;
5058 }
5059
5060 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5061 MachineRegisterInfo &MRI,
5062 MachineIRBuilder &B) const {
5063   // Bypass the correct expansion that a standard promotion through G_FSQRT
5064   // would get. The f32 op is accurate enough for the f16 case.
5065 unsigned Flags = MI.getFlags();
5066 assert(!ST.has16BitInsts());
5067 const LLT F32 = LLT::scalar(32);
5068 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5069 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5070 .addUse(Ext.getReg(0))
5071 .setMIFlags(Flags);
5072 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5073 MI.eraseFromParent();
5074 return true;
5075 }
5076
5077 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5078 MachineRegisterInfo &MRI,
5079 MachineIRBuilder &B) const {
5080 MachineFunction &MF = B.getMF();
5081 Register Dst = MI.getOperand(0).getReg();
5082 Register X = MI.getOperand(1).getReg();
5083 const unsigned Flags = MI.getFlags();
5084 const LLT S1 = LLT::scalar(1);
5085 const LLT F32 = LLT::scalar(32);
5086 const LLT I32 = LLT::scalar(32);
5087
5088 if (allowApproxFunc(MF, Flags)) {
5089 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5090 .addUse(X)
5091 .setMIFlags(Flags);
5092 MI.eraseFromParent();
5093 return true;
5094 }
5095
5096 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5097 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5098 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5099 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5100 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5101
5102 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5103 if (needsDenormHandlingF32(MF, X, Flags)) {
5104 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5105 .addUse(SqrtX.getReg(0))
5106 .setMIFlags(Flags);
5107
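    // Refine the hardware sqrt result toward the correctly rounded value:
    // form the neighbouring values one ulp down and one ulp up (integer +/-1
    // on the bit pattern) and adjust the result based on the signs of the
    // residuals SqrtX - SqrtSNextDown*SqrtS and SqrtX - SqrtSNextUp*SqrtS.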
5108 auto NegOne = B.buildConstant(I32, -1);
5109 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5110
5111 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5112 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5113
5114 auto PosOne = B.buildConstant(I32, 1);
5115 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5116
5117 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5118 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5119
5120 auto Zero = B.buildFConstant(F32, 0.0f);
5121 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5122
5123 SqrtS =
5124 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5125
5126 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5127 SqrtS =
5128 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5129 } else {
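    // Without denormal concerns, start from the hardware rsq estimate:
    // s0 = x * rsq(x), h0 = 0.5 * rsq(x), then apply one Goldschmidt-style
    // refinement step followed by a final FMA correction of the residual
    // d = x - s*s.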
5130 auto SqrtR =
5131 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5132 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5133
5134 auto Half = B.buildFConstant(F32, 0.5f);
5135 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5136 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5137 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5138 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5139 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5140 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5141 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5142 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5143 }
5144
5145 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5146
5147 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5148
5149 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5150
5151 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5152 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5153
5154 MI.eraseFromParent();
5155 return true;
5156 }
5157
5158 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5159 MachineRegisterInfo &MRI,
5160 MachineIRBuilder &B) const {
5161   // For double type, the SQRT and RSQ instructions don't have the required
5162   // precision, so we apply Goldschmidt's algorithm to improve the result:
5163 //
5164 // y0 = rsq(x)
5165 // g0 = x * y0
5166 // h0 = 0.5 * y0
5167 //
5168 // r0 = 0.5 - h0 * g0
5169 // g1 = g0 * r0 + g0
5170 // h1 = h0 * r0 + h0
5171 //
5172 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5173 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5174 // h2 = h1 * r1 + h1
5175 //
5176 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5177 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5178 //
5179 // sqrt(x) = g3
5180
5181 const LLT S1 = LLT::scalar(1);
5182 const LLT S32 = LLT::scalar(32);
5183 const LLT F64 = LLT::scalar(64);
5184
5185 Register Dst = MI.getOperand(0).getReg();
5186 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5187
5188 Register X = MI.getOperand(1).getReg();
5189 unsigned Flags = MI.getFlags();
5190
5191 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5192
5193 auto ZeroInt = B.buildConstant(S32, 0);
5194 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5195
5196 // Scale up input if it is too small.
5197 auto ScaleUpFactor = B.buildConstant(S32, 256);
5198 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5199 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5200
5201 auto SqrtY =
5202 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5203
5204 auto Half = B.buildFConstant(F64, 0.5);
5205 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5206 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5207
5208 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5209 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5210
5211 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5212 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5213
5214 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5215 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5216
5217 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5218
5219 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5220 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5221
5222 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5223
5224 // Scale down the result.
5225 auto ScaleDownFactor = B.buildConstant(S32, -128);
5226 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5227 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5228
5229 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5230 // with finite only or nsz because rsq(+/-0) = +/-inf
5231
5232 // TODO: Check for DAZ and expand to subnormals
5233 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5234
5235 // If x is +INF, +0, or -0, use its original value
5236 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5237
5238 MI.eraseFromParent();
5239 return true;
5240 }
5241
5242 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5243 MachineRegisterInfo &MRI,
5244 MachineIRBuilder &B) const {
5245 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5246 if (Ty == LLT::scalar(32))
5247 return legalizeFSQRTF32(MI, MRI, B);
5248 if (Ty == LLT::scalar(64))
5249 return legalizeFSQRTF64(MI, MRI, B);
5250 if (Ty == LLT::scalar(16))
5251 return legalizeFSQRTF16(MI, MRI, B);
5252 return false;
5253 }
5254
5255 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5256 // FIXME: Why do we handle this one but not other removed instructions?
5257 //
5258 // Reciprocal square root. The clamp prevents infinite results, clamping
5259 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5260 // +-max_float.
5261 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5262 MachineRegisterInfo &MRI,
5263 MachineIRBuilder &B) const {
5264 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5265 return true;
5266
5267 Register Dst = MI.getOperand(0).getReg();
5268 Register Src = MI.getOperand(2).getReg();
5269 auto Flags = MI.getFlags();
5270
5271 LLT Ty = MRI.getType(Dst);
5272
5273 const fltSemantics *FltSemantics;
5274 if (Ty == LLT::scalar(32))
5275 FltSemantics = &APFloat::IEEEsingle();
5276 else if (Ty == LLT::scalar(64))
5277 FltSemantics = &APFloat::IEEEdouble();
5278 else
5279 return false;
5280
5281 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5282 .addUse(Src)
5283 .setMIFlags(Flags);
5284
5285   // We don't need to concern ourselves with the snan handling difference,
5286   // since the rsq already quieted (or not) the input; use the form which will directly select.
5287 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5288 const bool UseIEEE = MFI->getMode().IEEE;
5289
5290 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5291 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5292 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5293
5294 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5295
5296 if (UseIEEE)
5297 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5298 else
5299 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5300 MI.eraseFromParent();
5301 return true;
5302 }
5303
5304 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
5305 switch (IID) {
5306 case Intrinsic::amdgcn_ds_fadd:
5307 return AMDGPU::G_ATOMICRMW_FADD;
5308 case Intrinsic::amdgcn_ds_fmin:
5309 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5310 case Intrinsic::amdgcn_ds_fmax:
5311 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5312 default:
5313 llvm_unreachable("not a DS FP intrinsic");
5314 }
5315 }
5316
5317 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
5318 MachineInstr &MI,
5319 Intrinsic::ID IID) const {
5320 GISelChangeObserver &Observer = Helper.Observer;
5321 Observer.changingInstr(MI);
5322
5323 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
5324
5325 // The remaining operands were used to set fields in the MemOperand on
5326 // construction.
5327 for (int I = 6; I > 3; --I)
5328 MI.removeOperand(I);
5329
5330 MI.removeOperand(1); // Remove the intrinsic ID.
5331 Observer.changedInstr(MI);
5332 return true;
5333 }
5334
5335 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5336 MachineRegisterInfo &MRI,
5337 MachineIRBuilder &B) const {
5338 uint64_t Offset =
5339 ST.getTargetLowering()->getImplicitParameterOffset(
5340 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5341 LLT DstTy = MRI.getType(DstReg);
5342 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5343
5344 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5345 if (!loadInputValue(KernargPtrReg, B,
5346 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5347 return false;
5348
5349 // FIXME: This should be nuw
5350 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5351 return true;
5352 }
5353
5354 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5355 /// bits of the pointer and replace them with the stride argument, then
5356 /// merge_values everything together. In the common case of a raw buffer (the
5357 /// stride component is 0), we can just AND off the upper half.
5358 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5359 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5360 Register Result = MI.getOperand(0).getReg();
5361 Register Pointer = MI.getOperand(2).getReg();
5362 Register Stride = MI.getOperand(3).getReg();
5363 Register NumRecords = MI.getOperand(4).getReg();
5364 Register Flags = MI.getOperand(5).getReg();
5365
5366 LLT S32 = LLT::scalar(32);
5367
5368 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5369 auto Unmerge = B.buildUnmerge(S32, Pointer);
5370 Register LowHalf = Unmerge.getReg(0);
5371 Register HighHalf = Unmerge.getReg(1);
5372
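  // Keep only the low 16 bits of the pointer's high dword (address bits
  // 32..47); the stride argument is inserted into the upper 16 bits of that
  // dword below.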
5373 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5374 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5375
5376 MachineInstrBuilder NewHighHalf = Masked;
5377 std::optional<ValueAndVReg> StrideConst =
5378 getIConstantVRegValWithLookThrough(Stride, MRI);
5379 if (!StrideConst || !StrideConst->Value.isZero()) {
5380 MachineInstrBuilder ShiftedStride;
5381 if (StrideConst) {
5382 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5383 uint32_t ShiftedStrideVal = StrideVal << 16;
5384 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5385 } else {
5386 auto ExtStride = B.buildAnyExt(S32, Stride);
5387 auto ShiftConst = B.buildConstant(S32, 16);
5388 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5389 }
5390 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5391 }
5392 Register NewHighHalfReg = NewHighHalf.getReg(0);
5393 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5394 MI.eraseFromParent();
5395 return true;
5396 }
5397
5398 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5399 MachineRegisterInfo &MRI,
5400 MachineIRBuilder &B) const {
5401 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5402 if (!MFI->isEntryFunction()) {
5403 return legalizePreloadedArgIntrin(MI, MRI, B,
5404 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5405 }
5406
5407 Register DstReg = MI.getOperand(0).getReg();
5408 if (!getImplicitArgPtr(DstReg, MRI, B))
5409 return false;
5410
5411 MI.eraseFromParent();
5412 return true;
5413 }
5414
5415 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5416 MachineRegisterInfo &MRI,
5417 MachineIRBuilder &B) const {
5418 Function &F = B.getMF().getFunction();
5419 std::optional<uint32_t> KnownSize =
5420 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5421 if (KnownSize.has_value())
5422 B.buildConstant(DstReg, *KnownSize);
5423 return false;
5424 }
5425
5426 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5427 MachineRegisterInfo &MRI,
5428 MachineIRBuilder &B) const {
5429
5430 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5431 if (!MFI->isEntryFunction()) {
5432 return legalizePreloadedArgIntrin(MI, MRI, B,
5433 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5434 }
5435
5436 Register DstReg = MI.getOperand(0).getReg();
5437 if (!getLDSKernelId(DstReg, MRI, B))
5438 return false;
5439
5440 MI.eraseFromParent();
5441 return true;
5442 }
5443
5444 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5445 MachineRegisterInfo &MRI,
5446 MachineIRBuilder &B,
5447 unsigned AddrSpace) const {
5448 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5449 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5450 Register Hi32 = Unmerge.getReg(1);
5451
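  // A generic (flat) pointer belongs to the queried address space iff its
  // high 32 bits equal that segment's aperture base.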
5452 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5453 MI.eraseFromParent();
5454 return true;
5455 }
5456
5457 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5458 // offset (the offset that is included in bounds checking and swizzling, to be
5459 // split between the instruction's voffset and immoffset fields) and soffset
5460 // (the offset that is excluded from bounds checking and swizzling, to go in
5461 // the instruction's soffset field). This function takes the first kind of
5462 // offset and figures out how to split it between voffset and immoffset.
5463 std::pair<Register, unsigned>
5464 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5465 Register OrigOffset) const {
5466 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5467 Register BaseReg;
5468 unsigned ImmOffset;
5469 const LLT S32 = LLT::scalar(32);
5470 MachineRegisterInfo &MRI = *B.getMRI();
5471
5472 std::tie(BaseReg, ImmOffset) =
5473 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5474
5475 // If BaseReg is a pointer, convert it to int.
5476 if (MRI.getType(BaseReg).isPointer())
5477 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5478
5479 // If the immediate value is too big for the immoffset field, put only bits
5480 // that would normally fit in the immoffset field. The remaining value that
5481 // is copied/added for the voffset field is a large power of 2, and it
5482 // stands more chance of being CSEd with the copy/add for another similar
5483 // load/store.
5484 // However, do not do that rounding down if that is a negative
5485 // number, as it appears to be illegal to have a negative offset in the
5486 // vgpr, even if adding the immediate offset makes it positive.
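  // For example, with MaxImm = 4095 and a total constant offset of 8204
  // (= 8192 + 12), ImmOffset keeps 12 and the remaining 8192 is added into
  // the voffset register.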
5487 unsigned Overflow = ImmOffset & ~MaxImm;
5488 ImmOffset -= Overflow;
5489 if ((int32_t)Overflow < 0) {
5490 Overflow += ImmOffset;
5491 ImmOffset = 0;
5492 }
5493
5494 if (Overflow != 0) {
5495 if (!BaseReg) {
5496 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5497 } else {
5498 auto OverflowVal = B.buildConstant(S32, Overflow);
5499 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5500 }
5501 }
5502
5503 if (!BaseReg)
5504 BaseReg = B.buildConstant(S32, 0).getReg(0);
5505
5506 return std::pair(BaseReg, ImmOffset);
5507 }
5508
5509 /// Handle register layout difference for f16 images for some subtargets.
5510 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5511 MachineRegisterInfo &MRI,
5512 Register Reg,
5513 bool ImageStore) const {
5514 const LLT S16 = LLT::scalar(16);
5515 const LLT S32 = LLT::scalar(32);
5516 LLT StoreVT = MRI.getType(Reg);
5517 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5518
5519 if (ST.hasUnpackedD16VMem()) {
5520 auto Unmerge = B.buildUnmerge(S16, Reg);
5521
5522 SmallVector<Register, 4> WideRegs;
5523 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5524 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5525
5526 int NumElts = StoreVT.getNumElements();
5527
5528 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5529 .getReg(0);
5530 }
5531
5532 if (ImageStore && ST.hasImageStoreD16Bug()) {
5533 if (StoreVT.getNumElements() == 2) {
5534 SmallVector<Register, 4> PackedRegs;
5535 Reg = B.buildBitcast(S32, Reg).getReg(0);
5536 PackedRegs.push_back(Reg);
5537 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5538 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5539 .getReg(0);
5540 }
5541
5542 if (StoreVT.getNumElements() == 3) {
5543 SmallVector<Register, 4> PackedRegs;
5544 auto Unmerge = B.buildUnmerge(S16, Reg);
5545 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5546 PackedRegs.push_back(Unmerge.getReg(I));
5547 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5548 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5549 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5550 }
5551
5552 if (StoreVT.getNumElements() == 4) {
5553 SmallVector<Register, 4> PackedRegs;
5554 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5555 auto Unmerge = B.buildUnmerge(S32, Reg);
5556 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5557 PackedRegs.push_back(Unmerge.getReg(I));
5558 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5559 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5560 .getReg(0);
5561 }
5562
5563 llvm_unreachable("invalid data type");
5564 }
5565
5566 if (StoreVT == LLT::fixed_vector(3, S16)) {
5567 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5568 .getReg(0);
5569 }
5570 return Reg;
5571 }
5572
5573 Register AMDGPULegalizerInfo::fixStoreSourceType(
5574 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5575 MachineRegisterInfo *MRI = B.getMRI();
5576 LLT Ty = MRI->getType(VData);
5577
5578 const LLT S16 = LLT::scalar(16);
5579
5580   // Fixup buffer resources themselves needing to be v4i32.
5581 if (hasBufferRsrcWorkaround(Ty))
5582 return castBufferRsrcToV4I32(VData, B);
5583
5584 // Fixup illegal register types for i8 stores.
5585 if (Ty == LLT::scalar(8) || Ty == S16) {
5586 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5587 return AnyExt;
5588 }
5589
5590 if (Ty.isVector()) {
5591 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5592 if (IsFormat)
5593 return handleD16VData(B, *MRI, VData);
5594 }
5595 }
5596
5597 return VData;
5598 }
5599
5600 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5601 MachineRegisterInfo &MRI,
5602 MachineIRBuilder &B,
5603 bool IsTyped,
5604 bool IsFormat) const {
5605 Register VData = MI.getOperand(1).getReg();
5606 LLT Ty = MRI.getType(VData);
5607 LLT EltTy = Ty.getScalarType();
5608 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5609 const LLT S32 = LLT::scalar(32);
5610
5611 VData = fixStoreSourceType(B, VData, IsFormat);
5612 castBufferRsrcArgToV4I32(MI, B, 2);
5613 Register RSrc = MI.getOperand(2).getReg();
5614
5615 MachineMemOperand *MMO = *MI.memoperands_begin();
5616 const int MemSize = MMO->getSize();
5617
5618 unsigned ImmOffset;
5619
5620 // The typed intrinsics add an immediate after the registers.
5621 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5622
5623 // The struct intrinsic variants add one additional operand over raw.
5624 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5625 Register VIndex;
5626 int OpOffset = 0;
5627 if (HasVIndex) {
5628 VIndex = MI.getOperand(3).getReg();
5629 OpOffset = 1;
5630 } else {
5631 VIndex = B.buildConstant(S32, 0).getReg(0);
5632 }
5633
5634 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5635 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5636
5637 unsigned Format = 0;
5638 if (IsTyped) {
5639 Format = MI.getOperand(5 + OpOffset).getImm();
5640 ++OpOffset;
5641 }
5642
5643 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5644
5645 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5646
5647 unsigned Opc;
5648 if (IsTyped) {
5649 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5650 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5651 } else if (IsFormat) {
5652 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5653 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5654 } else {
5655 switch (MemSize) {
5656 case 1:
5657 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5658 break;
5659 case 2:
5660 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5661 break;
5662 default:
5663 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5664 break;
5665 }
5666 }
5667
5668 auto MIB = B.buildInstr(Opc)
5669 .addUse(VData) // vdata
5670 .addUse(RSrc) // rsrc
5671 .addUse(VIndex) // vindex
5672 .addUse(VOffset) // voffset
5673 .addUse(SOffset) // soffset
5674 .addImm(ImmOffset); // offset(imm)
5675
5676 if (IsTyped)
5677 MIB.addImm(Format);
5678
5679 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5680 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5681 .addMemOperand(MMO);
5682
5683 MI.eraseFromParent();
5684 return true;
5685 }
5686
5687 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5688 Register VIndex, Register VOffset, Register SOffset,
5689 unsigned ImmOffset, unsigned Format,
5690 unsigned AuxiliaryData, MachineMemOperand *MMO,
5691 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5692 auto MIB = B.buildInstr(Opc)
5693 .addDef(LoadDstReg) // vdata
5694 .addUse(RSrc) // rsrc
5695 .addUse(VIndex) // vindex
5696 .addUse(VOffset) // voffset
5697 .addUse(SOffset) // soffset
5698 .addImm(ImmOffset); // offset(imm)
5699
5700 if (IsTyped)
5701 MIB.addImm(Format);
5702
5703 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5704 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5705 .addMemOperand(MMO);
5706 }
5707
5708 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5709 MachineRegisterInfo &MRI,
5710 MachineIRBuilder &B,
5711 bool IsFormat,
5712 bool IsTyped) const {
5713 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5714 MachineMemOperand *MMO = *MI.memoperands_begin();
5715 const LLT MemTy = MMO->getMemoryType();
5716 const LLT S32 = LLT::scalar(32);
5717
5718 Register Dst = MI.getOperand(0).getReg();
5719
5720 Register StatusDst;
5721 int OpOffset = 0;
5722 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5723 bool IsTFE = MI.getNumExplicitDefs() == 2;
5724 if (IsTFE) {
5725 StatusDst = MI.getOperand(1).getReg();
5726 ++OpOffset;
5727 }
5728
5729 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5730 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5731
5732 // The typed intrinsics add an immediate after the registers.
5733 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5734
5735 // The struct intrinsic variants add one additional operand over raw.
5736 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5737 Register VIndex;
5738 if (HasVIndex) {
5739 VIndex = MI.getOperand(3 + OpOffset).getReg();
5740 ++OpOffset;
5741 } else {
5742 VIndex = B.buildConstant(S32, 0).getReg(0);
5743 }
5744
5745 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5746 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5747
5748 unsigned Format = 0;
5749 if (IsTyped) {
5750 Format = MI.getOperand(5 + OpOffset).getImm();
5751 ++OpOffset;
5752 }
5753
5754 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5755 unsigned ImmOffset;
5756
5757 LLT Ty = MRI.getType(Dst);
5758   // Make addrspace 8 pointer loads into 4 x s32 loads here, so the rest of the
5759 // logic doesn't have to handle that case.
5760 if (hasBufferRsrcWorkaround(Ty)) {
5761 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5762 Dst = MI.getOperand(0).getReg();
5763 }
5764 LLT EltTy = Ty.getScalarType();
5765 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5766 const bool Unpacked = ST.hasUnpackedD16VMem();
5767
5768 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5769
5770 unsigned Opc;
5771
5772 // TODO: Support TFE for typed and narrow loads.
5773 if (IsTyped) {
5774 if (IsTFE)
5775 return false;
5776 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5777 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5778 } else if (IsFormat) {
5779 if (IsD16) {
5780 if (IsTFE)
5781 return false;
5782 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5783 } else {
5784 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5785 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5786 }
5787 } else {
5788 if (IsTFE)
5789 return false;
5790 switch (MemTy.getSizeInBits()) {
5791 case 8:
5792 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5793 break;
5794 case 16:
5795 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5796 break;
5797 default:
5798 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
5799 break;
5800 }
5801 }
5802
5803 if (IsTFE) {
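    // TFE loads return an extra status dword after the data, so load
    // NumValueDWords + 1 dwords and then split the result into the value
    // registers and the status register.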
5804 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5805 unsigned NumLoadDWords = NumValueDWords + 1;
5806 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5807 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5808 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5809 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5810 if (NumValueDWords == 1) {
5811 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5812 } else {
5813 SmallVector<Register, 5> LoadElts;
5814 for (unsigned I = 0; I != NumValueDWords; ++I)
5815 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5816 LoadElts.push_back(StatusDst);
5817 B.buildUnmerge(LoadElts, LoadDstReg);
5818 LoadElts.truncate(NumValueDWords);
5819 B.buildMergeLikeInstr(Dst, LoadElts);
5820 }
5821 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5822 (IsD16 && !Ty.isVector())) {
5823 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5824 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5825 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5826 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5827 B.buildTrunc(Dst, LoadDstReg);
5828 } else if (Unpacked && IsD16 && Ty.isVector()) {
5829 LLT UnpackedTy = Ty.changeElementSize(32);
5830 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5831 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5832 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5833 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5834 // FIXME: G_TRUNC should work, but legalization currently fails
5835 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
5836 SmallVector<Register, 4> Repack;
5837 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
5838 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5839 B.buildMergeLikeInstr(Dst, Repack);
5840 } else {
5841 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5842 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5843 }
5844
5845 MI.eraseFromParent();
5846 return true;
5847 }
5848
5849 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
5850 switch (IntrID) {
5851 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5852 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5853 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5854 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5855 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5856 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5857 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5858 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5859 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5860 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5861 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5862 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5863 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5864 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5865 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5866 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5867 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5868 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5869 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5870 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5871 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5872 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5873 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5874 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5875 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5876 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5877 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5878 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5880 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5881 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5883 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5886 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5888 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5891 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5893 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5896 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5898 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5901 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5903 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5906 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5908 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5911 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5913 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5916 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5918 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5921 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
5922 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
5923 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
5924 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5925 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5926 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5927 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5928 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5929 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5930 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5931 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5932 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5933 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5934 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
5935 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
5936 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
5937 default:
5938 llvm_unreachable("unhandled atomic opcode");
5939 }
5940 }
5941
5942 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5943 MachineIRBuilder &B,
5944 Intrinsic::ID IID) const {
5945 const bool IsCmpSwap =
5946 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
5947 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
5948 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
5949 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
5950
5951 Register Dst = MI.getOperand(0).getReg();
5952 // Since we don't have 128-bit atomics, we don't need to handle the case of
5953   // p8 arguments to the atomic itself
5954 Register VData = MI.getOperand(2).getReg();
5955
5956 Register CmpVal;
5957 int OpOffset = 0;
5958
5959 if (IsCmpSwap) {
5960 CmpVal = MI.getOperand(3).getReg();
5961 ++OpOffset;
5962 }
5963
5964 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
5965 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
5966 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
5967
5968 // The struct intrinsic variants add one additional operand over raw.
5969 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5970 Register VIndex;
5971 if (HasVIndex) {
5972 VIndex = MI.getOperand(4 + OpOffset).getReg();
5973 ++OpOffset;
5974 } else {
5975 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
5976 }
5977
5978 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
5979 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
5980 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
5981
5982 MachineMemOperand *MMO = *MI.memoperands_begin();
5983
5984 unsigned ImmOffset;
5985 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5986
5987 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
5988 .addDef(Dst)
5989 .addUse(VData); // vdata
5990
5991 if (IsCmpSwap)
5992 MIB.addReg(CmpVal);
5993
5994 MIB.addUse(RSrc) // rsrc
5995 .addUse(VIndex) // vindex
5996 .addUse(VOffset) // voffset
5997 .addUse(SOffset) // soffset
5998 .addImm(ImmOffset) // offset(imm)
5999 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6000 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6001 .addMemOperand(MMO);
6002
6003 MI.eraseFromParent();
6004 return true;
6005 }
6006
6007 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6008 /// vector with s16 typed elements.
6009 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6010 SmallVectorImpl<Register> &PackedAddrs,
6011 unsigned ArgOffset,
6012 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6013 bool IsA16, bool IsG16) {
6014 const LLT S16 = LLT::scalar(16);
6015 const LLT V2S16 = LLT::fixed_vector(2, 16);
6016 auto EndIdx = Intr->VAddrEnd;
6017
6018 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6019 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6020 if (!SrcOp.isReg())
6021 continue; // _L to _LZ may have eliminated this.
6022
6023 Register AddrReg = SrcOp.getReg();
6024
6025 if ((I < Intr->GradientStart) ||
6026 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6027 (I >= Intr->CoordStart && !IsA16)) {
6028 if ((I < Intr->GradientStart) && IsA16 &&
6029 (B.getMRI()->getType(AddrReg) == S16)) {
6030 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6031 // Special handling of bias when A16 is on. Bias is of type half but
6032         // occupies a full 32 bits.
6033 PackedAddrs.push_back(
6034 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6035 .getReg(0));
6036 } else {
6037 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6038 "Bias needs to be converted to 16 bit in A16 mode");
6039 // Handle any gradient or coordinate operands that should not be packed
6040 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6041 PackedAddrs.push_back(AddrReg);
6042 }
6043 } else {
6044 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6045 // derivatives dx/dh and dx/dv are packed with undef.
6046 if (((I + 1) >= EndIdx) ||
6047 ((Intr->NumGradients / 2) % 2 == 1 &&
6048 (I == static_cast<unsigned>(Intr->GradientStart +
6049 (Intr->NumGradients / 2) - 1) ||
6050 I == static_cast<unsigned>(Intr->GradientStart +
6051 Intr->NumGradients - 1))) ||
6052 // Check for _L to _LZ optimization
6053 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6054 PackedAddrs.push_back(
6055 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6056 .getReg(0));
6057 } else {
6058 PackedAddrs.push_back(
6059 B.buildBuildVector(
6060 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6061 .getReg(0));
6062 ++I;
6063 }
6064 }
6065 }
6066 }
6067
6068 /// Convert from separate vaddr components to a single vector address register,
6069 /// and replace the remaining operands with $noreg.
6070 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6071 int DimIdx, int NumVAddrs) {
6072 const LLT S32 = LLT::scalar(32);
6073 (void)S32;
6074 SmallVector<Register, 8> AddrRegs;
6075 for (int I = 0; I != NumVAddrs; ++I) {
6076 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6077 if (SrcOp.isReg()) {
6078 AddrRegs.push_back(SrcOp.getReg());
6079 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6080 }
6081 }
6082
6083 int NumAddrRegs = AddrRegs.size();
6084 if (NumAddrRegs != 1) {
6085 auto VAddr =
6086 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6087 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6088 }
6089
6090 for (int I = 1; I != NumVAddrs; ++I) {
6091 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6092 if (SrcOp.isReg())
6093 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6094 }
6095 }
6096
6097 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6098 ///
6099 /// Depending on the subtarget, load/store with 16-bit element data need to be
6100 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6101 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6102 /// registers.
6103 ///
6104 /// We don't want to directly select image instructions just yet, but also want
6105 /// to expose all register repacking to the legalizer/combiners. We also don't
6106 /// want a selected instruction entering RegBankSelect. In order to avoid
6107 /// defining a multitude of intermediate image instructions, directly hack on
6108 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6109 /// padding the now-unnecessary arguments with $noreg.
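/// For example, with A16 a sample with three 16-bit coordinates has them
/// packed into two <2 x s16> vaddr operands (the odd final element padded
/// with undef), and the leftover vaddr slots are replaced with $noreg.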
6110 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6111 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6112 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6113
6114 const MachineFunction &MF = *MI.getMF();
6115 const unsigned NumDefs = MI.getNumExplicitDefs();
6116 const unsigned ArgOffset = NumDefs + 1;
6117 bool IsTFE = NumDefs == 2;
6118 // We are only processing the operands of d16 image operations on subtargets
6119 // that use the unpacked register layout, or need to repack the TFE result.
6120
6121 // TODO: Do we need to guard against already legalized intrinsics?
6122 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6123 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6124
6125 MachineRegisterInfo *MRI = B.getMRI();
6126 const LLT S32 = LLT::scalar(32);
6127 const LLT S16 = LLT::scalar(16);
6128 const LLT V2S16 = LLT::fixed_vector(2, 16);
6129
6130 unsigned DMask = 0;
6131 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6132 LLT Ty = MRI->getType(VData);
6133
6134 const bool IsAtomicPacked16Bit =
6135 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6136 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6137
6138 // Check for 16 bit addresses and pack if true.
6139 LLT GradTy =
6140 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6141 LLT AddrTy =
6142 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6143 const bool IsG16 =
6144 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6145 const bool IsA16 = AddrTy == S16;
6146 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6147
6148 int DMaskLanes = 0;
6149 if (!BaseOpcode->Atomic) {
6150 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6151 if (BaseOpcode->Gather4) {
6152 DMaskLanes = 4;
6153 } else if (DMask != 0) {
6154 DMaskLanes = llvm::popcount(DMask);
6155 } else if (!IsTFE && !BaseOpcode->Store) {
6156 // If dmask is 0, this is a no-op load. This can be eliminated.
6157 B.buildUndef(MI.getOperand(0));
6158 MI.eraseFromParent();
6159 return true;
6160 }
6161 }
6162
6163 Observer.changingInstr(MI);
6164 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6165
6166 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6167 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6168 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6169 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6170 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
6171
6172 // Track that we legalized this
6173 MI.setDesc(B.getTII().get(NewOpcode));
6174
6175 // Expecting to get an error flag since TFC is on and dmask is 0. Force
6176 // dmask to be at least 1, otherwise the instruction will fail.
6177 if (IsTFE && DMask == 0) {
6178 DMask = 0x1;
6179 DMaskLanes = 1;
6180 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6181 }
6182
6183 if (BaseOpcode->Atomic) {
6184 Register VData0 = MI.getOperand(2).getReg();
6185 LLT Ty = MRI->getType(VData0);
6186
6187 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6188 if (Ty.isVector() && !IsAtomicPacked16Bit)
6189 return false;
6190
6191 if (BaseOpcode->AtomicX2) {
6192 Register VData1 = MI.getOperand(3).getReg();
6193 // The two values are packed in one register.
6194 LLT PackedTy = LLT::fixed_vector(2, Ty);
6195 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6196 MI.getOperand(2).setReg(Concat.getReg(0));
6197 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6198 }
6199 }
6200
6201 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6202
6203 // Rewrite the addressing register layout before doing anything else.
6204 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6205 // 16-bit gradients are supported, but they are tied to the A16 control,
6206 // so both gradients and addresses must be 16-bit.
6207 return false;
6208 }
6209
6210 if (IsA16 && !ST.hasA16()) {
6211 // A16 not supported
6212 return false;
6213 }
6214
6215 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6216 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6217
6218 if (IsA16 || IsG16) {
6219 // Even if NumVAddrs == 1, we should pack it into a 32-bit value, because the
6220 // instructions expect VGPR_32
6221 SmallVector<Register, 4> PackedRegs;
6222
6223 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6224
6225 // See also below in the non-a16 branch
6226 const bool UseNSA = ST.hasNSAEncoding() &&
6227 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6228 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6229 const bool UsePartialNSA =
6230 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6231
6232 if (UsePartialNSA) {
6233 // Pack registers that would go over NSAMaxSize into last VAddr register
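// E.g., if NSAMaxSize were 5 and there were 7 packed registers, the last
// three <2 x s16> registers would be concatenated into one <6 x s16> operand,
// leaving 5 vaddr operands in total.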
6234 LLT PackedAddrTy =
6235 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6236 auto Concat = B.buildConcatVectors(
6237 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6238 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6239 PackedRegs.resize(NSAMaxSize);
6240 } else if (!UseNSA && PackedRegs.size() > 1) {
6241 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6242 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6243 PackedRegs[0] = Concat.getReg(0);
6244 PackedRegs.resize(1);
6245 }
6246
6247 const unsigned NumPacked = PackedRegs.size();
6248 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6249 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6250 if (!SrcOp.isReg()) {
6251 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6252 continue;
6253 }
6254
6255 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6256
6257 if (I - Intr->VAddrStart < NumPacked)
6258 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6259 else
6260 SrcOp.setReg(AMDGPU::NoRegister);
6261 }
6262 } else {
6263 // If the register allocator cannot place the address registers contiguously
6264 // without introducing moves, then using the non-sequential address encoding
6265 // is always preferable, since it saves VALU instructions and is usually a
6266 // wash in terms of code size or even better.
6267 //
6268 // However, we currently have no way of hinting to the register allocator
6269 // that MIMG addresses should be placed contiguously when it is possible to
6270 // do so, so force non-NSA for the common 2-address case as a heuristic.
6271 //
6272 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6273 // allocation when possible.
6274 //
6275 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6276 // set of the remaining addresses.
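// E.g., if NSAMaxSize were 5 and there were 7 32-bit addresses, partial NSA
// would keep the first four vaddr operands separate and pack the remaining
// three into one <3 x s32> register.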
6277 const bool UseNSA = ST.hasNSAEncoding() &&
6278 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6279 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6280 const bool UsePartialNSA =
6281 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6282
6283 if (UsePartialNSA) {
6284 convertImageAddrToPacked(B, MI,
6285 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6286 Intr->NumVAddrs - NSAMaxSize + 1);
6287 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6288 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6289 Intr->NumVAddrs);
6290 }
6291 }
6292
6293 int Flags = 0;
6294 if (IsA16)
6295 Flags |= 1;
6296 if (IsG16)
6297 Flags |= 2;
6298 MI.addOperand(MachineOperand::CreateImm(Flags));
6299
6300 if (BaseOpcode->Store) { // No TFE for stores?
6301 // TODO: Handle dmask trim
6302 if (!Ty.isVector() || !IsD16)
6303 return true;
6304
6305 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6306 if (RepackedReg != VData) {
6307 MI.getOperand(1).setReg(RepackedReg);
6308 }
6309
6310 return true;
6311 }
6312
6313 Register DstReg = MI.getOperand(0).getReg();
6314 const LLT EltTy = Ty.getScalarType();
6315 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6316
6317 // Confirm that the return type is large enough for the dmask specified
6318 if (NumElts < DMaskLanes)
6319 return false;
6320
6321 if (NumElts > 4 || DMaskLanes > 4)
6322 return false;
6323
6324 // Image atomic instructions use DMask to specify how many bits the
6325 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6326 // DMaskLanes for image atomics defaults to '0'.
6327 // We must be sure that atomic variants (especially packed) will not be
6328 // truncated from v2s16 or v4s16 to s16 type.
6329 //
6330 // changeElementCount will be needed for image loads, where Ty is always scalar.
6331 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6332 const LLT AdjustedTy =
6333 DMaskLanes == 0
6334 ? Ty
6335 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6336
6337 // The raw dword-aligned data component of the load. The only legal cases
6338 // where this matters should be when using the packed D16 format, for
6339 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6340 LLT RoundedTy;
6341
6342 // S32 vector to cover all data, plus TFE result element.
6343 LLT TFETy;
6344
6345 // Register type to use for each loaded component. Will be S32 or V2S16.
6346 LLT RegTy;
6347
6348 if (IsD16 && ST.hasUnpackedD16VMem()) {
6349 RoundedTy =
6350 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6351 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6352 RegTy = S32;
6353 } else {
6354 unsigned EltSize = EltTy.getSizeInBits();
6355 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6356 unsigned RoundedSize = 32 * RoundedElts;
6357 RoundedTy = LLT::scalarOrVector(
6358 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6359 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6360 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6361 }
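// E.g., for a packed-D16 load with DMaskLanes = 3: AdjustedTy = <3 x s16>,
// RoundedTy = <4 x s16>, TFETy = <3 x s32>, and RegTy = V2S16 (S32 if TFE is
// in use).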
6362
6363 // The return type does not need adjustment.
6364 // TODO: Should we change s16 case to s32 or <2 x s16>?
6365 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6366 return true;
6367
6368 Register Dst1Reg;
6369
6370 // Insert after the instruction.
6371 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6372
6373 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6374 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6375 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6376 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6377
6378 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6379
6380 MI.getOperand(0).setReg(NewResultReg);
6381
6382 // In the IR, TFE is supposed to be used with a 2-element struct return
6383 // type. The instruction really returns these two values in one contiguous
6384 // register, with one additional dword beyond the loaded data. Rewrite the
6385 // return type to use a single register result.
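// E.g., a TFE load of <4 x s32> data uses one <5 x s32> result register; the
// first four elements are unmerged into the data registers and the trailing
// dword feeds the separate TFE status destination.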
6386
6387 if (IsTFE) {
6388 Dst1Reg = MI.getOperand(1).getReg();
6389 if (MRI->getType(Dst1Reg) != S32)
6390 return false;
6391
6392 // TODO: Make sure the TFE operand bit is set.
6393 MI.removeOperand(1);
6394
6395 // Handle the easy case that requires no repack instructions.
6396 if (Ty == S32) {
6397 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6398 return true;
6399 }
6400 }
6401
6402 // Now figure out how to copy the new result register back into the old
6403 // result.
6404 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6405
6406 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6407
6408 if (ResultNumRegs == 1) {
6409 assert(!IsTFE);
6410 ResultRegs[0] = NewResultReg;
6411 } else {
6412 // We have to repack into a new vector of some kind.
6413 for (int I = 0; I != NumDataRegs; ++I)
6414 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6415 B.buildUnmerge(ResultRegs, NewResultReg);
6416
6417 // Drop the final TFE element to get the data part. The TFE result is
6418 // directly written to the right place already.
6419 if (IsTFE)
6420 ResultRegs.resize(NumDataRegs);
6421 }
6422
6423 // For an s16 scalar result, we form an s32 result with a truncate regardless
6424 // of packed vs. unpacked.
6425 if (IsD16 && !Ty.isVector()) {
6426 B.buildTrunc(DstReg, ResultRegs[0]);
6427 return true;
6428 }
6429
6430 // Avoid a build/concat_vector of 1 entry.
6431 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6432 B.buildBitcast(DstReg, ResultRegs[0]);
6433 return true;
6434 }
6435
6436 assert(Ty.isVector());
6437
6438 if (IsD16) {
6439 // For packed D16 results with TFE enabled, all the data components are
6440 // S32. Cast back to the expected type.
6441 //
6442 // TODO: We don't really need to load s32 elements. We would only need one
6443 // cast for the TFE result if a multiple of v2s16 was used.
6444 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6445 for (Register &Reg : ResultRegs)
6446 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6447 } else if (ST.hasUnpackedD16VMem()) {
6448 for (Register &Reg : ResultRegs)
6449 Reg = B.buildTrunc(S16, Reg).getReg(0);
6450 }
6451 }
6452
6453 auto padWithUndef = [&](LLT Ty, int NumElts) {
6454 if (NumElts == 0)
6455 return;
6456 Register Undef = B.buildUndef(Ty).getReg(0);
6457 for (int I = 0; I != NumElts; ++I)
6458 ResultRegs.push_back(Undef);
6459 };
6460
6461 // Pad out any elements eliminated due to the dmask.
6462 LLT ResTy = MRI->getType(ResultRegs[0]);
6463 if (!ResTy.isVector()) {
6464 padWithUndef(ResTy, NumElts - ResultRegs.size());
6465 B.buildBuildVector(DstReg, ResultRegs);
6466 return true;
6467 }
6468
6469 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6470 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6471
6472 // Deal with the one annoying legal case.
6473 const LLT V3S16 = LLT::fixed_vector(3, 16);
6474 if (Ty == V3S16) {
6475 if (IsTFE) {
6476 if (ResultRegs.size() == 1) {
6477 NewResultReg = ResultRegs[0];
6478 } else if (ResultRegs.size() == 2) {
6479 LLT V4S16 = LLT::fixed_vector(4, 16);
6480 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6481 } else {
6482 return false;
6483 }
6484 }
6485
6486 if (MRI->getType(DstReg).getNumElements() <
6487 MRI->getType(NewResultReg).getNumElements()) {
6488 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6489 } else {
6490 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6491 }
6492 return true;
6493 }
6494
6495 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6496 B.buildConcatVectors(DstReg, ResultRegs);
6497 return true;
6498 }
6499
6500 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6501 MachineInstr &MI) const {
6502 MachineIRBuilder &B = Helper.MIRBuilder;
6503 GISelChangeObserver &Observer = Helper.Observer;
6504
6505 Register OrigDst = MI.getOperand(0).getReg();
6506 Register Dst;
6507 LLT Ty = B.getMRI()->getType(OrigDst);
6508 unsigned Size = Ty.getSizeInBits();
6509 MachineFunction &MF = B.getMF();
6510 unsigned Opc = 0;
6511 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6512 assert(Size == 8 || Size == 16);
6513 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6514 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6515 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6516 // destination register.
6517 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6518 } else {
6519 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6520 Dst = OrigDst;
6521 }
6522
6523 Observer.changingInstr(MI);
6524
6525 // Handle needing to s.buffer.load() a p8 value.
6526 if (hasBufferRsrcWorkaround(Ty)) {
6527 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6528 B.setInsertPt(B.getMBB(), MI);
6529 }
6530 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6531 Ty = getBitcastRegisterType(Ty);
6532 Helper.bitcastDst(MI, Ty, 0);
6533 B.setInsertPt(B.getMBB(), MI);
6534 }
6535
6536 // FIXME: We don't really need this intermediate instruction. The intrinsic
6537 // should be fixed to have a memory operand. Since it's readnone, we're not
6538 // allowed to add one.
6539 MI.setDesc(B.getTII().get(Opc));
6540 MI.removeOperand(1); // Remove intrinsic ID
6541
6542 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6543 // TODO: Should this use datalayout alignment?
6544 const unsigned MemSize = (Size + 7) / 8;
6545 const Align MemAlign(std::min(MemSize, 4u));
6546 MachineMemOperand *MMO = MF.getMachineMemOperand(
6547 MachinePointerInfo(),
6548 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6549 MachineMemOperand::MOInvariant,
6550 MemSize, MemAlign);
6551 MI.addMemOperand(MF, MMO);
6552 if (Dst != OrigDst) {
6553 MI.getOperand(0).setReg(Dst);
6554 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6555 B.buildTrunc(OrigDst, Dst);
6556 }
6557
6558 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6559 // always be legal. We may need to restore this to a 96-bit result if it turns
6560 // out this needs to be converted to a vector load during RegBankSelect.
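// E.g., an s96 result is widened to s128 and a <3 x s32> result is padded to
// <4 x s32>.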
6561 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6562 if (Ty.isVector())
6563 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6564 else
6565 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6566 }
6567
6568 Observer.changedInstr(MI);
6569 return true;
6570 }
6571
6572 // TODO: Move to selection
6573 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
6574 MachineRegisterInfo &MRI,
6575 MachineIRBuilder &B) const {
6576 if (!ST.isTrapHandlerEnabled() ||
6577 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6578 return legalizeTrapEndpgm(MI, MRI, B);
6579
6580 return ST.supportsGetDoorbellID() ?
6581 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6582 }
6583
6584 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6585 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6586 const DebugLoc &DL = MI.getDebugLoc();
6587 MachineBasicBlock &BB = B.getMBB();
6588 MachineFunction *MF = BB.getParent();
6589
6590 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6591 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6592 .addImm(0);
6593 MI.eraseFromParent();
6594 return true;
6595 }
6596
6597 // We need a block split to make the real endpgm a terminator. We also don't
6598 // want to break phis in successor blocks, so we can't just delete to the
6599 // end of the block.
6600 BB.splitAt(MI, false /*UpdateLiveIns*/);
6601 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6602 MF->push_back(TrapBB);
6603 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6604 .addImm(0);
6605 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6606 .addMBB(TrapBB);
6607
6608 BB.addSuccessor(TrapBB);
6609 MI.eraseFromParent();
6610 return true;
6611 }
6612
6613 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6614 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6615 MachineFunction &MF = B.getMF();
6616 const LLT S64 = LLT::scalar(64);
6617
6618 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6619 // For code object version 5, queue_ptr is passed through implicit kernarg.
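// That is, read the 64-bit queue pointer from the implicit kernarg area (at
// the QUEUE_PTR offset past the kernarg segment pointer) and pass it to the
// trap handler in SGPR0_SGPR1.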
6620 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6621 AMDGPU::AMDHSA_COV5) {
6622 AMDGPUTargetLowering::ImplicitParameter Param =
6623 AMDGPUTargetLowering::QUEUE_PTR;
6624 uint64_t Offset =
6625 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6626
6627 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6628 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6629
6630 if (!loadInputValue(KernargPtrReg, B,
6631 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6632 return false;
6633
6634 // TODO: can we be smarter about machine pointer info?
6635 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6636 MachineMemOperand *MMO = MF.getMachineMemOperand(
6637 PtrInfo,
6638 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6639 MachineMemOperand::MOInvariant,
6640 LLT::scalar(64), commonAlignment(Align(64), Offset));
6641
6642 // Pointer address
6643 Register LoadAddr = MRI.createGenericVirtualRegister(
6644 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6645 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6646 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6647 // Load address
6648 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6649 B.buildCopy(SGPR01, Temp);
6650 B.buildInstr(AMDGPU::S_TRAP)
6651 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6652 .addReg(SGPR01, RegState::Implicit);
6653 MI.eraseFromParent();
6654 return true;
6655 }
6656
6657 // Pass queue pointer to trap handler as input, and insert trap instruction
6658 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6659 Register LiveIn =
6660 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6661 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6662 return false;
6663
6664 B.buildCopy(SGPR01, LiveIn);
6665 B.buildInstr(AMDGPU::S_TRAP)
6666 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6667 .addReg(SGPR01, RegState::Implicit);
6668
6669 MI.eraseFromParent();
6670 return true;
6671 }
6672
6673 bool AMDGPULegalizerInfo::legalizeTrapHsa(
6674 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6675 B.buildInstr(AMDGPU::S_TRAP)
6676 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6677 MI.eraseFromParent();
6678 return true;
6679 }
6680
6681 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
6682 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6683 // If this is a non-HSA path or the trap handler is disabled, report a
6684 // warning accordingly.
6685 if (!ST.isTrapHandlerEnabled() ||
6686 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6687 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6688 "debugtrap handler not supported",
6689 MI.getDebugLoc(), DS_Warning);
6690 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6691 Ctx.diagnose(NoTrap);
6692 } else {
6693 // Insert debug-trap instruction
6694 B.buildInstr(AMDGPU::S_TRAP)
6695 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6696 }
6697
6698 MI.eraseFromParent();
6699 return true;
6700 }
6701
6702 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6703 MachineIRBuilder &B) const {
6704 MachineRegisterInfo &MRI = *B.getMRI();
6705 const LLT S16 = LLT::scalar(16);
6706 const LLT S32 = LLT::scalar(32);
6707 const LLT V2S16 = LLT::fixed_vector(2, 16);
6708 const LLT V3S32 = LLT::fixed_vector(3, 32);
6709
6710 Register DstReg = MI.getOperand(0).getReg();
6711 Register NodePtr = MI.getOperand(2).getReg();
6712 Register RayExtent = MI.getOperand(3).getReg();
6713 Register RayOrigin = MI.getOperand(4).getReg();
6714 Register RayDir = MI.getOperand(5).getReg();
6715 Register RayInvDir = MI.getOperand(6).getReg();
6716 Register TDescr = MI.getOperand(7).getReg();
6717
6718 if (!ST.hasGFX10_AEncoding()) {
6719 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6720 "intrinsic not supported on subtarget",
6721 MI.getDebugLoc());
6722 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6723 return false;
6724 }
6725
6726 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6727 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6728 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6729 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6730 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6731 const unsigned NumVDataDwords = 4;
6732 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
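// E.g., with a 64-bit node pointer and 16-bit ray directions: 2 (node ptr) +
// 1 (extent) + 3 (origin) + 3 (packed dir/inv_dir) = 9 address dwords.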
6733 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6734 const bool UseNSA =
6735 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6736
6737 const unsigned BaseOpcodes[2][2] = {
6738 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6739 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6740 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6741 int Opcode;
6742 if (UseNSA) {
6743 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6744 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6745 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6746 : AMDGPU::MIMGEncGfx10NSA,
6747 NumVDataDwords, NumVAddrDwords);
6748 } else {
6749 assert(!IsGFX12Plus);
6750 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6751 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6752 : AMDGPU::MIMGEncGfx10Default,
6753 NumVDataDwords, NumVAddrDwords);
6754 }
6755 assert(Opcode != -1);
6756
6757 SmallVector<Register, 12> Ops;
6758 if (UseNSA && IsGFX11Plus) {
6759 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6760 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6761 auto Merged = B.buildMergeLikeInstr(
6762 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6763 Ops.push_back(Merged.getReg(0));
6764 };
6765
6766 Ops.push_back(NodePtr);
6767 Ops.push_back(RayExtent);
6768 packLanes(RayOrigin);
6769
6770 if (IsA16) {
6771 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6772 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6773 auto MergedDir = B.buildMergeLikeInstr(
6774 V3S32,
6775 {B.buildBitcast(
6776 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
6777 UnmergeRayDir.getReg(0)}))
6778 .getReg(0),
6779 B.buildBitcast(
6780 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
6781 UnmergeRayDir.getReg(1)}))
6782 .getReg(0),
6783 B.buildBitcast(
6784 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
6785 UnmergeRayDir.getReg(2)}))
6786 .getReg(0)});
6787 Ops.push_back(MergedDir.getReg(0));
6788 } else {
6789 packLanes(RayDir);
6790 packLanes(RayInvDir);
6791 }
6792 } else {
6793 if (Is64) {
6794 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6795 Ops.push_back(Unmerge.getReg(0));
6796 Ops.push_back(Unmerge.getReg(1));
6797 } else {
6798 Ops.push_back(NodePtr);
6799 }
6800 Ops.push_back(RayExtent);
6801
6802 auto packLanes = [&Ops, &S32, &B](Register Src) {
6803 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6804 Ops.push_back(Unmerge.getReg(0));
6805 Ops.push_back(Unmerge.getReg(1));
6806 Ops.push_back(Unmerge.getReg(2));
6807 };
6808
6809 packLanes(RayOrigin);
6810 if (IsA16) {
6811 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6812 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6813 Register R1 = MRI.createGenericVirtualRegister(S32);
6814 Register R2 = MRI.createGenericVirtualRegister(S32);
6815 Register R3 = MRI.createGenericVirtualRegister(S32);
6816 B.buildMergeLikeInstr(R1,
6817 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6818 B.buildMergeLikeInstr(
6819 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6820 B.buildMergeLikeInstr(
6821 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6822 Ops.push_back(R1);
6823 Ops.push_back(R2);
6824 Ops.push_back(R3);
6825 } else {
6826 packLanes(RayDir);
6827 packLanes(RayInvDir);
6828 }
6829 }
6830
6831 if (!UseNSA) {
6832 // Build a single vector containing all the operands prepared so far.
6833 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6834 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6835 Ops.clear();
6836 Ops.push_back(MergedOps);
6837 }
6838
6839 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6840 .addDef(DstReg)
6841 .addImm(Opcode);
6842
6843 for (Register R : Ops) {
6844 MIB.addUse(R);
6845 }
6846
6847 MIB.addUse(TDescr)
6848 .addImm(IsA16 ? 1 : 0)
6849 .cloneMemRefs(MI);
6850
6851 MI.eraseFromParent();
6852 return true;
6853 }
6854
6855 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
6856 MachineIRBuilder &B) const {
6857 unsigned Opc;
6858 int RoundMode = MI.getOperand(2).getImm();
6859
6860 if (RoundMode == (int)RoundingMode::TowardPositive)
6861 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
6862 else if (RoundMode == (int)RoundingMode::TowardNegative)
6863 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
6864 else
6865 return false;
6866
6867 B.buildInstr(Opc)
6868 .addDef(MI.getOperand(0).getReg())
6869 .addUse(MI.getOperand(1).getReg());
6870
6871 MI.eraseFromParent();
6872
6873 return true;
6874 }
6875
6876 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
6877 MachineIRBuilder &B) const {
6878 const SITargetLowering *TLI = ST.getTargetLowering();
6879 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
6880 Register DstReg = MI.getOperand(0).getReg();
6881 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
6882 MI.eraseFromParent();
6883 return true;
6884 }
6885
6886 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
6887 MachineIRBuilder &B) const {
6888 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
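// That is, the wave ID is (TTMP8 >> 25) & 0x1f, extracted below with a
// 5-bit G_UBFX.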
6889 if (!ST.hasArchitectedSGPRs())
6890 return false;
6891 LLT S32 = LLT::scalar(32);
6892 Register DstReg = MI.getOperand(0).getReg();
6893 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
6894 auto LSB = B.buildConstant(S32, 25);
6895 auto Width = B.buildConstant(S32, 5);
6896 B.buildUbfx(DstReg, TTMP8, LSB, Width);
6897 MI.eraseFromParent();
6898 return true;
6899 }
6900
6901 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
6902 MachineInstr &MI) const {
6903 MachineIRBuilder &B = Helper.MIRBuilder;
6904 MachineRegisterInfo &MRI = *B.getMRI();
6905
6906 // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
6907 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
6908 switch (IntrID) {
6909 case Intrinsic::amdgcn_if:
6910 case Intrinsic::amdgcn_else: {
6911 MachineInstr *Br = nullptr;
6912 MachineBasicBlock *UncondBrTarget = nullptr;
6913 bool Negated = false;
6914 if (MachineInstr *BrCond =
6915 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6916 const SIRegisterInfo *TRI
6917 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6918
6919 Register Def = MI.getOperand(1).getReg();
6920 Register Use = MI.getOperand(3).getReg();
6921
6922 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6923
6924 if (Negated)
6925 std::swap(CondBrTarget, UncondBrTarget);
6926
6927 B.setInsertPt(B.getMBB(), BrCond->getIterator());
6928 if (IntrID == Intrinsic::amdgcn_if) {
6929 B.buildInstr(AMDGPU::SI_IF)
6930 .addDef(Def)
6931 .addUse(Use)
6932 .addMBB(UncondBrTarget);
6933 } else {
6934 B.buildInstr(AMDGPU::SI_ELSE)
6935 .addDef(Def)
6936 .addUse(Use)
6937 .addMBB(UncondBrTarget);
6938 }
6939
6940 if (Br) {
6941 Br->getOperand(0).setMBB(CondBrTarget);
6942 } else {
6943 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
6944 // since we're swapping branch targets it needs to be reinserted.
6945 // FIXME: IRTranslator should probably not do this
6946 B.buildBr(*CondBrTarget);
6947 }
6948
6949 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
6950 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
6951 MI.eraseFromParent();
6952 BrCond->eraseFromParent();
6953 return true;
6954 }
6955
6956 return false;
6957 }
6958 case Intrinsic::amdgcn_loop: {
6959 MachineInstr *Br = nullptr;
6960 MachineBasicBlock *UncondBrTarget = nullptr;
6961 bool Negated = false;
6962 if (MachineInstr *BrCond =
6963 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6964 const SIRegisterInfo *TRI
6965 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6966
6967 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6968 Register Reg = MI.getOperand(2).getReg();
6969
6970 if (Negated)
6971 std::swap(CondBrTarget, UncondBrTarget);
6972
6973 B.setInsertPt(B.getMBB(), BrCond->getIterator());
6974 B.buildInstr(AMDGPU::SI_LOOP)
6975 .addUse(Reg)
6976 .addMBB(UncondBrTarget);
6977
6978 if (Br)
6979 Br->getOperand(0).setMBB(CondBrTarget);
6980 else
6981 B.buildBr(*CondBrTarget);
6982
6983 MI.eraseFromParent();
6984 BrCond->eraseFromParent();
6985 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
6986 return true;
6987 }
6988
6989 return false;
6990 }
6991 case Intrinsic::amdgcn_make_buffer_rsrc:
6992 return legalizePointerAsRsrcIntrin(MI, MRI, B);
6993 case Intrinsic::amdgcn_kernarg_segment_ptr:
6994 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
6995 // This only makes sense to call in a kernel, so just lower to null.
6996 B.buildConstant(MI.getOperand(0).getReg(), 0);
6997 MI.eraseFromParent();
6998 return true;
6999 }
7000
7001 return legalizePreloadedArgIntrin(
7002 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7003 case Intrinsic::amdgcn_implicitarg_ptr:
7004 return legalizeImplicitArgPtr(MI, MRI, B);
7005 case Intrinsic::amdgcn_workitem_id_x:
7006 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7007 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7008 case Intrinsic::amdgcn_workitem_id_y:
7009 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7010 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7011 case Intrinsic::amdgcn_workitem_id_z:
7012 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7013 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7014 case Intrinsic::amdgcn_workgroup_id_x:
7015 return legalizePreloadedArgIntrin(MI, MRI, B,
7016 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7017 case Intrinsic::amdgcn_workgroup_id_y:
7018 return legalizePreloadedArgIntrin(MI, MRI, B,
7019 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7020 case Intrinsic::amdgcn_workgroup_id_z:
7021 return legalizePreloadedArgIntrin(MI, MRI, B,
7022 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7023 case Intrinsic::amdgcn_wave_id:
7024 return legalizeWaveID(MI, B);
7025 case Intrinsic::amdgcn_lds_kernel_id:
7026 return legalizePreloadedArgIntrin(MI, MRI, B,
7027 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7028 case Intrinsic::amdgcn_dispatch_ptr:
7029 return legalizePreloadedArgIntrin(MI, MRI, B,
7030 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7031 case Intrinsic::amdgcn_queue_ptr:
7032 return legalizePreloadedArgIntrin(MI, MRI, B,
7033 AMDGPUFunctionArgInfo::QUEUE_PTR);
7034 case Intrinsic::amdgcn_implicit_buffer_ptr:
7035 return legalizePreloadedArgIntrin(
7036 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7037 case Intrinsic::amdgcn_dispatch_id:
7038 return legalizePreloadedArgIntrin(MI, MRI, B,
7039 AMDGPUFunctionArgInfo::DISPATCH_ID);
7040 case Intrinsic::r600_read_ngroups_x:
7041 // TODO: Emit error for hsa
7042 return legalizeKernargMemParameter(MI, B,
7043 SI::KernelInputOffsets::NGROUPS_X);
7044 case Intrinsic::r600_read_ngroups_y:
7045 return legalizeKernargMemParameter(MI, B,
7046 SI::KernelInputOffsets::NGROUPS_Y);
7047 case Intrinsic::r600_read_ngroups_z:
7048 return legalizeKernargMemParameter(MI, B,
7049 SI::KernelInputOffsets::NGROUPS_Z);
7050 case Intrinsic::r600_read_local_size_x:
7051 // TODO: Could insert G_ASSERT_ZEXT from s16
7052 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7053 case Intrinsic::r600_read_local_size_y:
7054 // TODO: Could insert G_ASSERT_ZEXT from s16
7055 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7056 // TODO: Could insert G_ASSERT_ZEXT from s16
7057 case Intrinsic::r600_read_local_size_z:
7058 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7059 case Intrinsic::r600_read_global_size_x:
7060 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7061 case Intrinsic::r600_read_global_size_y:
7062 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7063 case Intrinsic::r600_read_global_size_z:
7064 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7065 case Intrinsic::amdgcn_fdiv_fast:
7066 return legalizeFDIVFastIntrin(MI, MRI, B);
7067 case Intrinsic::amdgcn_is_shared:
7068 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7069 case Intrinsic::amdgcn_is_private:
7070 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7071 case Intrinsic::amdgcn_wavefrontsize: {
7072 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7073 MI.eraseFromParent();
7074 return true;
7075 }
7076 case Intrinsic::amdgcn_s_buffer_load:
7077 return legalizeSBufferLoad(Helper, MI);
7078 case Intrinsic::amdgcn_raw_buffer_store:
7079 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7080 case Intrinsic::amdgcn_struct_buffer_store:
7081 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7082 return legalizeBufferStore(MI, MRI, B, false, false);
7083 case Intrinsic::amdgcn_raw_buffer_store_format:
7084 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7085 case Intrinsic::amdgcn_struct_buffer_store_format:
7086 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7087 return legalizeBufferStore(MI, MRI, B, false, true);
7088 case Intrinsic::amdgcn_raw_tbuffer_store:
7089 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7090 case Intrinsic::amdgcn_struct_tbuffer_store:
7091 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7092 return legalizeBufferStore(MI, MRI, B, true, true);
7093 case Intrinsic::amdgcn_raw_buffer_load:
7094 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7095 case Intrinsic::amdgcn_struct_buffer_load:
7096 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7097 return legalizeBufferLoad(MI, MRI, B, false, false);
7098 case Intrinsic::amdgcn_raw_buffer_load_format:
7099 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7100 case Intrinsic::amdgcn_struct_buffer_load_format:
7101 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7102 return legalizeBufferLoad(MI, MRI, B, true, false);
7103 case Intrinsic::amdgcn_raw_tbuffer_load:
7104 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7105 case Intrinsic::amdgcn_struct_tbuffer_load:
7106 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7107 return legalizeBufferLoad(MI, MRI, B, true, true);
7108 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7109 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7110 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7111 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7112 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7113 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7114 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7115 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7116 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7117 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7118 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7119 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7120 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7122 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7124 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7125 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7126 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7127 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7128 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7129 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7130 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7131 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7132 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7133 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7134 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7135 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7136 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7137 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7138 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7139 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7140 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7141 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7142 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7143 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7144 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7145 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7146 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7147 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7148 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7149 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7150 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7151 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7152 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7153 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7154 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7155 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7156 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7157 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7158 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7159 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7160 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7161 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7162 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7163 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7164 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7165 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7166 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7167 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7168 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7169 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7170 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7171 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7172 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
7173 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
7174 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
7175 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
7176 return legalizeBufferAtomic(MI, B, IntrID);
7177 case Intrinsic::trap:
7178 return legalizeTrapIntrinsic(MI, MRI, B);
7179 case Intrinsic::debugtrap:
7180 return legalizeDebugTrapIntrinsic(MI, MRI, B);
7181 case Intrinsic::amdgcn_rsq_clamp:
7182 return legalizeRsqClampIntrinsic(MI, MRI, B);
7183 case Intrinsic::amdgcn_ds_fadd:
7184 case Intrinsic::amdgcn_ds_fmin:
7185 case Intrinsic::amdgcn_ds_fmax:
7186 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
7187 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7188 return legalizeBVHIntrinsic(MI, B);
7189 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7190 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7191 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7192 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7193 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7194 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7195 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7196 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7197 Register Index = MI.getOperand(5).getReg();
7198 LLT S32 = LLT::scalar(32);
7199 if (MRI.getType(Index) != S32)
7200 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7201 return true;
7202 }
7203 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7204 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7205 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7206 Register Index = MI.getOperand(7).getReg();
7207 LLT S32 = LLT::scalar(32);
7208 if (MRI.getType(Index) != S32)
7209 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7210 return true;
7211 }
7212 case Intrinsic::amdgcn_fmed3: {
7213 GISelChangeObserver &Observer = Helper.Observer;
7214
7215 // FIXME: This is to work around the inability of tablegen match combiners to
7216 // match intrinsics in patterns.
7217 Observer.changingInstr(MI);
7218 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7219 MI.removeOperand(1);
7220 Observer.changedInstr(MI);
7221 return true;
7222 }
7223 default: {
7224 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7225 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7226 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7227 return true;
7228 }
7229 }
7230
7231 return true;
7232 }
7233