1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use"
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46 static constexpr unsigned MaxRegisterSize = 1024;
47
48 // Round the number of elements to the next power of two elements
getPow2VectorType(LLT Ty)49 static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54
55 // Round the number of bits to the next power of two bits
getPow2ScalarType(LLT Ty)56 static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60 }
61
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
isSmallOddVector(unsigned TypeIdx)65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77 }
78
sizeIsMultipleOf32(unsigned TypeIdx)79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84 }
85
isWideVec16(unsigned TypeIdx)86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92 }
93
oneMoreElement(unsigned TypeIdx)94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx,
99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100 };
101 }
102
fewerEltsToSize64Vector(unsigned TypeIdx)103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104 return [=](const LegalityQuery &Query) {
105 const LLT Ty = Query.Types[TypeIdx];
106 const LLT EltTy = Ty.getElementType();
107 unsigned Size = Ty.getSizeInBits();
108 unsigned Pieces = (Size + 63) / 64;
109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110 return std::make_pair(
111 TypeIdx,
112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113 };
114 }
115
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
moreEltsToNext32Bit(unsigned TypeIdx)118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
126
127 assert(EltSize < 32);
128
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131 };
132 }
133
/// Map \p Ty onto the register-friendly type used when legalizing via bitcast:
/// types of 32 bits or fewer become a single scalar of the same total width
/// (e.g. <2 x s8> -> s16, <4 x s8> -> s32); anything wider becomes a scalar or
/// vector of 32-bit elements.
///
/// Fix: removed the unused local `LLT CoercedTy;` the original declared and
/// never read.
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
146
bitcastToRegisterType(unsigned TypeIdx)147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148 return [=](const LegalityQuery &Query) {
149 const LLT Ty = Query.Types[TypeIdx];
150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151 };
152 }
153
bitcastToVectorElement32(unsigned TypeIdx)154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155 return [=](const LegalityQuery &Query) {
156 const LLT Ty = Query.Types[TypeIdx];
157 unsigned Size = Ty.getSizeInBits();
158 assert(Size % 32 == 0);
159 return std::make_pair(
160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161 };
162 }
163
vectorSmallerThan(unsigned TypeIdx,unsigned Size)164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165 return [=](const LegalityQuery &Query) {
166 const LLT QueryTy = Query.Types[TypeIdx];
167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168 };
169 }
170
vectorWiderThan(unsigned TypeIdx,unsigned Size)171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172 return [=](const LegalityQuery &Query) {
173 const LLT QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175 };
176 }
177
numElementsNotEven(unsigned TypeIdx)178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182 };
183 }
184
isRegisterSize(unsigned Size)185 static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188
isRegisterVectorElementType(LLT EltTy)189 static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
192 }
193
isRegisterVectorType(LLT Ty)194 static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
199 }
200
isRegisterType(LLT Ty)201 static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
204
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
207
208 return true;
209 }
210
211 // Any combination of 32 or 64-bit elements up the maximum register size, and
212 // multiples of v2s16.
isRegisterType(unsigned TypeIdx)213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
216 };
217 }
218
elementTypeIsLegal(unsigned TypeIdx)219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226 };
227 }
228
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
isWideScalarExtLoadTruncStore(unsigned TypeIdx)231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236 };
237 }
238
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
maxSizeForAddrSpace(const GCNSubtarget & ST,unsigned AS,bool IsLoad)242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
264 }
265 }
266
/// Return true when a load/store with the register type, memory size,
/// alignment, and address space described by \p Query is a combination the
/// target can execute as a single legal access; combinations returning false
/// are split, widened, or custom lowered elsewhere.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than one operation supports in this address space.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // dwordx3 accesses only exist on subtargets that report support.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target explicitly allows the
  // misaligned form for this size and address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
332
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334 // workaround this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will workaround this for now by
336 // bitcasting.
loadStoreBitcastWorkaround(const LLT Ty)337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
340
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
346
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
350
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
353 }
354
isLoadStoreLegal(const GCNSubtarget & ST,const LegalityQuery & Query)355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
359 }
360
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  // Extending/truncating access: only bitcast small (<= 32-bit) vectors, e.g.
  // <4 x s8> extloads; wider mismatched accesses are handled elsewhere.
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  // Types the selector can't handle directly are bitcast when they are still
  // register-compatible.
  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
378
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note this case when the memory access itself
381 /// changes, not the size of the result register.
shouldWidenLoad(const GCNSubtarget & ST,LLT MemoryTy,unsigned AlignInBits,unsigned AddrSpace,unsigned Opcode)382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383 unsigned AlignInBits, unsigned AddrSpace,
384 unsigned Opcode) {
385 unsigned SizeInBits = MemoryTy.getSizeInBits();
386 // We don't want to widen cases that are naturally legal.
387 if (isPowerOf2_32(SizeInBits))
388 return false;
389
390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391 // end up widening these for a scalar load during RegBankSelect, since there
392 // aren't 96-bit scalar loads.
393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394 return false;
395
396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397 return false;
398
399 // A load is known dereferenceable up to the alignment, so it's legal to widen
400 // to it.
401 //
402 // TODO: Could check dereferenceable for less aligned cases.
403 unsigned RoundedSize = NextPowerOf2(SizeInBits);
404 if (AlignInBits < RoundedSize)
405 return false;
406
407 // Do not widen if it would introduce a slow unaligned load.
408 const SITargetLowering *TLI = ST.getTargetLowering();
409 bool Fast = false;
410 return TLI->allowsMisalignedMemoryAccessesImpl(
411 RoundedSize, AddrSpace, Align(AlignInBits / 8),
412 MachineMemOperand::MOLoad, &Fast) &&
413 Fast;
414 }
415
shouldWidenLoad(const GCNSubtarget & ST,const LegalityQuery & Query,unsigned Opcode)416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
420
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
424 }
425
AMDGPULegalizerInfo(const GCNSubtarget & ST_,const GCNTargetMachine & TM)426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
430
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433 };
434
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
448
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 = LLT::fixed_vector(32, 32);
465
466 const LLT V2S64 = LLT::fixed_vector(2, 64);
467 const LLT V3S64 = LLT::fixed_vector(3, 64);
468 const LLT V4S64 = LLT::fixed_vector(4, 64);
469 const LLT V5S64 = LLT::fixed_vector(5, 64);
470 const LLT V6S64 = LLT::fixed_vector(6, 64);
471 const LLT V7S64 = LLT::fixed_vector(7, 64);
472 const LLT V8S64 = LLT::fixed_vector(8, 64);
473 const LLT V16S64 = LLT::fixed_vector(16, 64);
474
475 std::initializer_list<LLT> AllS32Vectors =
476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478 std::initializer_list<LLT> AllS64Vectors =
479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480
481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488
489 const LLT CodePtr = FlatPtr;
490
491 const std::initializer_list<LLT> AddrSpaces64 = {
492 GlobalPtr, ConstantPtr, FlatPtr
493 };
494
495 const std::initializer_list<LLT> AddrSpaces32 = {
496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497 };
498
499 const std::initializer_list<LLT> FPTypesBase = {
500 S32, S64
501 };
502
503 const std::initializer_list<LLT> FPTypes16 = {
504 S32, S64, S16
505 };
506
507 const std::initializer_list<LLT> FPTypesPK16 = {
508 S32, S64, S16, V2S16
509 };
510
511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512
513 // s1 for VCC branches, s32 for SCC branches.
514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515
516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518 getActionDefinitionsBuilder(G_PHI)
519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520 .legalFor(AllS32Vectors)
521 .legalFor(AllS64Vectors)
522 .legalFor(AddrSpaces64)
523 .legalFor(AddrSpaces32)
524 .legalIf(isPointer(0))
525 .clampScalar(0, S16, S256)
526 .widenScalarToNextPow2(0, 32)
527 .clampMaxNumElements(0, S32, 16)
528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529 .scalarize(0);
530
531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532 // Full set of gfx9 features.
533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534 .legalFor({S32, S16, V2S16})
535 .minScalar(0, S16)
536 .clampMaxNumElements(0, S16, 2)
537 .widenScalarToNextMultipleOf(0, 32)
538 .maxScalar(0, S32)
539 .scalarize(0);
540
541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542 .legalFor({S32, S16, V2S16}) // Clamp modifier
543 .minScalarOrElt(0, S16)
544 .clampMaxNumElements(0, S16, 2)
545 .scalarize(0)
546 .widenScalarToNextPow2(0, 32)
547 .lower();
548 } else if (ST.has16BitInsts()) {
549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550 .legalFor({S32, S16})
551 .minScalar(0, S16)
552 .widenScalarToNextMultipleOf(0, 32)
553 .maxScalar(0, S32)
554 .scalarize(0);
555
556 // Technically the saturating operations require clamp bit support, but this
557 // was introduced at the same time as 16-bit operations.
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559 .legalFor({S32, S16}) // Clamp modifier
560 .minScalar(0, S16)
561 .scalarize(0)
562 .widenScalarToNextPow2(0, 16)
563 .lower();
564
565 // We're just lowering this, but it helps get a better result to try to
566 // coerce to the desired type first.
567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568 .minScalar(0, S16)
569 .scalarize(0)
570 .lower();
571 } else {
572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573 .legalFor({S32})
574 .widenScalarToNextMultipleOf(0, 32)
575 .clampScalar(0, S32, S32)
576 .scalarize(0);
577
578 if (ST.hasIntClamp()) {
579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580 .legalFor({S32}) // Clamp modifier.
581 .scalarize(0)
582 .minScalarOrElt(0, S32)
583 .lower();
584 } else {
585 // Clamp bit support was added in VI, along with 16-bit operations.
586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587 .minScalar(0, S32)
588 .scalarize(0)
589 .lower();
590 }
591
592 // FIXME: DAG expansion gets better results. The widening uses the smaller
593 // range values and goes for the min/max lowering directly.
594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595 .minScalar(0, S32)
596 .scalarize(0)
597 .lower();
598 }
599
600 getActionDefinitionsBuilder(
601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602 .customFor({S32, S64})
603 .clampScalar(0, S32, S64)
604 .widenScalarToNextPow2(0, 32)
605 .scalarize(0);
606
607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608 .legalFor({S32})
609 .maxScalar(0, S32);
610
611 if (ST.hasVOP3PInsts()) {
612 Mulh
613 .clampMaxNumElements(0, S8, 2)
614 .lowerFor({V2S8});
615 }
616
617 Mulh
618 .scalarize(0)
619 .lower();
620
621 // Report legal for any types we can handle anywhere. For the cases only legal
622 // on the SALU, RegBankSelect will be able to re-legalize.
623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625 .clampScalar(0, S32, S64)
626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628 .widenScalarToNextPow2(0)
629 .scalarize(0);
630
631 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633 .legalFor({{S32, S1}, {S32, S32}})
634 .minScalar(0, S32)
635 // TODO: .scalarize(0)
636 .lower();
637
638 getActionDefinitionsBuilder(G_BITCAST)
639 // Don't worry about the size constraint.
640 .legalIf(all(isRegisterType(0), isRegisterType(1)))
641 .lower();
642
643
644 getActionDefinitionsBuilder(G_CONSTANT)
645 .legalFor({S1, S32, S64, S16, GlobalPtr,
646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647 .legalIf(isPointer(0))
648 .clampScalar(0, S32, S64)
649 .widenScalarToNextPow2(0);
650
651 getActionDefinitionsBuilder(G_FCONSTANT)
652 .legalFor({S32, S64, S16})
653 .clampScalar(0, S16, S64);
654
655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656 .legalIf(isRegisterType(0))
657 // s1 and s16 are special cases because they have legal operations on
658 // them, but don't really occupy registers in the normal way.
659 .legalFor({S1, S16})
660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661 .clampScalarOrElt(0, S32, MaxScalar)
662 .widenScalarToNextPow2(0, 32)
663 .clampMaxNumElements(0, S32, 16);
664
665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666
667 // If the amount is divergent, we have to do a wave reduction to get the
668 // maximum value, so this is expanded during RegBankSelect.
669 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670 .legalFor({{PrivatePtr, S32}});
671
672 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673 .customIf(typeIsNot(0, PrivatePtr));
674
675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676
677 auto &FPOpActions = getActionDefinitionsBuilder(
678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679 .legalFor({S32, S64});
680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681 .customFor({S32, S64});
682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683 .customFor({S32, S64});
684
685 if (ST.has16BitInsts()) {
686 if (ST.hasVOP3PInsts())
687 FPOpActions.legalFor({S16, V2S16});
688 else
689 FPOpActions.legalFor({S16});
690
691 TrigActions.customFor({S16});
692 FDIVActions.customFor({S16});
693 }
694
695 auto &MinNumMaxNum = getActionDefinitionsBuilder({
696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697
698 if (ST.hasVOP3PInsts()) {
699 MinNumMaxNum.customFor(FPTypesPK16)
700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701 .clampMaxNumElements(0, S16, 2)
702 .clampScalar(0, S16, S64)
703 .scalarize(0);
704 } else if (ST.has16BitInsts()) {
705 MinNumMaxNum.customFor(FPTypes16)
706 .clampScalar(0, S16, S64)
707 .scalarize(0);
708 } else {
709 MinNumMaxNum.customFor(FPTypesBase)
710 .clampScalar(0, S32, S64)
711 .scalarize(0);
712 }
713
714 if (ST.hasVOP3PInsts())
715 FPOpActions.clampMaxNumElements(0, S16, 2);
716
717 FPOpActions
718 .scalarize(0)
719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720
721 TrigActions
722 .scalarize(0)
723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724
725 FDIVActions
726 .scalarize(0)
727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728
729 getActionDefinitionsBuilder({G_FNEG, G_FABS})
730 .legalFor(FPTypesPK16)
731 .clampMaxNumElements(0, S16, 2)
732 .scalarize(0)
733 .clampScalar(0, S16, S64);
734
735 if (ST.has16BitInsts()) {
736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737 .legalFor({S32, S64, S16})
738 .scalarize(0)
739 .clampScalar(0, S16, S64);
740 } else {
741 getActionDefinitionsBuilder(G_FSQRT)
742 .legalFor({S32, S64})
743 .scalarize(0)
744 .clampScalar(0, S32, S64);
745
746 if (ST.hasFractBug()) {
747 getActionDefinitionsBuilder(G_FFLOOR)
748 .customFor({S64})
749 .legalFor({S32, S64})
750 .scalarize(0)
751 .clampScalar(0, S32, S64);
752 } else {
753 getActionDefinitionsBuilder(G_FFLOOR)
754 .legalFor({S32, S64})
755 .scalarize(0)
756 .clampScalar(0, S32, S64);
757 }
758 }
759
760 getActionDefinitionsBuilder(G_FPTRUNC)
761 .legalFor({{S32, S64}, {S16, S32}})
762 .scalarize(0)
763 .lower();
764
765 getActionDefinitionsBuilder(G_FPEXT)
766 .legalFor({{S64, S32}, {S32, S16}})
767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768 .scalarize(0);
769
770 getActionDefinitionsBuilder(G_FSUB)
771 // Use actual fsub instruction
772 .legalFor({S32})
773 // Must use fadd + fneg
774 .lowerFor({S64, S16, V2S16})
775 .scalarize(0)
776 .clampScalar(0, S32, S64);
777
778 // Whether this is legal depends on the floating point mode for the function.
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781 FMad.customFor({S32, S16});
782 else if (ST.hasMadMacF32Insts())
783 FMad.customFor({S32});
784 else if (ST.hasMadF16())
785 FMad.customFor({S16});
786 FMad.scalarize(0)
787 .lower();
788
789 auto &FRem = getActionDefinitionsBuilder(G_FREM);
790 if (ST.has16BitInsts()) {
791 FRem.customFor({S16, S32, S64});
792 } else {
793 FRem.minScalar(0, S32)
794 .customFor({S32, S64});
795 }
796 FRem.scalarize(0);
797
798 // TODO: Do we need to clamp maximum bitwidth?
799 getActionDefinitionsBuilder(G_TRUNC)
800 .legalIf(isScalar(0))
801 .legalFor({{V2S16, V2S32}})
802 .clampMaxNumElements(0, S16, 2)
803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804 // situations (like an invalid implicit use), we don't want to infinite loop
805 // in the legalizer.
806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807 .alwaysLegal();
808
809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811 {S32, S1}, {S64, S1}, {S16, S1}})
812 .scalarize(0)
813 .clampScalar(0, S32, S64)
814 .widenScalarToNextPow2(1, 32);
815
816 // TODO: Split s1->s64 during regbankselect for VALU.
817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819 .lowerIf(typeIs(1, S1))
820 .customFor({{S32, S64}, {S64, S64}});
821 if (ST.has16BitInsts())
822 IToFP.legalFor({{S16, S16}});
823 IToFP.clampScalar(1, S32, S64)
824 .minScalar(0, S32)
825 .scalarize(0)
826 .widenScalarToNextPow2(1);
827
828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830 .customFor({{S64, S32}, {S64, S64}})
831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832 if (ST.has16BitInsts())
833 FPToI.legalFor({{S16, S16}});
834 else
835 FPToI.minScalar(1, S32);
836
837 FPToI.minScalar(0, S32)
838 .widenScalarToNextPow2(0, 32)
839 .scalarize(0)
840 .lower();
841
842 // Lower roundeven into G_FRINT
843 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844 .scalarize(0)
845 .lower();
846
847 if (ST.has16BitInsts()) {
848 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849 .legalFor({S16, S32, S64})
850 .clampScalar(0, S16, S64)
851 .scalarize(0);
852 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854 .legalFor({S32, S64})
855 .clampScalar(0, S32, S64)
856 .scalarize(0);
857 } else {
858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859 .legalFor({S32})
860 .customFor({S64})
861 .clampScalar(0, S32, S64)
862 .scalarize(0);
863 }
864
865 getActionDefinitionsBuilder(G_PTR_ADD)
866 .legalIf(all(isPointer(0), sameSize(0, 1)))
867 .scalarize(0)
868 .scalarSameSizeAs(1, 0);
869
870 getActionDefinitionsBuilder(G_PTRMASK)
871 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872 .scalarSameSizeAs(1, 0)
873 .scalarize(0);
874
875 auto &CmpBuilder =
876 getActionDefinitionsBuilder(G_ICMP)
877 // The compare output type differs based on the register bank of the output,
878 // so make both s1 and s32 legal.
879 //
880 // Scalar compares producing output in scc will be promoted to s32, as that
881 // is the allocatable register type that will be needed for the copy from
882 // scc. This will be promoted during RegBankSelect, and we assume something
883 // before that won't try to use s32 result types.
884 //
885 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886 // bank.
887 .legalForCartesianProduct(
888 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889 .legalForCartesianProduct(
890 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891 if (ST.has16BitInsts()) {
892 CmpBuilder.legalFor({{S1, S16}});
893 }
894
895 CmpBuilder
896 .widenScalarToNextPow2(1)
897 .clampScalar(1, S32, S64)
898 .scalarize(0)
899 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900
901 getActionDefinitionsBuilder(G_FCMP)
902 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903 .widenScalarToNextPow2(1)
904 .clampScalar(1, S32, S64)
905 .scalarize(0);
906
907 // FIXME: fpow has a selection pattern that should move to custom lowering.
908 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909 if (ST.has16BitInsts())
910 Exp2Ops.legalFor({S32, S16});
911 else
912 Exp2Ops.legalFor({S32});
913 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914 Exp2Ops.scalarize(0);
915
916 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917 if (ST.has16BitInsts())
918 ExpOps.customFor({{S32}, {S16}});
919 else
920 ExpOps.customFor({S32});
921 ExpOps.clampScalar(0, MinScalarFPTy, S32)
922 .scalarize(0);
923
924 getActionDefinitionsBuilder(G_FPOWI)
925 .clampScalar(0, MinScalarFPTy, S32)
926 .lower();
927
928 // The 64-bit versions produce 32-bit results, but only on the SALU.
929 getActionDefinitionsBuilder(G_CTPOP)
930 .legalFor({{S32, S32}, {S32, S64}})
931 .clampScalar(0, S32, S32)
932 .clampScalar(1, S32, S64)
933 .scalarize(0)
934 .widenScalarToNextPow2(0, 32)
935 .widenScalarToNextPow2(1, 32);
936
937 // The hardware instructions return a different result on 0 than the generic
938 // instructions expect. The hardware produces -1, but these produce the
939 // bitwidth.
940 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941 .scalarize(0)
942 .clampScalar(0, S32, S32)
943 .clampScalar(1, S32, S64)
944 .widenScalarToNextPow2(0, 32)
945 .widenScalarToNextPow2(1, 32)
946 .custom();
947
948 // The 64-bit versions produce 32-bit results, but only on the SALU.
949 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950 .legalFor({{S32, S32}, {S32, S64}})
951 .clampScalar(0, S32, S32)
952 .clampScalar(1, S32, S64)
953 .scalarize(0)
954 .widenScalarToNextPow2(0, 32)
955 .widenScalarToNextPow2(1, 32);
956
957 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958 // RegBankSelect.
959 getActionDefinitionsBuilder(G_BITREVERSE)
960 .legalFor({S32, S64})
961 .clampScalar(0, S32, S64)
962 .scalarize(0)
963 .widenScalarToNextPow2(0);
964
965 if (ST.has16BitInsts()) {
966 getActionDefinitionsBuilder(G_BSWAP)
967 .legalFor({S16, S32, V2S16})
968 .clampMaxNumElements(0, S16, 2)
969 // FIXME: Fixing non-power-of-2 before clamp is workaround for
970 // narrowScalar limitation.
971 .widenScalarToNextPow2(0)
972 .clampScalar(0, S16, S32)
973 .scalarize(0);
974
975 if (ST.hasVOP3PInsts()) {
976 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977 .legalFor({S32, S16, V2S16})
978 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979 .clampMaxNumElements(0, S16, 2)
980 .minScalar(0, S16)
981 .widenScalarToNextPow2(0)
982 .scalarize(0)
983 .lower();
984 } else {
985 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986 .legalFor({S32, S16})
987 .widenScalarToNextPow2(0)
988 .minScalar(0, S16)
989 .scalarize(0)
990 .lower();
991 }
992 } else {
993 // TODO: Should have same legality without v_perm_b32
994 getActionDefinitionsBuilder(G_BSWAP)
995 .legalFor({S32})
996 .lowerIf(scalarNarrowerThan(0, 32))
997 // FIXME: Fixing non-power-of-2 before clamp is workaround for
998 // narrowScalar limitation.
999 .widenScalarToNextPow2(0)
1000 .maxScalar(0, S32)
1001 .scalarize(0)
1002 .lower();
1003
1004 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005 .legalFor({S32})
1006 .minScalar(0, S32)
1007 .widenScalarToNextPow2(0)
1008 .scalarize(0)
1009 .lower();
1010 }
1011
1012 getActionDefinitionsBuilder(G_INTTOPTR)
1013 // List the common cases
1014 .legalForCartesianProduct(AddrSpaces64, {S64})
1015 .legalForCartesianProduct(AddrSpaces32, {S32})
1016 .scalarize(0)
1017 // Accept any address space as long as the size matches
1018 .legalIf(sameSize(0, 1))
1019 .widenScalarIf(smallerThan(1, 0),
1020 [](const LegalityQuery &Query) {
1021 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022 })
1023 .narrowScalarIf(largerThan(1, 0),
1024 [](const LegalityQuery &Query) {
1025 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026 });
1027
1028 getActionDefinitionsBuilder(G_PTRTOINT)
1029 // List the common cases
1030 .legalForCartesianProduct(AddrSpaces64, {S64})
1031 .legalForCartesianProduct(AddrSpaces32, {S32})
1032 .scalarize(0)
1033 // Accept any address space as long as the size matches
1034 .legalIf(sameSize(0, 1))
1035 .widenScalarIf(smallerThan(0, 1),
1036 [](const LegalityQuery &Query) {
1037 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038 })
1039 .narrowScalarIf(
1040 largerThan(0, 1),
1041 [](const LegalityQuery &Query) {
1042 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043 });
1044
1045 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046 .scalarize(0)
1047 .custom();
1048
1049 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050 bool IsLoad) -> bool {
1051 const LLT DstTy = Query.Types[0];
1052
1053 // Split vector extloads.
1054 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1056
1057 if (MemSize < DstTy.getSizeInBits())
1058 MemSize = std::max(MemSize, AlignBits);
1059
1060 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1061 return true;
1062
1063 const LLT PtrTy = Query.Types[1];
1064 unsigned AS = PtrTy.getAddressSpace();
1065 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1066 return true;
1067
1068 // Catch weird sized loads that don't evenly divide into the access sizes
1069 // TODO: May be able to widen depending on alignment etc.
1070 unsigned NumRegs = (MemSize + 31) / 32;
1071 if (NumRegs == 3) {
1072 if (!ST.hasDwordx3LoadStores())
1073 return true;
1074 } else {
1075 // If the alignment allows, these should have been widened.
1076 if (!isPowerOf2_32(NumRegs))
1077 return true;
1078 }
1079
1080 if (AlignBits < MemSize) {
1081 const SITargetLowering *TLI = ST.getTargetLowering();
1082 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1083 Align(AlignBits / 8));
1084 }
1085
1086 return false;
1087 };
1088
1089 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1090 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1091 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1092
1093 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1094 // LDS
1095 // TODO: Unsupported flat for SI.
1096
1097 for (unsigned Op : {G_LOAD, G_STORE}) {
1098 const bool IsStore = Op == G_STORE;
1099
1100 auto &Actions = getActionDefinitionsBuilder(Op);
1101 // Explicitly list some common cases.
1102 // TODO: Does this help compile time at all?
1103 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1104 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1105 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1106 {S64, GlobalPtr, S64, GlobalAlign32},
1107 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1108 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1109 {S32, GlobalPtr, S8, GlobalAlign8},
1110 {S32, GlobalPtr, S16, GlobalAlign16},
1111
1112 {S32, LocalPtr, S32, 32},
1113 {S64, LocalPtr, S64, 32},
1114 {V2S32, LocalPtr, V2S32, 32},
1115 {S32, LocalPtr, S8, 8},
1116 {S32, LocalPtr, S16, 16},
1117 {V2S16, LocalPtr, S32, 32},
1118
1119 {S32, PrivatePtr, S32, 32},
1120 {S32, PrivatePtr, S8, 8},
1121 {S32, PrivatePtr, S16, 16},
1122 {V2S16, PrivatePtr, S32, 32},
1123
1124 {S32, ConstantPtr, S32, GlobalAlign32},
1125 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1126 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1127 {S64, ConstantPtr, S64, GlobalAlign32},
1128 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1129 Actions.legalIf(
1130 [=](const LegalityQuery &Query) -> bool {
1131 return isLoadStoreLegal(ST, Query);
1132 });
1133
1134 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1135 // 64-bits.
1136 //
1137 // TODO: Should generalize bitcast action into coerce, which will also cover
1138 // inserting addrspacecasts.
1139 Actions.customIf(typeIs(1, Constant32Ptr));
1140
1141 // Turn any illegal element vectors into something easier to deal
1142 // with. These will ultimately produce 32-bit scalar shifts to extract the
1143 // parts anyway.
1144 //
1145 // For odd 16-bit element vectors, prefer to split those into pieces with
1146 // 16-bit vector parts.
1147 Actions.bitcastIf(
1148 [=](const LegalityQuery &Query) -> bool {
1149 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1150 Query.MMODescrs[0].MemoryTy);
1151 }, bitcastToRegisterType(0));
1152
1153 if (!IsStore) {
1154 // Widen suitably aligned loads by loading extra bytes. The standard
1155 // legalization actions can't properly express widening memory operands.
1156 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1157 return shouldWidenLoad(ST, Query, G_LOAD);
1158 });
1159 }
1160
1161 // FIXME: load/store narrowing should be moved to lower action
1162 Actions
1163 .narrowScalarIf(
1164 [=](const LegalityQuery &Query) -> bool {
1165 return !Query.Types[0].isVector() &&
1166 needToSplitMemOp(Query, Op == G_LOAD);
1167 },
1168 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1169 const LLT DstTy = Query.Types[0];
1170 const LLT PtrTy = Query.Types[1];
1171
1172 const unsigned DstSize = DstTy.getSizeInBits();
1173 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1174
1175 // Split extloads.
1176 if (DstSize > MemSize)
1177 return std::make_pair(0, LLT::scalar(MemSize));
1178
1179 if (!isPowerOf2_32(DstSize)) {
1180 // We're probably decomposing an odd sized store. Try to split
1181 // to the widest type. TODO: Account for alignment. As-is it
1182 // should be OK, since the new parts will be further legalized.
1183 unsigned FloorSize = PowerOf2Floor(DstSize);
1184 return std::make_pair(0, LLT::scalar(FloorSize));
1185 }
1186
1187 if (DstSize > 32 && (DstSize % 32 != 0)) {
1188 // FIXME: Need a way to specify non-extload of larger size if
1189 // suitably aligned.
1190 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1191 }
1192
1193 unsigned MaxSize = maxSizeForAddrSpace(ST,
1194 PtrTy.getAddressSpace(),
1195 Op == G_LOAD);
1196 if (MemSize > MaxSize)
1197 return std::make_pair(0, LLT::scalar(MaxSize));
1198
1199 unsigned Align = Query.MMODescrs[0].AlignInBits;
1200 return std::make_pair(0, LLT::scalar(Align));
1201 })
1202 .fewerElementsIf(
1203 [=](const LegalityQuery &Query) -> bool {
1204 return Query.Types[0].isVector() &&
1205 needToSplitMemOp(Query, Op == G_LOAD);
1206 },
1207 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1208 const LLT DstTy = Query.Types[0];
1209 const LLT PtrTy = Query.Types[1];
1210
1211 LLT EltTy = DstTy.getElementType();
1212 unsigned MaxSize = maxSizeForAddrSpace(ST,
1213 PtrTy.getAddressSpace(),
1214 Op == G_LOAD);
1215
1216 // FIXME: Handle widened to power of 2 results better. This ends
1217 // up scalarizing.
1218 // FIXME: 3 element stores scalarized on SI
1219
1220 // Split if it's too large for the address space.
1221 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1222 if (MemSize > MaxSize) {
1223 unsigned NumElts = DstTy.getNumElements();
1224 unsigned EltSize = EltTy.getSizeInBits();
1225
1226 if (MaxSize % EltSize == 0) {
1227 return std::make_pair(
1228 0, LLT::scalarOrVector(
1229 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1230 }
1231
1232 unsigned NumPieces = MemSize / MaxSize;
1233
1234 // FIXME: Refine when odd breakdowns handled
1235 // The scalars will need to be re-legalized.
1236 if (NumPieces == 1 || NumPieces >= NumElts ||
1237 NumElts % NumPieces != 0)
1238 return std::make_pair(0, EltTy);
1239
1240 return std::make_pair(
1241 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1242 }
1243
1244 // FIXME: We could probably handle weird extending loads better.
1245 if (DstTy.getSizeInBits() > MemSize)
1246 return std::make_pair(0, EltTy);
1247
1248 unsigned EltSize = EltTy.getSizeInBits();
1249 unsigned DstSize = DstTy.getSizeInBits();
1250 if (!isPowerOf2_32(DstSize)) {
1251 // We're probably decomposing an odd sized store. Try to split
1252 // to the widest type. TODO: Account for alignment. As-is it
1253 // should be OK, since the new parts will be further legalized.
1254 unsigned FloorSize = PowerOf2Floor(DstSize);
1255 return std::make_pair(
1256 0, LLT::scalarOrVector(
1257 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1258 }
1259
1260 // Need to split because of alignment.
1261 unsigned Align = Query.MMODescrs[0].AlignInBits;
1262 if (EltSize > Align &&
1263 (EltSize / Align < DstTy.getNumElements())) {
1264 return std::make_pair(
1265 0, LLT::fixed_vector(EltSize / Align, EltTy));
1266 }
1267
1268 // May need relegalization for the scalars.
1269 return std::make_pair(0, EltTy);
1270 })
1271 .minScalar(0, S32)
1272 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1273 .widenScalarToNextPow2(0)
1274 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1275 .lower();
1276 }
1277
1278 // FIXME: Unaligned accesses not lowered.
1279 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1280 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1281 {S32, GlobalPtr, S16, 2 * 8},
1282 {S32, LocalPtr, S8, 8},
1283 {S32, LocalPtr, S16, 16},
1284 {S32, PrivatePtr, S8, 8},
1285 {S32, PrivatePtr, S16, 16},
1286 {S32, ConstantPtr, S8, 8},
1287 {S32, ConstantPtr, S16, 2 * 8}})
1288 .legalIf(
1289 [=](const LegalityQuery &Query) -> bool {
1290 return isLoadStoreLegal(ST, Query);
1291 });
1292
1293 if (ST.hasFlatAddressSpace()) {
1294 ExtLoads.legalForTypesWithMemDesc(
1295 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1296 }
1297
1298 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1299 // 64-bits.
1300 //
1301 // TODO: Should generalize bitcast action into coerce, which will also cover
1302 // inserting addrspacecasts.
1303 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1304
1305 ExtLoads.clampScalar(0, S32, S32)
1306 .widenScalarToNextPow2(0)
1307 .lower();
1308
1309 auto &Atomics = getActionDefinitionsBuilder(
1310 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1311 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1312 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1313 G_ATOMICRMW_UMIN})
1314 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1315 {S64, GlobalPtr}, {S64, LocalPtr},
1316 {S32, RegionPtr}, {S64, RegionPtr}});
1317 if (ST.hasFlatAddressSpace()) {
1318 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1319 }
1320
1321 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1322 if (ST.hasLDSFPAtomicAdd()) {
1323 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1324 if (ST.hasGFX90AInsts())
1325 Atomic.legalFor({{S64, LocalPtr}});
1326 }
1327 if (ST.hasAtomicFaddInsts())
1328 Atomic.legalFor({{S32, GlobalPtr}});
1329
1330 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1331 // demarshalling
1332 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1333 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1334 {S32, FlatPtr}, {S64, FlatPtr}})
1335 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1336 {S32, RegionPtr}, {S64, RegionPtr}});
1337 // TODO: Pointer types, any 32-bit or 64-bit vector
1338
1339 // Condition should be s32 for scalar, s1 for vector.
1340 getActionDefinitionsBuilder(G_SELECT)
1341 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1342 LocalPtr, FlatPtr, PrivatePtr,
1343 LLT::fixed_vector(2, LocalPtr),
1344 LLT::fixed_vector(2, PrivatePtr)},
1345 {S1, S32})
1346 .clampScalar(0, S16, S64)
1347 .scalarize(1)
1348 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1349 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1350 .clampMaxNumElements(0, S32, 2)
1351 .clampMaxNumElements(0, LocalPtr, 2)
1352 .clampMaxNumElements(0, PrivatePtr, 2)
1353 .scalarize(0)
1354 .widenScalarToNextPow2(0)
1355 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1356
1357 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1358 // be more flexible with the shift amount type.
1359 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1360 .legalFor({{S32, S32}, {S64, S32}});
1361 if (ST.has16BitInsts()) {
1362 if (ST.hasVOP3PInsts()) {
1363 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1364 .clampMaxNumElements(0, S16, 2);
1365 } else
1366 Shifts.legalFor({{S16, S16}});
1367
1368 // TODO: Support 16-bit shift amounts for all types
1369 Shifts.widenScalarIf(
1370 [=](const LegalityQuery &Query) {
1371 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1372 // 32-bit amount.
1373 const LLT ValTy = Query.Types[0];
1374 const LLT AmountTy = Query.Types[1];
1375 return ValTy.getSizeInBits() <= 16 &&
1376 AmountTy.getSizeInBits() < 16;
1377 }, changeTo(1, S16));
1378 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1379 Shifts.clampScalar(1, S32, S32);
1380 Shifts.clampScalar(0, S16, S64);
1381 Shifts.widenScalarToNextPow2(0, 16);
1382
1383 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1384 .minScalar(0, S16)
1385 .scalarize(0)
1386 .lower();
1387 } else {
1388 // Make sure we legalize the shift amount type first, as the general
1389 // expansion for the shifted type will produce much worse code if it hasn't
1390 // been truncated already.
1391 Shifts.clampScalar(1, S32, S32);
1392 Shifts.clampScalar(0, S32, S64);
1393 Shifts.widenScalarToNextPow2(0, 32);
1394
1395 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1396 .minScalar(0, S32)
1397 .scalarize(0)
1398 .lower();
1399 }
1400 Shifts.scalarize(0);
1401
1402 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1403 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1404 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1405 unsigned IdxTypeIdx = 2;
1406
1407 getActionDefinitionsBuilder(Op)
1408 .customIf([=](const LegalityQuery &Query) {
1409 const LLT EltTy = Query.Types[EltTypeIdx];
1410 const LLT VecTy = Query.Types[VecTypeIdx];
1411 const LLT IdxTy = Query.Types[IdxTypeIdx];
1412 const unsigned EltSize = EltTy.getSizeInBits();
1413 return (EltSize == 32 || EltSize == 64) &&
1414 VecTy.getSizeInBits() % 32 == 0 &&
1415 VecTy.getSizeInBits() <= MaxRegisterSize &&
1416 IdxTy.getSizeInBits() == 32;
1417 })
1418 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1419 bitcastToVectorElement32(VecTypeIdx))
1420 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1421 .bitcastIf(
1422 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1423 [=](const LegalityQuery &Query) {
1424 // For > 64-bit element types, try to turn this into a 64-bit
1425 // element vector since we may be able to do better indexing
1426 // if this is scalar. If not, fall back to 32.
1427 const LLT EltTy = Query.Types[EltTypeIdx];
1428 const LLT VecTy = Query.Types[VecTypeIdx];
1429 const unsigned DstEltSize = EltTy.getSizeInBits();
1430 const unsigned VecSize = VecTy.getSizeInBits();
1431
1432 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1433 return std::make_pair(
1434 VecTypeIdx,
1435 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1436 })
1437 .clampScalar(EltTypeIdx, S32, S64)
1438 .clampScalar(VecTypeIdx, S32, S64)
1439 .clampScalar(IdxTypeIdx, S32, S32)
1440 .clampMaxNumElements(VecTypeIdx, S32, 32)
1441 // TODO: Clamp elements for 64-bit vectors?
1442 // It should only be necessary with variable indexes.
1443 // As a last resort, lower to the stack
1444 .lower();
1445 }
1446
1447 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1448 .unsupportedIf([=](const LegalityQuery &Query) {
1449 const LLT &EltTy = Query.Types[1].getElementType();
1450 return Query.Types[0] != EltTy;
1451 });
1452
1453 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1454 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1455 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1456
1457 // FIXME: Doesn't handle extract of illegal sizes.
1458 getActionDefinitionsBuilder(Op)
1459 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1460 // FIXME: Multiples of 16 should not be legal.
1461 .legalIf([=](const LegalityQuery &Query) {
1462 const LLT BigTy = Query.Types[BigTyIdx];
1463 const LLT LitTy = Query.Types[LitTyIdx];
1464 return (BigTy.getSizeInBits() % 32 == 0) &&
1465 (LitTy.getSizeInBits() % 16 == 0);
1466 })
1467 .widenScalarIf(
1468 [=](const LegalityQuery &Query) {
1469 const LLT BigTy = Query.Types[BigTyIdx];
1470 return (BigTy.getScalarSizeInBits() < 16);
1471 },
1472 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1473 .widenScalarIf(
1474 [=](const LegalityQuery &Query) {
1475 const LLT LitTy = Query.Types[LitTyIdx];
1476 return (LitTy.getScalarSizeInBits() < 16);
1477 },
1478 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1479 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1480 .widenScalarToNextPow2(BigTyIdx, 32);
1481
1482 }
1483
1484 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1485 .legalForCartesianProduct(AllS32Vectors, {S32})
1486 .legalForCartesianProduct(AllS64Vectors, {S64})
1487 .clampNumElements(0, V16S32, V32S32)
1488 .clampNumElements(0, V2S64, V16S64)
1489 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1490
1491 if (ST.hasScalarPackInsts()) {
1492 BuildVector
1493 // FIXME: Should probably widen s1 vectors straight to s32
1494 .minScalarOrElt(0, S16)
1495 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1496 .minScalar(1, S32);
1497
1498 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1499 .legalFor({V2S16, S32})
1500 .lower();
1501 BuildVector.minScalarOrElt(0, S32);
1502 } else {
1503 BuildVector.customFor({V2S16, S16});
1504 BuildVector.minScalarOrElt(0, S32);
1505
1506 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1507 .customFor({V2S16, S32})
1508 .lower();
1509 }
1510
1511 BuildVector.legalIf(isRegisterType(0));
1512
1513 // FIXME: Clamp maximum size
1514 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1515 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1516 .clampMaxNumElements(0, S32, 32)
1517 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1518 .clampMaxNumElements(0, S16, 64);
1519
1520   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1521 // pre-legalize.
1522 if (ST.hasVOP3PInsts()) {
1523 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1524 .customFor({V2S16, V2S16})
1525 .lower();
1526 } else
1527 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1528
1529 // Merge/Unmerge
1530 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1531 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1532 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1533
1534 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1535 const LLT Ty = Query.Types[TypeIdx];
1536 if (Ty.isVector()) {
1537 const LLT &EltTy = Ty.getElementType();
1538 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1539 return true;
1540 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1541 return true;
1542 }
1543 return false;
1544 };
1545
1546 auto &Builder = getActionDefinitionsBuilder(Op)
1547 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1548 .lowerFor({{S16, V2S16}})
1549 .lowerIf([=](const LegalityQuery &Query) {
1550 const LLT BigTy = Query.Types[BigTyIdx];
1551 return BigTy.getSizeInBits() == 32;
1552 })
1553 // Try to widen to s16 first for small types.
1554 // TODO: Only do this on targets with legal s16 shifts
1555 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1556 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1557 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1558 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1559 elementTypeIs(1, S16)),
1560 changeTo(1, V2S16))
1561 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1562 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1563 // valid.
1564 .clampScalar(LitTyIdx, S32, S512)
1565 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1566 // Break up vectors with weird elements into scalars
1567 .fewerElementsIf(
1568 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1569 scalarize(0))
1570 .fewerElementsIf(
1571 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1572 scalarize(1))
1573 .clampScalar(BigTyIdx, S32, MaxScalar);
1574
1575 if (Op == G_MERGE_VALUES) {
1576 Builder.widenScalarIf(
1577 // TODO: Use 16-bit shifts if legal for 8-bit values?
1578 [=](const LegalityQuery &Query) {
1579 const LLT Ty = Query.Types[LitTyIdx];
1580 return Ty.getSizeInBits() < 32;
1581 },
1582 changeTo(LitTyIdx, S32));
1583 }
1584
1585 Builder.widenScalarIf(
1586 [=](const LegalityQuery &Query) {
1587 const LLT Ty = Query.Types[BigTyIdx];
1588 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1589 Ty.getSizeInBits() % 16 != 0;
1590 },
1591 [=](const LegalityQuery &Query) {
1592 // Pick the next power of 2, or a multiple of 64 over 128.
1593 // Whichever is smaller.
1594 const LLT &Ty = Query.Types[BigTyIdx];
1595 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1596 if (NewSizeInBits >= 256) {
1597 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1598 if (RoundedTo < NewSizeInBits)
1599 NewSizeInBits = RoundedTo;
1600 }
1601 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1602 })
1603 // Any vectors left are the wrong size. Scalarize them.
1604 .scalarize(0)
1605 .scalarize(1);
1606 }
1607
1608 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1609 // RegBankSelect.
1610 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1611 .legalFor({{S32}, {S64}});
1612
1613 if (ST.hasVOP3PInsts()) {
1614 SextInReg.lowerFor({{V2S16}})
1615 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1616 // get more vector shift opportunities, since we'll get those when
1617 // expanded.
1618 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1619 } else if (ST.has16BitInsts()) {
1620 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1621 } else {
1622 // Prefer to promote to s32 before lowering if we don't have 16-bit
1623 // shifts. This avoid a lot of intermediate truncate and extend operations.
1624 SextInReg.lowerFor({{S32}, {S64}});
1625 }
1626
1627 SextInReg
1628 .scalarize(0)
1629 .clampScalar(0, S32, S64)
1630 .lower();
1631
1632 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1633 .scalarize(0)
1634 .lower();
1635
1636 // TODO: Only Try to form v2s16 with legal packed instructions.
1637 getActionDefinitionsBuilder(G_FSHR)
1638 .legalFor({{S32, S32}})
1639 .lowerFor({{V2S16, V2S16}})
1640 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641 .scalarize(0)
1642 .lower();
1643
1644 if (ST.hasVOP3PInsts()) {
1645 getActionDefinitionsBuilder(G_FSHL)
1646 .lowerFor({{V2S16, V2S16}})
1647 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1648 .scalarize(0)
1649 .lower();
1650 } else {
1651 getActionDefinitionsBuilder(G_FSHL)
1652 .scalarize(0)
1653 .lower();
1654 }
1655
1656 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1657 .legalFor({S64});
1658
1659 getActionDefinitionsBuilder(G_FENCE)
1660 .alwaysLegal();
1661
1662 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1663 .scalarize(0)
1664 .minScalar(0, S32)
1665 .lower();
1666
1667 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1668 .legalFor({{S32, S32}, {S64, S32}})
1669 .clampScalar(1, S32, S32)
1670 .clampScalar(0, S32, S64)
1671 .widenScalarToNextPow2(0)
1672 .scalarize(0);
1673
1674 getActionDefinitionsBuilder({
1675 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1676 G_FCOPYSIGN,
1677
1678 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1679 G_ATOMICRMW_NAND,
1680 G_ATOMICRMW_FSUB,
1681 G_READ_REGISTER,
1682 G_WRITE_REGISTER,
1683
1684 G_SADDO, G_SSUBO,
1685
1686 // TODO: Implement
1687 G_FMINIMUM, G_FMAXIMUM}).lower();
1688
1689 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1690 .lower();
1691
1692 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1693 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1694 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1695 .unsupported();
1696
1697 getLegacyLegalizerInfo().computeTables();
1698 verify(*ST.getInstrInfo());
1699 }
1700
legalizeCustom(LegalizerHelper & Helper,MachineInstr & MI) const1701 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1702 MachineInstr &MI) const {
1703 MachineIRBuilder &B = Helper.MIRBuilder;
1704 MachineRegisterInfo &MRI = *B.getMRI();
1705
1706 switch (MI.getOpcode()) {
1707 case TargetOpcode::G_ADDRSPACE_CAST:
1708 return legalizeAddrSpaceCast(MI, MRI, B);
1709 case TargetOpcode::G_FRINT:
1710 return legalizeFrint(MI, MRI, B);
1711 case TargetOpcode::G_FCEIL:
1712 return legalizeFceil(MI, MRI, B);
1713 case TargetOpcode::G_FREM:
1714 return legalizeFrem(MI, MRI, B);
1715 case TargetOpcode::G_INTRINSIC_TRUNC:
1716 return legalizeIntrinsicTrunc(MI, MRI, B);
1717 case TargetOpcode::G_SITOFP:
1718 return legalizeITOFP(MI, MRI, B, true);
1719 case TargetOpcode::G_UITOFP:
1720 return legalizeITOFP(MI, MRI, B, false);
1721 case TargetOpcode::G_FPTOSI:
1722 return legalizeFPTOI(MI, MRI, B, true);
1723 case TargetOpcode::G_FPTOUI:
1724 return legalizeFPTOI(MI, MRI, B, false);
1725 case TargetOpcode::G_FMINNUM:
1726 case TargetOpcode::G_FMAXNUM:
1727 case TargetOpcode::G_FMINNUM_IEEE:
1728 case TargetOpcode::G_FMAXNUM_IEEE:
1729 return legalizeMinNumMaxNum(Helper, MI);
1730 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1731 return legalizeExtractVectorElt(MI, MRI, B);
1732 case TargetOpcode::G_INSERT_VECTOR_ELT:
1733 return legalizeInsertVectorElt(MI, MRI, B);
1734 case TargetOpcode::G_SHUFFLE_VECTOR:
1735 return legalizeShuffleVector(MI, MRI, B);
1736 case TargetOpcode::G_FSIN:
1737 case TargetOpcode::G_FCOS:
1738 return legalizeSinCos(MI, MRI, B);
1739 case TargetOpcode::G_GLOBAL_VALUE:
1740 return legalizeGlobalValue(MI, MRI, B);
1741 case TargetOpcode::G_LOAD:
1742 case TargetOpcode::G_SEXTLOAD:
1743 case TargetOpcode::G_ZEXTLOAD:
1744 return legalizeLoad(Helper, MI);
1745 case TargetOpcode::G_FMAD:
1746 return legalizeFMad(MI, MRI, B);
1747 case TargetOpcode::G_FDIV:
1748 return legalizeFDIV(MI, MRI, B);
1749 case TargetOpcode::G_UDIV:
1750 case TargetOpcode::G_UREM:
1751 case TargetOpcode::G_UDIVREM:
1752 return legalizeUnsignedDIV_REM(MI, MRI, B);
1753 case TargetOpcode::G_SDIV:
1754 case TargetOpcode::G_SREM:
1755 case TargetOpcode::G_SDIVREM:
1756 return legalizeSignedDIV_REM(MI, MRI, B);
1757 case TargetOpcode::G_ATOMIC_CMPXCHG:
1758 return legalizeAtomicCmpXChg(MI, MRI, B);
1759 case TargetOpcode::G_FLOG:
1760 return legalizeFlog(MI, B, numbers::ln2f);
1761 case TargetOpcode::G_FLOG10:
1762 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1763 case TargetOpcode::G_FEXP:
1764 return legalizeFExp(MI, B);
1765 case TargetOpcode::G_FPOW:
1766 return legalizeFPow(MI, B);
1767 case TargetOpcode::G_FFLOOR:
1768 return legalizeFFloor(MI, MRI, B);
1769 case TargetOpcode::G_BUILD_VECTOR:
1770 return legalizeBuildVector(MI, MRI, B);
1771 case TargetOpcode::G_CTLZ:
1772 case TargetOpcode::G_CTTZ:
1773 return legalizeCTLZ_CTTZ(MI, MRI, B);
1774 default:
1775 return false;
1776 }
1777
1778 llvm_unreachable("expected switch to return");
1779 }
1780
/// Materialize the 32-bit aperture base (the high half of the 64-bit flat
/// address) for the given segment address space, for use when expanding
/// addrspacecasts between flat and local/private pointers.
///
/// \param AS must be LOCAL_ADDRESS or PRIVATE_ADDRESS.
/// \return an s32 virtual register holding the aperture, or an invalid
///         Register if the required queue pointer argument is unavailable.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Fast path: subtargets with aperture registers can read the base
    // directly out of a hardware register with S_GETREG.
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id, bit offset, and field width (minus one) into the
    // immediate encoding S_GETREG_B32 expects.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // The getreg result holds the base in the low bits; shift it up by the
    // field width so the aperture occupies the high bits of the register.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // Slow path: load the aperture from the amd_queue_t structure reachable
  // through the kernel's queue pointer argument.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // The load is invariant: the apertures never change during execution.
  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
1839
// Lower G_ADDRSPACE_CAST. No-op casts become G_BITCAST; casts to/from the
// 32-bit constant address space truncate or extend the pointer; casts
// between flat and local/private insert a null check and splice the segment
// aperture into the high half.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    // Same bit pattern in both spaces; just reinterpret in place.
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // A flat null pointer must map to the segment null value, so select on
    // the comparison against flat null.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  // Remaining case: local/private -> flat.
  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  // The high half of the flat pointer is the segment's aperture base.
  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
1935
legalizeFrint(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1936 bool AMDGPULegalizerInfo::legalizeFrint(
1937 MachineInstr &MI, MachineRegisterInfo &MRI,
1938 MachineIRBuilder &B) const {
1939 Register Src = MI.getOperand(1).getReg();
1940 LLT Ty = MRI.getType(Src);
1941 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1942
1943 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1944 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1945
1946 auto C1 = B.buildFConstant(Ty, C1Val);
1947 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1948
1949 // TODO: Should this propagate fast-math-flags?
1950 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1951 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1952
1953 auto C2 = B.buildFConstant(Ty, C2Val);
1954 auto Fabs = B.buildFAbs(Ty, Src);
1955
1956 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1957 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1958 MI.eraseFromParent();
1959 return true;
1960 }
1961
legalizeFceil(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1962 bool AMDGPULegalizerInfo::legalizeFceil(
1963 MachineInstr &MI, MachineRegisterInfo &MRI,
1964 MachineIRBuilder &B) const {
1965
1966 const LLT S1 = LLT::scalar(1);
1967 const LLT S64 = LLT::scalar(64);
1968
1969 Register Src = MI.getOperand(1).getReg();
1970 assert(MRI.getType(Src) == S64);
1971
1972 // result = trunc(src)
1973 // if (src > 0.0 && src != result)
1974 // result += 1.0
1975
1976 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1977
1978 const auto Zero = B.buildFConstant(S64, 0.0);
1979 const auto One = B.buildFConstant(S64, 1.0);
1980 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1981 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1982 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1983 auto Add = B.buildSelect(S64, And, One, Zero);
1984
1985 // TODO: Should this propagate fast-math-flags?
1986 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1987 return true;
1988 }
1989
legalizeFrem(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1990 bool AMDGPULegalizerInfo::legalizeFrem(
1991 MachineInstr &MI, MachineRegisterInfo &MRI,
1992 MachineIRBuilder &B) const {
1993 Register DstReg = MI.getOperand(0).getReg();
1994 Register Src0Reg = MI.getOperand(1).getReg();
1995 Register Src1Reg = MI.getOperand(2).getReg();
1996 auto Flags = MI.getFlags();
1997 LLT Ty = MRI.getType(DstReg);
1998
1999 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2000 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2001 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2002 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2003 MI.eraseFromParent();
2004 return true;
2005 }
2006
extractF64Exponent(Register Hi,MachineIRBuilder & B)2007 static MachineInstrBuilder extractF64Exponent(Register Hi,
2008 MachineIRBuilder &B) {
2009 const unsigned FractBits = 52;
2010 const unsigned ExpBits = 11;
2011 LLT S32 = LLT::scalar(32);
2012
2013 auto Const0 = B.buildConstant(S32, FractBits - 32);
2014 auto Const1 = B.buildConstant(S32, ExpBits);
2015
2016 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2017 .addUse(Hi)
2018 .addUse(Const0.getReg(0))
2019 .addUse(Const1.getReg(0));
2020
2021 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2022 }
2023
// Lower s64 G_INTRINSIC_TRUNC (round toward zero) by clearing the
// fractional mantissa bits of the IEEE-754 representation with integer ops.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  // Shifting the fraction mask right by the exponent leaves set bits only
  // for the fractional mantissa positions; and-ing with the complement
  // clears the fraction while keeping sign, exponent, and integral bits.
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  // Exponent < 0: |src| < 1, so the truncated value is a signed zero.
  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  // Exponent > 51: no fractional bits exist, the value is already integral.
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
2068
// Lower G_SITOFP/G_UITOFP with an s64 source. For an s64 result, convert
// the halves separately and combine as (fp)hi * 2^32 + (fp)lo. For an s32
// result, left-shift the input so its significant bits fill the top word,
// convert that word, and scale back down with ldexp.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  // Split the 64-bit source into its 32-bit halves.
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    // Only the high half carries the sign; the low half is always unsigned.
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    // ldexp(hi, 32) scales the high half into position.
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
                     .addUse(CvtHi.getReg(0))
                     .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    // Choose a shift that normalizes the value while preserving the sign
    // bit: amdgcn_sffbh on the high word, capped by 32 + (-1 if the halves
    // have opposite signs, else 0).
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  // Normalize so the most significant bits land in the high word.
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  // OR any nonzero low bits into bit 0 of the high word, acting as a sticky
  // bit so the conversion rounds correctly.
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  // Undo the normalization shift in the floating-point domain.
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
      .addUse(FVal.getReg(0))
      .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}
2129
2130 // TODO: Copied from DAG implementation. Verify logic and document how this
2131 // actually works.
// Lower G_FPTOSI/G_FPTOUI producing an s64 result from an s32 or s64
// floating-point source, by splitting the value into two 32-bit pieces
// computed with floating-point arithmetic (see the inline derivation).
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //   tf := trunc(val);
  //  hif := floor(tf * 2^-32);
  //  lof := tf - hif * 2^32; // lof is always positive due to floor.
  //   hi := fptoi(hif);
  //   lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  // K0 = 2^-32, K1 = -2^32, in the source's precision.
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  // Fma computes lof = tf + hif * (-2^32) exactly.
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
2198
legalizeMinNumMaxNum(LegalizerHelper & Helper,MachineInstr & MI) const2199 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2200 MachineInstr &MI) const {
2201 MachineFunction &MF = Helper.MIRBuilder.getMF();
2202 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2203
2204 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2205 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2206
2207 // With ieee_mode disabled, the instructions have the correct behavior
2208 // already for G_FMINNUM/G_FMAXNUM
2209 if (!MFI->getMode().IEEE)
2210 return !IsIEEEOp;
2211
2212 if (IsIEEEOp)
2213 return true;
2214
2215 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2216 }
2217
legalizeExtractVectorElt(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2218 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2219 MachineInstr &MI, MachineRegisterInfo &MRI,
2220 MachineIRBuilder &B) const {
2221 // TODO: Should move some of this into LegalizerHelper.
2222
2223 // TODO: Promote dynamic indexing of s16 to s32
2224
2225 // FIXME: Artifact combiner probably should have replaced the truncated
2226 // constant before this, so we shouldn't need
2227 // getIConstantVRegValWithLookThrough.
2228 Optional<ValueAndVReg> MaybeIdxVal =
2229 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2230 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2231 return true;
2232 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2233
2234 Register Dst = MI.getOperand(0).getReg();
2235 Register Vec = MI.getOperand(1).getReg();
2236
2237 LLT VecTy = MRI.getType(Vec);
2238 LLT EltTy = VecTy.getElementType();
2239 assert(EltTy == MRI.getType(Dst));
2240
2241 if (IdxVal < VecTy.getNumElements())
2242 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2243 else
2244 B.buildUndef(Dst);
2245
2246 MI.eraseFromParent();
2247 return true;
2248 }
2249
legalizeInsertVectorElt(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2250 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2251 MachineInstr &MI, MachineRegisterInfo &MRI,
2252 MachineIRBuilder &B) const {
2253 // TODO: Should move some of this into LegalizerHelper.
2254
2255 // TODO: Promote dynamic indexing of s16 to s32
2256
2257 // FIXME: Artifact combiner probably should have replaced the truncated
2258 // constant before this, so we shouldn't need
2259 // getIConstantVRegValWithLookThrough.
2260 Optional<ValueAndVReg> MaybeIdxVal =
2261 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2262 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2263 return true;
2264
2265 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2266 Register Dst = MI.getOperand(0).getReg();
2267 Register Vec = MI.getOperand(1).getReg();
2268 Register Ins = MI.getOperand(2).getReg();
2269
2270 LLT VecTy = MRI.getType(Vec);
2271 LLT EltTy = VecTy.getElementType();
2272 assert(EltTy == MRI.getType(Ins));
2273
2274 if (IdxVal < VecTy.getNumElements())
2275 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2276 else
2277 B.buildUndef(Dst);
2278
2279 MI.eraseFromParent();
2280 return true;
2281 }
2282
legalizeShuffleVector(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2283 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2284 MachineInstr &MI, MachineRegisterInfo &MRI,
2285 MachineIRBuilder &B) const {
2286 const LLT V2S16 = LLT::fixed_vector(2, 16);
2287
2288 Register Dst = MI.getOperand(0).getReg();
2289 Register Src0 = MI.getOperand(1).getReg();
2290 LLT DstTy = MRI.getType(Dst);
2291 LLT SrcTy = MRI.getType(Src0);
2292
2293 if (SrcTy == V2S16 && DstTy == V2S16 &&
2294 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2295 return true;
2296
2297 MachineIRBuilder HelperBuilder(MI);
2298 GISelObserverWrapper DummyObserver;
2299 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2300 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2301 }
2302
legalizeSinCos(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2303 bool AMDGPULegalizerInfo::legalizeSinCos(
2304 MachineInstr &MI, MachineRegisterInfo &MRI,
2305 MachineIRBuilder &B) const {
2306
2307 Register DstReg = MI.getOperand(0).getReg();
2308 Register SrcReg = MI.getOperand(1).getReg();
2309 LLT Ty = MRI.getType(DstReg);
2310 unsigned Flags = MI.getFlags();
2311
2312 Register TrigVal;
2313 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2314 if (ST.hasTrigReducedRange()) {
2315 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2316 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2317 .addUse(MulVal.getReg(0))
2318 .setMIFlags(Flags).getReg(0);
2319 } else
2320 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2321
2322 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2323 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2324 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2325 .addUse(TrigVal)
2326 .setMIFlags(Flags);
2327 MI.eraseFromParent();
2328 return true;
2329 }
2330
// Materialize the address of \p GV into \p DstReg with a pc-relative
// SI_PC_ADD_REL_OFFSET sequence. \p GAFlags selects the relocation kind.
// A 32-bit result pointer is computed in a 64-bit scratch register and
// truncated. Always succeeds (returns true).
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // The pc-relative computation is always done in a 64-bit register; only
  // reuse DstReg directly when the result is itself 64-bit.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    // GAFlags + 1 is relied on to be the matching @hi relocation flag for
    // the @lo flag passed in.
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  // Truncate the 64-bit scratch result down to the 32-bit destination.
  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
2390
// Lower G_GLOBAL_VALUE. LDS/region globals become frame-relative constants
// (with diagnostics for unsupported uses); other globals are materialized
// either pc-relative (fixup/reloc) or through a GOT load.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS globals are only allocatable from (module) entry functions, with
    // the module-lds variable as the one special-case exception.
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        // The dynamic array starts right after the static allocations, i.e.
        // at the group static size.
        auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    // Statically allocated LDS: the address is a known constant offset.
    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // Directly resolvable at link time with a fixup.
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  // Reachable with a pc-relative relocation.
  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise go through the GOT: compute the GOT slot address pc-relative,
  // then load the actual address from it.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2492
widenToNextPowerOf2(LLT Ty)2493 static LLT widenToNextPowerOf2(LLT Ty) {
2494 if (Ty.isVector())
2495 return Ty.changeElementCount(
2496 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2497 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2498 }
2499
// Custom-legalize loads: rewrite 32-bit constant address space loads to use
// a 64-bit constant pointer, and widen non-power-of-2 G_LOADs when the
// alignment allows reading the extra bytes.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Replace the pointer operand with an addrspacecast to the 64-bit
    // constant space; the observer must bracket the in-place change.
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar: load wide, then truncate down to the requested size.
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2583
legalizeFMad(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2584 bool AMDGPULegalizerInfo::legalizeFMad(
2585 MachineInstr &MI, MachineRegisterInfo &MRI,
2586 MachineIRBuilder &B) const {
2587 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2588 assert(Ty.isScalar());
2589
2590 MachineFunction &MF = B.getMF();
2591 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2592
2593 // TODO: Always legal with future ftz flag.
2594 // FIXME: Do we need just output?
2595 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2596 return true;
2597 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2598 return true;
2599
2600 MachineIRBuilder HelperBuilder(MI);
2601 GISelObserverWrapper DummyObserver;
2602 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2603 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2604 }
2605
legalizeAtomicCmpXChg(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2606 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2607 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2608 Register DstReg = MI.getOperand(0).getReg();
2609 Register PtrReg = MI.getOperand(1).getReg();
2610 Register CmpVal = MI.getOperand(2).getReg();
2611 Register NewVal = MI.getOperand(3).getReg();
2612
2613 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2614 "this should not have been custom lowered");
2615
2616 LLT ValTy = MRI.getType(CmpVal);
2617 LLT VecTy = LLT::fixed_vector(2, ValTy);
2618
2619 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2620
2621 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2622 .addDef(DstReg)
2623 .addUse(PtrReg)
2624 .addUse(PackedVal)
2625 .setMemRefs(MI.memoperands());
2626
2627 MI.eraseFromParent();
2628 return true;
2629 }
2630
legalizeFlog(MachineInstr & MI,MachineIRBuilder & B,double Log2BaseInverted) const2631 bool AMDGPULegalizerInfo::legalizeFlog(
2632 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2633 Register Dst = MI.getOperand(0).getReg();
2634 Register Src = MI.getOperand(1).getReg();
2635 LLT Ty = B.getMRI()->getType(Dst);
2636 unsigned Flags = MI.getFlags();
2637
2638 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2639 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2640
2641 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2642 MI.eraseFromParent();
2643 return true;
2644 }
2645
legalizeFExp(MachineInstr & MI,MachineIRBuilder & B) const2646 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2647 MachineIRBuilder &B) const {
2648 Register Dst = MI.getOperand(0).getReg();
2649 Register Src = MI.getOperand(1).getReg();
2650 unsigned Flags = MI.getFlags();
2651 LLT Ty = B.getMRI()->getType(Dst);
2652
2653 auto K = B.buildFConstant(Ty, numbers::log2e);
2654 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2655 B.buildFExp2(Dst, Mul, Flags);
2656 MI.eraseFromParent();
2657 return true;
2658 }
2659
legalizeFPow(MachineInstr & MI,MachineIRBuilder & B) const2660 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2661 MachineIRBuilder &B) const {
2662 Register Dst = MI.getOperand(0).getReg();
2663 Register Src0 = MI.getOperand(1).getReg();
2664 Register Src1 = MI.getOperand(2).getReg();
2665 unsigned Flags = MI.getFlags();
2666 LLT Ty = B.getMRI()->getType(Dst);
2667 const LLT S16 = LLT::scalar(16);
2668 const LLT S32 = LLT::scalar(32);
2669
2670 if (Ty == S32) {
2671 auto Log = B.buildFLog2(S32, Src0, Flags);
2672 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2673 .addUse(Log.getReg(0))
2674 .addUse(Src1)
2675 .setMIFlags(Flags);
2676 B.buildFExp2(Dst, Mul, Flags);
2677 } else if (Ty == S16) {
2678 // There's no f16 fmul_legacy, so we need to convert for it.
2679 auto Log = B.buildFLog2(S16, Src0, Flags);
2680 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2681 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2682 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2683 .addUse(Ext0.getReg(0))
2684 .addUse(Ext1.getReg(0))
2685 .setMIFlags(Flags);
2686
2687 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2688 } else
2689 return false;
2690
2691 MI.eraseFromParent();
2692 return true;
2693 }
2694
2695 // Find a source register, ignoring any possible source modifiers.
stripAnySourceMods(Register OrigSrc,MachineRegisterInfo & MRI)2696 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2697 Register ModSrc = OrigSrc;
2698 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2699 ModSrc = SrcFNeg->getOperand(1).getReg();
2700 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2701 ModSrc = SrcFAbs->getOperand(1).getReg();
2702 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2703 ModSrc = SrcFAbs->getOperand(1).getReg();
2704 return ModSrc;
2705 }
2706
legalizeFFloor(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2707 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2708 MachineRegisterInfo &MRI,
2709 MachineIRBuilder &B) const {
2710
2711 const LLT S1 = LLT::scalar(1);
2712 const LLT S64 = LLT::scalar(64);
2713 Register Dst = MI.getOperand(0).getReg();
2714 Register OrigSrc = MI.getOperand(1).getReg();
2715 unsigned Flags = MI.getFlags();
2716 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2717 "this should not have been custom lowered");
2718
2719 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2720 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2721 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2722 // V_FRACT bug is:
2723 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2724 //
2725 // Convert floor(x) to (x - fract(x))
2726
2727 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2728 .addUse(OrigSrc)
2729 .setMIFlags(Flags);
2730
2731 // Give source modifier matching some assistance before obscuring a foldable
2732 // pattern.
2733
2734 // TODO: We can avoid the neg on the fract? The input sign to fract
2735 // shouldn't matter?
2736 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2737
2738 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2739
2740 Register Min = MRI.createGenericVirtualRegister(S64);
2741
2742 // We don't need to concern ourselves with the snan handling difference, so
2743 // use the one which will directly select.
2744 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2745 if (MFI->getMode().IEEE)
2746 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2747 else
2748 B.buildFMinNum(Min, Fract, Const, Flags);
2749
2750 Register CorrectedFract = Min;
2751 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2752 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2753 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2754 }
2755
2756 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2757 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2758
2759 MI.eraseFromParent();
2760 return true;
2761 }
2762
2763 // Turn an illegal packed v2s16 build vector into bit operations.
2764 // TODO: This should probably be a bitcast action in LegalizerHelper.
legalizeBuildVector(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2765 bool AMDGPULegalizerInfo::legalizeBuildVector(
2766 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2767 Register Dst = MI.getOperand(0).getReg();
2768 const LLT S32 = LLT::scalar(32);
2769 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2770
2771 Register Src0 = MI.getOperand(1).getReg();
2772 Register Src1 = MI.getOperand(2).getReg();
2773 assert(MRI.getType(Src0) == LLT::scalar(16));
2774
2775 auto Merge = B.buildMerge(S32, {Src0, Src1});
2776 B.buildBitcast(Dst, Merge);
2777
2778 MI.eraseFromParent();
2779 return true;
2780 }
2781
2782 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2783 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2784 // case with a single min instruction instead of a compare+select.
legalizeCTLZ_CTTZ(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2785 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2786 MachineRegisterInfo &MRI,
2787 MachineIRBuilder &B) const {
2788 Register Dst = MI.getOperand(0).getReg();
2789 Register Src = MI.getOperand(1).getReg();
2790 LLT DstTy = MRI.getType(Dst);
2791 LLT SrcTy = MRI.getType(Src);
2792
2793 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2794 ? AMDGPU::G_AMDGPU_FFBH_U32
2795 : AMDGPU::G_AMDGPU_FFBL_B32;
2796 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2797 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2798
2799 MI.eraseFromParent();
2800 return true;
2801 }
2802
2803 // Check that this is a G_XOR x, -1
isNot(const MachineRegisterInfo & MRI,const MachineInstr & MI)2804 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2805 if (MI.getOpcode() != TargetOpcode::G_XOR)
2806 return false;
2807 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2808 return ConstVal && *ConstVal == -1;
2809 }
2810
// Return the use branch instruction, otherwise null if the usage is invalid.
//
// The control-flow intrinsic's result must have exactly one non-debug use,
// which (possibly through a single "not") must be a G_BRCOND in the same
// block. On success:
//   \p Br             - the unconditional G_BR following the G_BRCOND, if one
//                       exists (otherwise left untouched)
//   \p UncondBrTarget - the block reached when the conditional branch is not
//                       taken: the G_BR's target, or the layout successor
//   \p Negated        - set if the condition was inverted through a
//                       G_XOR x, -1; that xor is erased here.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  // The condition must feed exactly one non-debug user.
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  // Look through a single negation of the condition.
  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(*UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  // The (possibly negated) condition must be branched on in the same block
  // as the intrinsic.
  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    // No trailing G_BR: fall through to the layout successor.
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}
2853
// Copy the preloaded input described by \p Arg (held in a physical register
// on function entry) into the virtual register \p DstReg, extracting the
// relevant bitfield first if the input is packed with other values.
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  // Get (or create) the virtual register carrying the live-in value.
  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
                                             ArgTy);
  if (Arg->isMasked()) {
    // The value is a bitfield inside the register: shift it down to bit 0
    // and mask off the neighboring fields.
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}
2884
loadInputValue(Register DstReg,MachineIRBuilder & B,AMDGPUFunctionArgInfo::PreloadedValue ArgType) const2885 bool AMDGPULegalizerInfo::loadInputValue(
2886 Register DstReg, MachineIRBuilder &B,
2887 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2888 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2889 const ArgDescriptor *Arg;
2890 const TargetRegisterClass *ArgRC;
2891 LLT ArgTy;
2892 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2893
2894 if (!Arg) {
2895 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2896 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2897 // case the pointer argument may be missing and we use null.
2898 B.buildConstant(DstReg, 0);
2899 return true;
2900 }
2901
2902 // It's undefined behavior if a function marked with the amdgpu-no-*
2903 // attributes uses the corresponding intrinsic.
2904 B.buildUndef(DstReg);
2905 return true;
2906 }
2907
2908 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2909 return false; // TODO: Handle these
2910 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2911 }
2912
legalizePreloadedArgIntrin(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,AMDGPUFunctionArgInfo::PreloadedValue ArgType) const2913 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2914 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2915 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2916 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2917 return false;
2918
2919 MI.eraseFromParent();
2920 return true;
2921 }
2922
legalizeFDIV(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2923 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2924 MachineRegisterInfo &MRI,
2925 MachineIRBuilder &B) const {
2926 Register Dst = MI.getOperand(0).getReg();
2927 LLT DstTy = MRI.getType(Dst);
2928 LLT S16 = LLT::scalar(16);
2929 LLT S32 = LLT::scalar(32);
2930 LLT S64 = LLT::scalar(64);
2931
2932 if (DstTy == S16)
2933 return legalizeFDIV16(MI, MRI, B);
2934 if (DstTy == S32)
2935 return legalizeFDIV32(MI, MRI, B);
2936 if (DstTy == S64)
2937 return legalizeFDIV64(MI, MRI, B);
2938
2939 return false;
2940 }
2941
// Expand 32-bit unsigned division/remainder around the hardware's approximate
// float reciprocal (G_AMDGPU_RCP_IFLAG). Either \p DstDivReg or \p DstRemReg
// may be an invalid register when only one of the results is needed.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y). The float reciprocal is scaled by
  // 0x4f7ffffe (a single-precision value just below 2**32) to produce a
  // fixed-point estimate z ~= 2**32 / y.
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement: while r >= y, bump q and reduce r.
  // The quotient select is skipped entirely when only the remainder is used.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
2984
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  // Split the 64-bit input into 32-bit halves.
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  // Recombine as a single float: hi * 2**32 + lo.
  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  // 0x5f7ffffc is a single-precision value just below 2**64, scaling the
  // reciprocal into fixed point.
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  // Convert the high (Trunc) and low (Mad2, the remainder after removing the
  // high part) words back to integers.
  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
3027
// Expand 64-bit unsigned division/remainder. The fixed-point reciprocal of
// the denominator from emitReciprocalU64 is refined in two correction steps
// computed on 32-bit halves with explicit carry chains, and the resulting
// quotient estimate is corrected at most twice. Either \p DstDivReg or
// \p DstRemReg may be an invalid register if that result is not needed.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement: rcp += mulhi(rcp, rcp * -denom), done on halves.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); // high word, no carry-in
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement using the updated reciprocal estimate.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate q = mulhi(numer, rcp); remainder r = numer - q * denom.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); // high word, no borrow-in
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // 64-bit unsigned compare (r >= denom) built from 32-bit pieces:
  // C3 != 0 iff {Sub1_Hi,Sub1_Lo} >= {DenomHi,DenomLo}.
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  // First correction: r -= denom, q += 1.
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 != 0 iff the once-corrected remainder is still >= denom.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  // Second correction: r -= denom, q += 1 again.
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Pick the right quotient/remainder based on how many corrections applied.
  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}
3142
legalizeUnsignedDIV_REM(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const3143 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3144 MachineRegisterInfo &MRI,
3145 MachineIRBuilder &B) const {
3146 Register DstDivReg, DstRemReg;
3147 switch (MI.getOpcode()) {
3148 default:
3149 llvm_unreachable("Unexpected opcode!");
3150 case AMDGPU::G_UDIV: {
3151 DstDivReg = MI.getOperand(0).getReg();
3152 break;
3153 }
3154 case AMDGPU::G_UREM: {
3155 DstRemReg = MI.getOperand(0).getReg();
3156 break;
3157 }
3158 case AMDGPU::G_UDIVREM: {
3159 DstDivReg = MI.getOperand(0).getReg();
3160 DstRemReg = MI.getOperand(1).getReg();
3161 break;
3162 }
3163 }
3164
3165 const LLT S64 = LLT::scalar(64);
3166 const LLT S32 = LLT::scalar(32);
3167 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3168 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3169 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3170 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3171
3172 if (Ty == S32)
3173 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3174 else if (Ty == S64)
3175 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3176 else
3177 return false;
3178
3179 MI.eraseFromParent();
3180 return true;
3181 }
3182
// Expand G_SDIV/G_SREM/G_SDIVREM by taking absolute values of the operands,
// running the unsigned expansion, and fixing up the result signs afterwards.
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  // Arithmetic shift by (width - 1) broadcasts the sign bit: 0 for
  // non-negative values, -1 for negative ones.
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  // Branchless abs: |x| = (x + sign) ^ sign.
  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  // Allocate temporaries only for the unsigned results we actually need.
  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  // Conditionally negate each result with (x ^ sign) - sign. The quotient is
  // negative iff the operand signs differ.
  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}
3250
legalizeFastUnsafeFDIV(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const3251 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3252 MachineRegisterInfo &MRI,
3253 MachineIRBuilder &B) const {
3254 Register Res = MI.getOperand(0).getReg();
3255 Register LHS = MI.getOperand(1).getReg();
3256 Register RHS = MI.getOperand(2).getReg();
3257 uint16_t Flags = MI.getFlags();
3258 LLT ResTy = MRI.getType(Res);
3259
3260 const MachineFunction &MF = B.getMF();
3261 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3262 MI.getFlag(MachineInstr::FmAfn);
3263
3264 if (!AllowInaccurateRcp)
3265 return false;
3266
3267 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3268 // 1 / x -> RCP(x)
3269 if (CLHS->isExactlyValue(1.0)) {
3270 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3271 .addUse(RHS)
3272 .setMIFlags(Flags);
3273
3274 MI.eraseFromParent();
3275 return true;
3276 }
3277
3278 // -1 / x -> RCP( FNEG(x) )
3279 if (CLHS->isExactlyValue(-1.0)) {
3280 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3281 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3282 .addUse(FNeg.getReg(0))
3283 .setMIFlags(Flags);
3284
3285 MI.eraseFromParent();
3286 return true;
3287 }
3288 }
3289
3290 // x / y -> x * (1.0 / y)
3291 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3292 .addUse(RHS)
3293 .setMIFlags(Flags);
3294 B.buildFMul(Res, LHS, RCP, Flags);
3295
3296 MI.eraseFromParent();
3297 return true;
3298 }
3299
// Fast lowering of 64-bit fdiv when inaccurate reciprocals are allowed:
// approximate 1/y with amdgcn.rcp, refine with two FMA correction steps,
// then form x * (1/y) and apply one residual correction to the product.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  // Only valid when unsafe math was requested globally or per-instruction.
  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
               .addUse(Y)
               .setMIFlags(Flags);

  // Each step computes the residual e = 1 - y*r and updates r += r*e.
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  // ret = x*r, then ret += (x - y*ret) * r to correct the remaining error.
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}
3336
// Lower 16-bit fdiv: compute an approximate quotient at f32 precision
// (extend, rcp, multiply), truncate back to f16, and produce the final
// result with amdgcn.div.fixup from the estimate and the original operands.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap reciprocal expansion when unsafe math permits it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                 .addUse(RHSExt.getReg(0))
                 .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(RDst.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3371
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // S_DENORM_MODE takes the SP field in the low 2 bits and the DP field in
    // the next 2 bits.
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Without S_DENORM_MODE, write only the FP32 denorm bits of the MODE
    // hardware register via S_SETREG (offset 4, width 2: WIDTH_M1 == 1).
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
3401
// Full-precision f32 fdiv lowering built from the div_scale / div_fmas /
// div_fixup intrinsic sequence. The FMA refinement chain in the middle is
// bracketed by FP32 denorm-mode toggles when the function's default mode
// flushes FP32 denormals.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap reciprocal expansion when unsafe math permits it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale with immediate 0 / 1 selects which operand's scaled form is
  // produced; the second result is the condition fed to div_fmas below.
  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Refine the reciprocal and form the quotient estimate with FMAs.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // Restore the default (flushing) mode if we had to enable denormals.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3470
legalizeFDIV64(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const3471 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3472 MachineRegisterInfo &MRI,
3473 MachineIRBuilder &B) const {
3474 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3475 return true;
3476
3477 Register Res = MI.getOperand(0).getReg();
3478 Register LHS = MI.getOperand(1).getReg();
3479 Register RHS = MI.getOperand(2).getReg();
3480
3481 uint16_t Flags = MI.getFlags();
3482
3483 LLT S64 = LLT::scalar(64);
3484 LLT S1 = LLT::scalar(1);
3485
3486 auto One = B.buildFConstant(S64, 1.0);
3487
3488 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3489 .addUse(LHS)
3490 .addUse(RHS)
3491 .addImm(0)
3492 .setMIFlags(Flags);
3493
3494 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3495
3496 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3497 .addUse(DivScale0.getReg(0))
3498 .setMIFlags(Flags);
3499
3500 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3501 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3502 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3503
3504 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3505 .addUse(LHS)
3506 .addUse(RHS)
3507 .addImm(1)
3508 .setMIFlags(Flags);
3509
3510 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3511 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3512 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3513
3514 Register Scale;
3515 if (!ST.hasUsableDivScaleConditionOutput()) {
3516 // Workaround a hardware bug on SI where the condition output from div_scale
3517 // is not usable.
3518
3519 LLT S32 = LLT::scalar(32);
3520
3521 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3522 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3523 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3524 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3525
3526 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3527 Scale1Unmerge.getReg(1));
3528 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3529 Scale0Unmerge.getReg(1));
3530 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3531 } else {
3532 Scale = DivScale1.getReg(1);
3533 }
3534
3535 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3536 .addUse(Fma4.getReg(0))
3537 .addUse(Fma3.getReg(0))
3538 .addUse(Mul.getReg(0))
3539 .addUse(Scale)
3540 .setMIFlags(Flags);
3541
3542 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3543 .addUse(Fmas.getReg(0))
3544 .addUse(RHS)
3545 .addUse(LHS)
3546 .setMIFlags(Flags);
3547
3548 MI.eraseFromParent();
3549 return true;
3550 }
3551
legalizeFDIVFastIntrin(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const3552 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3553 MachineRegisterInfo &MRI,
3554 MachineIRBuilder &B) const {
3555 Register Res = MI.getOperand(0).getReg();
3556 Register LHS = MI.getOperand(2).getReg();
3557 Register RHS = MI.getOperand(3).getReg();
3558 uint16_t Flags = MI.getFlags();
3559
3560 LLT S32 = LLT::scalar(32);
3561 LLT S1 = LLT::scalar(1);
3562
3563 auto Abs = B.buildFAbs(S32, RHS, Flags);
3564 const APFloat C0Val(1.0f);
3565
3566 auto C0 = B.buildConstant(S32, 0x6f800000);
3567 auto C1 = B.buildConstant(S32, 0x2f800000);
3568 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3569
3570 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3571 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3572
3573 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3574
3575 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3576 .addUse(Mul0.getReg(0))
3577 .setMIFlags(Flags);
3578
3579 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3580
3581 B.buildFMul(Res, Sel, Mul1, Flags);
3582
3583 MI.eraseFromParent();
3584 return true;
3585 }
3586
3587 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3588 // FIXME: Why do we handle this one but not other removed instructions?
3589 //
3590 // Reciprocal square root. The clamp prevents infinite results, clamping
3591 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3592 // +-max_float.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  // Subtargets before VI still have the instruction; leave the intrinsic
  // alone for them.
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  // Only f32 and f64 are handled; anything else fails legalization.
  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  // Plain rsq, followed by an explicit clamp to [-max_float, +max_float].
  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
    .addUse(Src)
    .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq quieted (or not) so use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  // Clamp the upper bound first, then the lower bound, picking the IEEE or
  // non-IEEE min/max variant to match the function's FP mode.
  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
3635
getDSFPAtomicOpcode(Intrinsic::ID IID)3636 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3637 switch (IID) {
3638 case Intrinsic::amdgcn_ds_fadd:
3639 return AMDGPU::G_ATOMICRMW_FADD;
3640 case Intrinsic::amdgcn_ds_fmin:
3641 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3642 case Intrinsic::amdgcn_ds_fmax:
3643 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3644 default:
3645 llvm_unreachable("not a DS FP intrinsic");
3646 }
3647 }
3648
// Rewrite a DS FP atomic intrinsic call in place into its generic atomic
// opcode, dropping the operands the instruction form no longer needs.
bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  // Remove from highest index down so earlier indices stay valid.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
  Observer.changedInstr(MI);
  return true;
}
3666
// Materialize the implicit argument pointer into \p DstReg as
// kernarg-segment-pointer + offset. Returns false if the kernarg segment
// pointer input cannot be loaded.
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Byte offset of the first implicit parameter past the explicit kernargs.
  uint64_t Offset =
      ST.getTargetLowering()->getImplicitParameterOffset(
          B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  // Offset type must be a scalar of the same width as the pointer.
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}
3685
legalizeImplicitArgPtr(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const3686 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3687 MachineRegisterInfo &MRI,
3688 MachineIRBuilder &B) const {
3689 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3690 if (!MFI->isEntryFunction()) {
3691 return legalizePreloadedArgIntrin(MI, MRI, B,
3692 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3693 }
3694
3695 Register DstReg = MI.getOperand(0).getReg();
3696 if (!getImplicitArgPtr(DstReg, MRI, B))
3697 return false;
3698
3699 MI.eraseFromParent();
3700 return true;
3701 }
3702
legalizeIsAddrSpace(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,unsigned AddrSpace) const3703 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3704 MachineRegisterInfo &MRI,
3705 MachineIRBuilder &B,
3706 unsigned AddrSpace) const {
3707 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3708 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3709 Register Hi32 = Unmerge.getReg(1);
3710
3711 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3712 MI.eraseFromParent();
3713 return true;
3714 }
3715
3716 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3717 // offset (the offset that is included in bounds checking and swizzling, to be
3718 // split between the instruction's voffset and immoffset fields) and soffset
3719 // (the offset that is excluded from bounds checking and swizzling, to go in
3720 // the instruction's soffset field). This function takes the first kind of
3721 // offset and figures out how to split it between voffset and immoffset.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value the 12-bit unsigned immediate offset field can hold.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // Peel any known-constant addend off OrigOffset.
  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold the overflow portion into the variable offset register.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // Always return a valid voffset register, even if it is just zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_pair(BaseReg, ImmOffset);
}
3766
3767 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
updateBufferMMO(MachineMemOperand * MMO,Register VOffset,Register SOffset,unsigned ImmOffset,Register VIndex,MachineRegisterInfo & MRI) const3768 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3769 Register VOffset, Register SOffset,
3770 unsigned ImmOffset, Register VIndex,
3771 MachineRegisterInfo &MRI) const {
3772 Optional<ValueAndVReg> MaybeVOffsetVal =
3773 getIConstantVRegValWithLookThrough(VOffset, MRI);
3774 Optional<ValueAndVReg> MaybeSOffsetVal =
3775 getIConstantVRegValWithLookThrough(SOffset, MRI);
3776 Optional<ValueAndVReg> MaybeVIndexVal =
3777 getIConstantVRegValWithLookThrough(VIndex, MRI);
3778 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3779 // update the MMO with that offset. The stride is unknown so we can only do
3780 // this if VIndex is constant 0.
3781 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3782 MaybeVIndexVal->Value == 0) {
3783 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3784 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3785 MMO->setOffset(TotalOffset);
3786 } else {
3787 // We don't have a constant combined offset to use in the MMO. Give up.
3788 MMO->setValue((Value *)nullptr);
3789 }
3790 }
3791
3792 /// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    // Unpacked layout: each s16 element is widened into its own 32-bit
    // register.
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // Work around the image-store d16 bug by presenting the packed data as a
    // dword vector, padded with undef where necessary.
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      // Pad v3s16 to v6s16 and reinterpret it as v3s32.
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      // View v4s16 as v2s32, then pad with undef dwords up to v4s32.
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  // Packed layout with no bug workaround needed: data is already correct.
  return Reg;
}
3851
fixStoreSourceType(MachineIRBuilder & B,Register VData,bool IsFormat) const3852 Register AMDGPULegalizerInfo::fixStoreSourceType(
3853 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3854 MachineRegisterInfo *MRI = B.getMRI();
3855 LLT Ty = MRI->getType(VData);
3856
3857 const LLT S16 = LLT::scalar(16);
3858
3859 // Fixup illegal register types for i8 stores.
3860 if (Ty == LLT::scalar(8) || Ty == S16) {
3861 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3862 return AnyExt;
3863 }
3864
3865 if (Ty.isVector()) {
3866 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3867 if (IsFormat)
3868 return handleD16VData(B, *MRI, VData);
3869 }
3870 }
3871
3872 return VData;
3873 }
3874
// Lower a raw/struct buffer store intrinsic into the corresponding target
// store pseudo, normalizing the operand list (vindex, voffset/immoffset
// split, optional format immediate).
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  // Widen i8/i16 sources and repack small f16 vectors as needed.
  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants carry no vindex; substitute a constant zero.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Split the offset between the voffset register and the immediate field.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    // Plain stores select the opcode by access size in bytes.
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3961
// Lower a raw/struct buffer load intrinsic into the corresponding target
// load pseudo, normalizing operands and fixing up sub-dword / unpacked-d16
// result registers afterwards.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants carry no vindex; substitute a constant zero.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the offset between the voffset register and the immediate field.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    // Plain loads select the opcode by memory access size in bits.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-32-bit results (and scalar d16) are produced in a wider temporary
  // register and truncated/repacked into Dst afterwards.
  bool IsExtLoad =
      (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // Insert the fixup code after the load instruction just built.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // Widen result for extending loads was widened.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
4079
legalizeAtomicIncDec(MachineInstr & MI,MachineIRBuilder & B,bool IsInc) const4080 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4081 MachineIRBuilder &B,
4082 bool IsInc) const {
4083 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4084 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4085 B.buildInstr(Opc)
4086 .addDef(MI.getOperand(0).getReg())
4087 .addUse(MI.getOperand(2).getReg())
4088 .addUse(MI.getOperand(3).getReg())
4089 .cloneMemRefs(MI);
4090 MI.eraseFromParent();
4091 return true;
4092 }
4093
// Map a raw/struct buffer atomic intrinsic ID to its generic buffer-atomic
// pseudo opcode. Raw and struct variants of the same operation share a
// pseudo; the operand layout is normalized by the caller.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
4149
// Lower a raw/struct buffer atomic intrinsic into the corresponding generic
// buffer-atomic pseudo, normalizing the operand list. OpOffset tracks the
// cumulative shift in operand indices caused by the optional def, cmpswap
// compare value, and struct vindex operand.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    // No def operand: every later operand index shifts down by one.
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants carry no vindex; substitute a constant zero.
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the offset between the voffset register and the immediate field.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
4220
4221 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4222 /// vector with s16 typed elements.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  // Walk the vaddr operand range, emitting one v2s16 per iteration (two
  // iterations' worth when a pair of s16 operands is packed together).
  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      // Handle any gradient or coordinate operands that should not be packed
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this operand together with the next one into a single v2s16,
        // and skip the next operand.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
4269
4270 /// Convert from separate vaddr components to a single vector address register,
4271 /// and replace the remaining operands with $noreg.
convertImageAddrToPacked(MachineIRBuilder & B,MachineInstr & MI,int DimIdx,int NumVAddrs)4272 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4273 int DimIdx, int NumVAddrs) {
4274 const LLT S32 = LLT::scalar(32);
4275
4276 SmallVector<Register, 8> AddrRegs;
4277 for (int I = 0; I != NumVAddrs; ++I) {
4278 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4279 if (SrcOp.isReg()) {
4280 AddrRegs.push_back(SrcOp.getReg());
4281 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4282 }
4283 }
4284
4285 int NumAddrRegs = AddrRegs.size();
4286 if (NumAddrRegs != 1) {
4287 // Above 8 elements round up to next power of 2 (i.e. 16).
4288 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4289 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4290 auto Undef = B.buildUndef(S32);
4291 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4292 NumAddrRegs = RoundedNumRegs;
4293 }
4294
4295 auto VAddr =
4296 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4297 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4298 }
4299
4300 for (int I = 1; I != NumVAddrs; ++I) {
4301 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4302 if (SrcOp.isReg())
4303 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4304 }
4305 }
4306
4307 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4308 ///
4309 /// Depending on the subtarget, load/store with 16-bit element data need to be
4310 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4311 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4312 /// registers.
4313 ///
4314 /// We don't want to directly select image instructions just yet, but also want
4315 /// to exposes all register repacking to the legalizer/combiners. We also don't
4316 /// want a selected instrution entering RegBankSelect. In order to avoid
4317 /// defining a multitude of intermediate image instructions, directly hack on
4318 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4319 /// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  // A second explicit def is only present when a TFE status result was
  // requested.
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  // Number of result components selected by the dmask (atomics carry no
  // dmask operand).
  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      // Gather4 always produces 4 components regardless of the dmask value.
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  // Notify the observer of the change on every exit path from here on.
  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      // Clear the now-unused second data operand.
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  // Address operand count, adjusted below as LOD/MIP operands are folded
  // away. Used for the NSA decision in the non-16-bit branch.
  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
    const ConstantFP *ConstantLod;

    if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
                 m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                      Intr->Dim);

        // The starting indexes should remain in the same place.
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs())
            .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
        MI.RemoveOperand(ArgOffset + Intr->LodIndex);
        Intr = NewImageDimIntr;
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
    int64_t ConstantLod;
    if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
                 m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      // Helper fills PackedRegs with the repacked 32-bit address components.
      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        // Without NSA, collapse all address dwords into a single vector
        // register.
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      // Rewrite the address operands in place; operands beyond the packed
      // count become $noreg.
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  // Append the a16/g16 flags as a trailing immediate (bit 0 = a16,
  // bit 1 = g16).
  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    // Only 16-bit element store data needs repacking.
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    // Unpacked D16: each 16-bit element occupies the low half of a dword.
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    // Round the total data size up to a whole number of dwords.
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need
    // one cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  // Append NumElts copies of a single undef value of type Ty to ResultRegs.
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    // Build a v6s16 and split off the low v3s16 half as the result.
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
4692
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // Rewrite the destination through an equivalently-sized bitcast-compatible
  // register type when required for this subtarget.
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    // Restore the insert point to MI (bitcastDst may have moved it).
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}
4742
4743 // TODO: Move to selection
legalizeTrapIntrinsic(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4744 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4745 MachineRegisterInfo &MRI,
4746 MachineIRBuilder &B) const {
4747 if (!ST.isTrapHandlerEnabled() ||
4748 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4749 return legalizeTrapEndpgm(MI, MRI, B);
4750
4751 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4752 switch (*HsaAbiVer) {
4753 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4754 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4755 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4756 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4757 return ST.supportsGetDoorbellID() ?
4758 legalizeTrapHsa(MI, MRI, B) :
4759 legalizeTrapHsaQueuePtr(MI, MRI, B);
4760 }
4761 }
4762
4763 llvm_unreachable("Unknown trap handler");
4764 }
4765
legalizeTrapEndpgm(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4766 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4767 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4768 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4769 MI.eraseFromParent();
4770 return true;
4771 }
4772
legalizeTrapHsaQueuePtr(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4773 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4774 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4775 // Pass queue pointer to trap handler as input, and insert trap instruction
4776 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4777 Register LiveIn =
4778 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4779 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4780 return false;
4781
4782 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4783 B.buildCopy(SGPR01, LiveIn);
4784 B.buildInstr(AMDGPU::S_TRAP)
4785 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4786 .addReg(SGPR01, RegState::Implicit);
4787
4788 MI.eraseFromParent();
4789 return true;
4790 }
4791
legalizeTrapHsa(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4792 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4793 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4794 B.buildInstr(AMDGPU::S_TRAP)
4795 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4796 MI.eraseFromParent();
4797 return true;
4798 }
4799
legalizeDebugTrapIntrinsic(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4800 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4801 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4802 // Is non-HSA path or trap-handler disabled? Then, report a warning
4803 // accordingly
4804 if (!ST.isTrapHandlerEnabled() ||
4805 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4806 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4807 "debugtrap handler not supported",
4808 MI.getDebugLoc(), DS_Warning);
4809 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4810 Ctx.diagnose(NoTrap);
4811 } else {
4812 // Insert debug-trap instruction
4813 B.buildInstr(AMDGPU::S_TRAP)
4814 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4815 }
4816
4817 MI.eraseFromParent();
4818 return true;
4819 }
4820
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  // Intrinsic operand layout: (dst, intrinsic ID, node_ptr, ray_extent,
  // ray_origin, ray_dir, ray_inv_dir, texture_descriptor).
  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  // The BVH intersect instructions only exist with the GFX10_A encoding.
  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  // a16: direction vectors use 16-bit elements; 64: 64-bit node pointer.
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const bool UseNSA =
      ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode =
        AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
                              NumVDataDwords, NumVAddrDwords);
  } else {
    // The packed encoding rounds the address dword count up to a power of 2.
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
                                   PowerOf2Ceil(NumVAddrDwords));
  }
  assert(Opcode != -1);

  // Flatten all address components into a list of 32-bit registers.
  SmallVector<Register, 12> Ops;
  if (Is64) {
    // Split the 64-bit node pointer into two dwords.
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
  } else {
    Ops.push_back(NodePtr);
  }
  Ops.push_back(RayExtent);

  // Append the first three dwords of a 4-element vector; the fourth lane is
  // not appended.
  auto packLanes = [&Ops, &S32, &B] (Register Src) {
    auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
  };

  packLanes(RayOrigin);
  if (IsA16) {
    // Pack the six used 16-bit dir/inv_dir components into three dwords:
    // (dir0,dir1), (dir2,inv0), (inv1,inv2).
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
4923
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      // With an inverted condition, the branch targets swap roles.
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      // The def/use values are lane masks; constrain them to the wave mask
      // register class.
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      // With an inverted condition, the branch targets swap roles.
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  // Preloaded-argument intrinsics lower to reads of the corresponding
  // function input.
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Constant-fold to the subtarget's wave size.
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  // Buffer store/load variants: the bool arguments select
  // (IsTyped, IsFormat) handling in the shared helpers.
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    // Image dimension intrinsics get their own table-driven legalization;
    // everything else is already legal.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  // Not reached: every case of the switch above returns.
  return true;
}
5147