1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than trying to use "
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46 static constexpr unsigned MaxRegisterSize = 1024;
47
48 // Round the number of elements up to the next power of two.
49 static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeNumElements(Pow2NElts);
53 }
54
55 // Round the number of bits up to the next power of two.
56 static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60 }
61
62 /// \returns true if this is an odd-sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77 }
78
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84 }
85
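// Matches vectors of 16-bit elements with more than two elements.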
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92 }
93
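// Widen the vector at TypeIdx by one element.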
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99 };
100 }
101
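// Reduce the element count at TypeIdx so the vector can be split into pieces
// of at most 64 bits each.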
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105 const LLT EltTy = Ty.getElementType();
106 unsigned Size = Ty.getSizeInBits();
107 unsigned Pieces = (Size + 63) / 64;
108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110 };
111 }
112
113 // Increase the number of vector elements to reach the next multiple of 32-bit
114 // type.
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116 return [=](const LegalityQuery &Query) {
117 const LLT Ty = Query.Types[TypeIdx];
118
119 const LLT EltTy = Ty.getElementType();
120 const int Size = Ty.getSizeInBits();
121 const int EltSize = EltTy.getSizeInBits();
122 const int NextMul32 = (Size + 31) / 32;
123
124 assert(EltSize < 32);
125
126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128 };
129 }
130
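// Pick a register-compatible type to bitcast to: a scalar for sizes up to 32
// bits, otherwise a scalar or vector of 32-bit elements.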
131 static LLT getBitcastRegisterType(const LLT Ty) {
132 const unsigned Size = Ty.getSizeInBits();
133
134 LLT CoercedTy;
135 if (Size <= 32) {
136 // <2 x s8> -> s16
137 // <4 x s8> -> s32
138 return LLT::scalar(Size);
139 }
140
141 return LLT::scalarOrVector(Size / 32, 32);
142 }
143
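// Mutation wrapping getBitcastRegisterType for the type at TypeIdx.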
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145 return [=](const LegalityQuery &Query) {
146 const LLT Ty = Query.Types[TypeIdx];
147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148 };
149 }
150
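// Bitcast the type at TypeIdx to a scalar or vector of 32-bit elements; the
// total size must already be a multiple of 32.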
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152 return [=](const LegalityQuery &Query) {
153 const LLT Ty = Query.Types[TypeIdx];
154 unsigned Size = Ty.getSizeInBits();
155 assert(Size % 32 == 0);
156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157 };
158 }
159
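// True if the type at TypeIdx is a vector narrower than Size bits.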
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161 return [=](const LegalityQuery &Query) {
162 const LLT QueryTy = Query.Types[TypeIdx];
163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164 };
165 }
166
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168 return [=](const LegalityQuery &Query) {
169 const LLT QueryTy = Query.Types[TypeIdx];
170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171 };
172 }
173
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175 return [=](const LegalityQuery &Query) {
176 const LLT QueryTy = Query.Types[TypeIdx];
177 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178 };
179 }
180
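// True if a value of this bit width fills a whole number of 32-bit registers
// and fits within the maximum register size.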
181 static bool isRegisterSize(unsigned Size) {
182 return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184
185 static bool isRegisterVectorElementType(LLT EltTy) {
186 const int EltSize = EltTy.getSizeInBits();
187 return EltSize == 16 || EltSize % 32 == 0;
188 }
189
190 static bool isRegisterVectorType(LLT Ty) {
191 const int EltSize = Ty.getElementType().getSizeInBits();
192 return EltSize == 32 || EltSize == 64 ||
193 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194 EltSize == 128 || EltSize == 256;
195 }
196
197 static bool isRegisterType(LLT Ty) {
198 if (!isRegisterSize(Ty.getSizeInBits()))
199 return false;
200
201 if (Ty.isVector())
202 return isRegisterVectorType(Ty);
203
204 return true;
205 }
206
207 // Any combination of 32 or 64-bit elements up to the maximum register size, and
208 // multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210 return [=](const LegalityQuery &Query) {
211 return isRegisterType(Query.Types[TypeIdx]);
212 };
213 }
214
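// True for vectors whose elements are s16 or at least 32 bits wide.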
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216 return [=](const LegalityQuery &Query) {
217 const LLT QueryTy = Query.Types[TypeIdx];
218 if (!QueryTy.isVector())
219 return false;
220 const LLT EltTy = QueryTy.getElementType();
221 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222 };
223 }
224
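// True for a truncating store of a scalar wider than 32 bits.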
225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226 return [=](const LegalityQuery &Query) {
227 const LLT Ty = Query.Types[TypeIdx];
228 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230 };
231 }
232
233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234 // handle some operations by just promoting the register during
235 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237 bool IsLoad) {
238 switch (AS) {
239 case AMDGPUAS::PRIVATE_ADDRESS:
240 // FIXME: Private element size.
241 return ST.enableFlatScratch() ? 128 : 32;
242 case AMDGPUAS::LOCAL_ADDRESS:
243 return ST.useDS128() ? 128 : 64;
244 case AMDGPUAS::GLOBAL_ADDRESS:
245 case AMDGPUAS::CONSTANT_ADDRESS:
246 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
247 // Treat constant and global as identical. SMRD loads are sometimes usable for
248 // global loads (ideally constant address space should be eliminated)
249 // depending on the context. Legality cannot be context dependent, but
250 // RegBankSelect can split the load as necessary depending on the pointer
251 // register bank/uniformity and if the memory is invariant or not written in a
252 // kernel.
253 return IsLoad ? 512 : 128;
254 default:
255 // Flat addresses may contextually need to be split to 32-bit parts if they
256 // may alias scratch depending on the subtarget.
257 return 128;
258 }
259 }
260
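// Check that a load or store with this register type, memory size, alignment,
// and address space can be selected directly.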
261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262 const LegalityQuery &Query,
263 unsigned Opcode) {
264 const LLT Ty = Query.Types[0];
265
266 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267 const bool IsLoad = Opcode != AMDGPU::G_STORE;
268
269 unsigned RegSize = Ty.getSizeInBits();
270 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272 unsigned AS = Query.Types[1].getAddressSpace();
273
274 // All of these need to be custom lowered to cast the pointer operand.
275 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276 return false;
277
278 // TODO: We should be able to widen loads if the alignment is high enough, but
279 // we also need to modify the memory access size.
280 #if 0
281 // Accept widening loads based on alignment.
282 if (IsLoad && MemSize < Size)
283 MemSize = std::max(MemSize, Align);
284 #endif
285
286 // Only 1-byte and 2-byte to 32-bit extloads are valid.
287 if (MemSize != RegSize && RegSize != 32)
288 return false;
289
290 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291 return false;
292
293 switch (MemSize) {
294 case 8:
295 case 16:
296 case 32:
297 case 64:
298 case 128:
299 break;
300 case 96:
301 if (!ST.hasDwordx3LoadStores())
302 return false;
303 break;
304 case 256:
305 case 512:
306 // These may contextually need to be broken down.
307 break;
308 default:
309 return false;
310 }
311
312 assert(RegSize >= MemSize);
313
314 if (AlignBits < MemSize) {
315 const SITargetLowering *TLI = ST.getTargetLowering();
316 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317 Align(AlignBits / 8)))
318 return false;
319 }
320
321 return true;
322 }
323
324 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
325 // work around this. Eventually it should ignore the type for loads and only care
326 // about the size. Return true in cases where we will work around this for now by
327 // bitcasting.
328 static bool loadStoreBitcastWorkaround(const LLT Ty) {
329 if (EnableNewLegality)
330 return false;
331
332 const unsigned Size = Ty.getSizeInBits();
333 if (Size <= 64)
334 return false;
335 if (!Ty.isVector())
336 return true;
337
338 LLT EltTy = Ty.getElementType();
339 if (EltTy.isPointer())
340 return true;
341
342 unsigned EltSize = EltTy.getSizeInBits();
343 return EltSize != 32 && EltSize != 64;
344 }
345
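// A load or store is legal if the result is a register type, the size is legal
// for the address space, and the bitcast workaround does not apply.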
346 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347 unsigned Opcode) {
348 const LLT Ty = Query.Types[0];
349 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350 !loadStoreBitcastWorkaround(Ty);
351 }
352
353 /// Return true if a load or store of the type should be lowered with a bitcast
354 /// to a different type.
355 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356 const unsigned MemSizeInBits) {
357 const unsigned Size = Ty.getSizeInBits();
358 if (Size != MemSizeInBits)
359 return Size <= 32 && Ty.isVector();
360
361 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362 return true;
363 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364 !isRegisterVectorElementType(Ty.getElementType());
365 }
366
367 /// Return true if we should legalize a load by widening an odd-sized memory
368 /// access up to the alignment. Note that in this case the memory access itself
369 /// changes, not the size of the result register.
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371 unsigned AlignInBits, unsigned AddrSpace,
372 unsigned Opcode) {
373 // We don't want to widen cases that are naturally legal.
374 if (isPowerOf2_32(SizeInBits))
375 return false;
376
377 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378 // end up widening these for a scalar load during RegBankSelect, since there
379 // aren't 96-bit scalar loads.
380 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381 return false;
382
383 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384 return false;
385
386 // A load is known dereferenceable up to the alignment, so it's legal to widen
387 // to it.
388 //
389 // TODO: Could check dereferenceable for less aligned cases.
390 unsigned RoundedSize = NextPowerOf2(SizeInBits);
391 if (AlignInBits < RoundedSize)
392 return false;
393
394 // Do not widen if it would introduce a slow unaligned load.
395 const SITargetLowering *TLI = ST.getTargetLowering();
396 bool Fast = false;
397 return TLI->allowsMisalignedMemoryAccessesImpl(
398 RoundedSize, AddrSpace, Align(AlignInBits / 8),
399 MachineMemOperand::MOLoad, &Fast) &&
400 Fast;
401 }
402
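// Query-based overload; never widen atomic accesses.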
403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404 unsigned Opcode) {
405 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406 return false;
407
408 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409 Query.MMODescrs[0].AlignInBits,
410 Query.Types[1].getAddressSpace(), Opcode);
411 }
412
413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414 const GCNTargetMachine &TM)
415 : ST(ST_) {
416 using namespace TargetOpcode;
417
418 auto GetAddrSpacePtr = [&TM](unsigned AS) {
419 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420 };
421
422 const LLT S1 = LLT::scalar(1);
423 const LLT S8 = LLT::scalar(8);
424 const LLT S16 = LLT::scalar(16);
425 const LLT S32 = LLT::scalar(32);
426 const LLT S64 = LLT::scalar(64);
427 const LLT S128 = LLT::scalar(128);
428 const LLT S256 = LLT::scalar(256);
429 const LLT S512 = LLT::scalar(512);
430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431
432 const LLT V2S8 = LLT::vector(2, 8);
433 const LLT V2S16 = LLT::vector(2, 16);
434 const LLT V4S16 = LLT::vector(4, 16);
435
436 const LLT V2S32 = LLT::vector(2, 32);
437 const LLT V3S32 = LLT::vector(3, 32);
438 const LLT V4S32 = LLT::vector(4, 32);
439 const LLT V5S32 = LLT::vector(5, 32);
440 const LLT V6S32 = LLT::vector(6, 32);
441 const LLT V7S32 = LLT::vector(7, 32);
442 const LLT V8S32 = LLT::vector(8, 32);
443 const LLT V9S32 = LLT::vector(9, 32);
444 const LLT V10S32 = LLT::vector(10, 32);
445 const LLT V11S32 = LLT::vector(11, 32);
446 const LLT V12S32 = LLT::vector(12, 32);
447 const LLT V13S32 = LLT::vector(13, 32);
448 const LLT V14S32 = LLT::vector(14, 32);
449 const LLT V15S32 = LLT::vector(15, 32);
450 const LLT V16S32 = LLT::vector(16, 32);
451 const LLT V32S32 = LLT::vector(32, 32);
452
453 const LLT V2S64 = LLT::vector(2, 64);
454 const LLT V3S64 = LLT::vector(3, 64);
455 const LLT V4S64 = LLT::vector(4, 64);
456 const LLT V5S64 = LLT::vector(5, 64);
457 const LLT V6S64 = LLT::vector(6, 64);
458 const LLT V7S64 = LLT::vector(7, 64);
459 const LLT V8S64 = LLT::vector(8, 64);
460 const LLT V16S64 = LLT::vector(16, 64);
461
462 std::initializer_list<LLT> AllS32Vectors =
463 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465 std::initializer_list<LLT> AllS64Vectors =
466 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467
468 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475
476 const LLT CodePtr = FlatPtr;
477
478 const std::initializer_list<LLT> AddrSpaces64 = {
479 GlobalPtr, ConstantPtr, FlatPtr
480 };
481
482 const std::initializer_list<LLT> AddrSpaces32 = {
483 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484 };
485
486 const std::initializer_list<LLT> FPTypesBase = {
487 S32, S64
488 };
489
490 const std::initializer_list<LLT> FPTypes16 = {
491 S32, S64, S16
492 };
493
494 const std::initializer_list<LLT> FPTypesPK16 = {
495 S32, S64, S16, V2S16
496 };
497
498 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499
500 setAction({G_BRCOND, S1}, Legal); // VCC branches
501 setAction({G_BRCOND, S32}, Legal); // SCC branches
502
503 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504 // elements for v3s16
505 getActionDefinitionsBuilder(G_PHI)
506 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507 .legalFor(AllS32Vectors)
508 .legalFor(AllS64Vectors)
509 .legalFor(AddrSpaces64)
510 .legalFor(AddrSpaces32)
511 .legalIf(isPointer(0))
512 .clampScalar(0, S16, S256)
513 .widenScalarToNextPow2(0, 32)
514 .clampMaxNumElements(0, S32, 16)
515 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516 .scalarize(0);
517
518 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519 // Full set of gfx9 features.
520 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521 .legalFor({S32, S16, V2S16})
522 .clampScalar(0, S16, S32)
523 .clampMaxNumElements(0, S16, 2)
524 .scalarize(0)
525 .widenScalarToNextPow2(0, 32);
526
527 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528 .legalFor({S32, S16, V2S16}) // Clamp modifier
529 .minScalarOrElt(0, S16)
530 .clampMaxNumElements(0, S16, 2)
531 .scalarize(0)
532 .widenScalarToNextPow2(0, 32)
533 .lower();
534 } else if (ST.has16BitInsts()) {
535 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536 .legalFor({S32, S16})
537 .clampScalar(0, S16, S32)
538 .scalarize(0)
539 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540
541 // Technically the saturating operations require clamp bit support, but this
542 // was introduced at the same time as 16-bit operations.
543 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544 .legalFor({S32, S16}) // Clamp modifier
545 .minScalar(0, S16)
546 .scalarize(0)
547 .widenScalarToNextPow2(0, 16)
548 .lower();
549
550 // We're just lowering this, but it helps get a better result to try to
551 // coerce to the desired type first.
552 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553 .minScalar(0, S16)
554 .scalarize(0)
555 .lower();
556 } else {
557 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558 .legalFor({S32})
559 .clampScalar(0, S32, S32)
560 .scalarize(0);
561
562 if (ST.hasIntClamp()) {
563 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564 .legalFor({S32}) // Clamp modifier.
565 .scalarize(0)
566 .minScalarOrElt(0, S32)
567 .lower();
568 } else {
569 // Clamp bit support was added in VI, along with 16-bit operations.
570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571 .minScalar(0, S32)
572 .scalarize(0)
573 .lower();
574 }
575
576 // FIXME: DAG expansion gets better results. The widening uses the smaller
577 // range values and goes for the min/max lowering directly.
578 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579 .minScalar(0, S32)
580 .scalarize(0)
581 .lower();
582 }
583
584 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585 .customFor({S32, S64})
586 .clampScalar(0, S32, S64)
587 .widenScalarToNextPow2(0, 32)
588 .scalarize(0);
589
590 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591 .legalFor({S32})
592 .maxScalarOrElt(0, S32);
593
594 if (ST.hasVOP3PInsts()) {
595 Mulh
596 .clampMaxNumElements(0, S8, 2)
597 .lowerFor({V2S8});
598 }
599
600 Mulh
601 .scalarize(0)
602 .lower();
603
604 // Report legal for any types we can handle anywhere. For the cases only legal
605 // on the SALU, RegBankSelect will be able to re-legalize.
606 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608 .clampScalar(0, S32, S64)
609 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611 .widenScalarToNextPow2(0)
612 .scalarize(0);
613
614 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616 .legalFor({{S32, S1}, {S32, S32}})
617 .minScalar(0, S32)
618 // TODO: .scalarize(0)
619 .lower();
620
621 getActionDefinitionsBuilder(G_BITCAST)
622 // Don't worry about the size constraint.
623 .legalIf(all(isRegisterType(0), isRegisterType(1)))
624 .lower();
625
626
627 getActionDefinitionsBuilder(G_CONSTANT)
628 .legalFor({S1, S32, S64, S16, GlobalPtr,
629 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
630 .legalIf(isPointer(0))
631 .clampScalar(0, S32, S64)
632 .widenScalarToNextPow2(0);
633
634 getActionDefinitionsBuilder(G_FCONSTANT)
635 .legalFor({S32, S64, S16})
636 .clampScalar(0, S16, S64);
637
638 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639 .legalIf(isRegisterType(0))
640 // s1 and s16 are special cases because they have legal operations on
641 // them, but don't really occupy registers in the normal way.
642 .legalFor({S1, S16})
643 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644 .clampScalarOrElt(0, S32, MaxScalar)
645 .widenScalarToNextPow2(0, 32)
646 .clampMaxNumElements(0, S32, 16);
647
648 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649
650 // If the amount is divergent, we have to do a wave reduction to get the
651 // maximum value, so this is expanded during RegBankSelect.
652 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653 .legalFor({{PrivatePtr, S32}});
654
655 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656 .customIf(typeIsNot(0, PrivatePtr));
657
658 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659
660 auto &FPOpActions = getActionDefinitionsBuilder(
661 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662 .legalFor({S32, S64});
663 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664 .customFor({S32, S64});
665 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666 .customFor({S32, S64});
667
668 if (ST.has16BitInsts()) {
669 if (ST.hasVOP3PInsts())
670 FPOpActions.legalFor({S16, V2S16});
671 else
672 FPOpActions.legalFor({S16});
673
674 TrigActions.customFor({S16});
675 FDIVActions.customFor({S16});
676 }
677
678 auto &MinNumMaxNum = getActionDefinitionsBuilder({
679 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680
681 if (ST.hasVOP3PInsts()) {
682 MinNumMaxNum.customFor(FPTypesPK16)
683 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
684 .clampMaxNumElements(0, S16, 2)
685 .clampScalar(0, S16, S64)
686 .scalarize(0);
687 } else if (ST.has16BitInsts()) {
688 MinNumMaxNum.customFor(FPTypes16)
689 .clampScalar(0, S16, S64)
690 .scalarize(0);
691 } else {
692 MinNumMaxNum.customFor(FPTypesBase)
693 .clampScalar(0, S32, S64)
694 .scalarize(0);
695 }
696
697 if (ST.hasVOP3PInsts())
698 FPOpActions.clampMaxNumElements(0, S16, 2);
699
700 FPOpActions
701 .scalarize(0)
702 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703
704 TrigActions
705 .scalarize(0)
706 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707
708 FDIVActions
709 .scalarize(0)
710 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711
712 getActionDefinitionsBuilder({G_FNEG, G_FABS})
713 .legalFor(FPTypesPK16)
714 .clampMaxNumElements(0, S16, 2)
715 .scalarize(0)
716 .clampScalar(0, S16, S64);
717
718 if (ST.has16BitInsts()) {
719 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720 .legalFor({S32, S64, S16})
721 .scalarize(0)
722 .clampScalar(0, S16, S64);
723 } else {
724 getActionDefinitionsBuilder(G_FSQRT)
725 .legalFor({S32, S64})
726 .scalarize(0)
727 .clampScalar(0, S32, S64);
728
729 if (ST.hasFractBug()) {
730 getActionDefinitionsBuilder(G_FFLOOR)
731 .customFor({S64})
732 .legalFor({S32, S64})
733 .scalarize(0)
734 .clampScalar(0, S32, S64);
735 } else {
736 getActionDefinitionsBuilder(G_FFLOOR)
737 .legalFor({S32, S64})
738 .scalarize(0)
739 .clampScalar(0, S32, S64);
740 }
741 }
742
743 getActionDefinitionsBuilder(G_FPTRUNC)
744 .legalFor({{S32, S64}, {S16, S32}})
745 .scalarize(0)
746 .lower();
747
748 getActionDefinitionsBuilder(G_FPEXT)
749 .legalFor({{S64, S32}, {S32, S16}})
750 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751 .scalarize(0);
752
753 getActionDefinitionsBuilder(G_FSUB)
754 // Use actual fsub instruction
755 .legalFor({S32})
756 // Must use fadd + fneg
757 .lowerFor({S64, S16, V2S16})
758 .scalarize(0)
759 .clampScalar(0, S32, S64);
760
761 // Whether this is legal depends on the floating point mode for the function.
762 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764 FMad.customFor({S32, S16});
765 else if (ST.hasMadMacF32Insts())
766 FMad.customFor({S32});
767 else if (ST.hasMadF16())
768 FMad.customFor({S16});
769 FMad.scalarize(0)
770 .lower();
771
772 auto &FRem = getActionDefinitionsBuilder(G_FREM);
773 if (ST.has16BitInsts()) {
774 FRem.customFor({S16, S32, S64});
775 } else {
776 FRem.minScalar(0, S32)
777 .customFor({S32, S64});
778 }
779 FRem.scalarize(0);
780
781 // TODO: Do we need to clamp maximum bitwidth?
782 getActionDefinitionsBuilder(G_TRUNC)
783 .legalIf(isScalar(0))
784 .legalFor({{V2S16, V2S32}})
785 .clampMaxNumElements(0, S16, 2)
786 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787 // situations (like an invalid implicit use), we don't want to loop forever
788 // in the legalizer.
789 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
790 .alwaysLegal();
791
792 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794 {S32, S1}, {S64, S1}, {S16, S1}})
795 .scalarize(0)
796 .clampScalar(0, S32, S64)
797 .widenScalarToNextPow2(1, 32);
798
799 // TODO: Split s1->s64 during regbankselect for VALU.
800 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802 .lowerFor({{S32, S64}})
803 .lowerIf(typeIs(1, S1))
804 .customFor({{S64, S64}});
805 if (ST.has16BitInsts())
806 IToFP.legalFor({{S16, S16}});
807 IToFP.clampScalar(1, S32, S64)
808 .minScalar(0, S32)
809 .scalarize(0)
810 .widenScalarToNextPow2(1);
811
812 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814 .customFor({{S64, S64}})
815 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816 if (ST.has16BitInsts())
817 FPToI.legalFor({{S16, S16}});
818 else
819 FPToI.minScalar(1, S32);
820
821 FPToI.minScalar(0, S32)
822 .widenScalarToNextPow2(0, 32)
823 .scalarize(0)
824 .lower();
825
826 // Lower roundeven into G_FRINT
827 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
828 .scalarize(0)
829 .lower();
830
831 if (ST.has16BitInsts()) {
832 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
833 .legalFor({S16, S32, S64})
834 .clampScalar(0, S16, S64)
835 .scalarize(0);
836 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
837 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
838 .legalFor({S32, S64})
839 .clampScalar(0, S32, S64)
840 .scalarize(0);
841 } else {
842 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
843 .legalFor({S32})
844 .customFor({S64})
845 .clampScalar(0, S32, S64)
846 .scalarize(0);
847 }
848
849 getActionDefinitionsBuilder(G_PTR_ADD)
850 .legalIf(all(isPointer(0), sameSize(0, 1)))
851 .scalarize(0)
852 .scalarSameSizeAs(1, 0);
853
854 getActionDefinitionsBuilder(G_PTRMASK)
855 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
856 .scalarSameSizeAs(1, 0)
857 .scalarize(0);
858
859 auto &CmpBuilder =
860 getActionDefinitionsBuilder(G_ICMP)
861 // The compare output type differs based on the register bank of the output,
862 // so make both s1 and s32 legal.
863 //
864 // Scalar compares producing output in scc will be promoted to s32, as that
865 // is the allocatable register type that will be needed for the copy from
866 // scc. This will be promoted during RegBankSelect, and we assume something
867 // before that won't try to use s32 result types.
868 //
869 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
870 // bank.
871 .legalForCartesianProduct(
872 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
873 .legalForCartesianProduct(
874 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
875 if (ST.has16BitInsts()) {
876 CmpBuilder.legalFor({{S1, S16}});
877 }
878
879 CmpBuilder
880 .widenScalarToNextPow2(1)
881 .clampScalar(1, S32, S64)
882 .scalarize(0)
883 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
884
885 getActionDefinitionsBuilder(G_FCMP)
886 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
887 .widenScalarToNextPow2(1)
888 .clampScalar(1, S32, S64)
889 .scalarize(0);
890
891 // FIXME: fpow has a selection pattern that should move to custom lowering.
892 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
893 if (ST.has16BitInsts())
894 Exp2Ops.legalFor({S32, S16});
895 else
896 Exp2Ops.legalFor({S32});
897 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
898 Exp2Ops.scalarize(0);
899
900 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
901 if (ST.has16BitInsts())
902 ExpOps.customFor({{S32}, {S16}});
903 else
904 ExpOps.customFor({S32});
905 ExpOps.clampScalar(0, MinScalarFPTy, S32)
906 .scalarize(0);
907
908 getActionDefinitionsBuilder(G_FPOWI)
909 .clampScalar(0, MinScalarFPTy, S32)
910 .lower();
911
912 // The 64-bit versions produce 32-bit results, but only on the SALU.
913 getActionDefinitionsBuilder(G_CTPOP)
914 .legalFor({{S32, S32}, {S32, S64}})
915 .clampScalar(0, S32, S32)
916 .clampScalar(1, S32, S64)
917 .scalarize(0)
918 .widenScalarToNextPow2(0, 32)
919 .widenScalarToNextPow2(1, 32);
920
921 // The hardware instructions return a different result on 0 than the generic
922 // instructions expect. The hardware produces -1, but these produce the
923 // bitwidth.
924 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
925 .scalarize(0)
926 .clampScalar(0, S32, S32)
927 .clampScalar(1, S32, S64)
928 .widenScalarToNextPow2(0, 32)
929 .widenScalarToNextPow2(1, 32)
930 .lower();
931
932 // The 64-bit versions produce 32-bit results, but only on the SALU.
933 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
934 .legalFor({{S32, S32}, {S32, S64}})
935 .clampScalar(0, S32, S32)
936 .clampScalar(1, S32, S64)
937 .scalarize(0)
938 .widenScalarToNextPow2(0, 32)
939 .widenScalarToNextPow2(1, 32);
940
941 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
942 // RegBankSelect.
943 getActionDefinitionsBuilder(G_BITREVERSE)
944 .legalFor({S32, S64})
945 .clampScalar(0, S32, S64)
946 .scalarize(0)
947 .widenScalarToNextPow2(0);
948
949 if (ST.has16BitInsts()) {
950 getActionDefinitionsBuilder(G_BSWAP)
951 .legalFor({S16, S32, V2S16})
952 .clampMaxNumElements(0, S16, 2)
953 // FIXME: Fixing non-power-of-2 before clamp is workaround for
954 // narrowScalar limitation.
955 .widenScalarToNextPow2(0)
956 .clampScalar(0, S16, S32)
957 .scalarize(0);
958
959 if (ST.hasVOP3PInsts()) {
960 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
961 .legalFor({S32, S16, V2S16})
962 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
963 .clampMaxNumElements(0, S16, 2)
964 .minScalar(0, S16)
965 .widenScalarToNextPow2(0)
966 .scalarize(0)
967 .lower();
968 } else {
969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
970 .legalFor({S32, S16})
971 .widenScalarToNextPow2(0)
972 .minScalar(0, S16)
973 .scalarize(0)
974 .lower();
975 }
976 } else {
977 // TODO: Should have same legality without v_perm_b32
978 getActionDefinitionsBuilder(G_BSWAP)
979 .legalFor({S32})
980 .lowerIf(scalarNarrowerThan(0, 32))
981 // FIXME: Fixing non-power-of-2 before clamp is workaround for
982 // narrowScalar limitation.
983 .widenScalarToNextPow2(0)
984 .maxScalar(0, S32)
985 .scalarize(0)
986 .lower();
987
988 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
989 .legalFor({S32})
990 .minScalar(0, S32)
991 .widenScalarToNextPow2(0)
992 .scalarize(0)
993 .lower();
994 }
995
996 getActionDefinitionsBuilder(G_INTTOPTR)
997 // List the common cases
998 .legalForCartesianProduct(AddrSpaces64, {S64})
999 .legalForCartesianProduct(AddrSpaces32, {S32})
1000 .scalarize(0)
1001 // Accept any address space as long as the size matches
1002 .legalIf(sameSize(0, 1))
1003 .widenScalarIf(smallerThan(1, 0),
1004 [](const LegalityQuery &Query) {
1005 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1006 })
1007 .narrowScalarIf(largerThan(1, 0),
1008 [](const LegalityQuery &Query) {
1009 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1010 });
1011
1012 getActionDefinitionsBuilder(G_PTRTOINT)
1013 // List the common cases
1014 .legalForCartesianProduct(AddrSpaces64, {S64})
1015 .legalForCartesianProduct(AddrSpaces32, {S32})
1016 .scalarize(0)
1017 // Accept any address space as long as the size matches
1018 .legalIf(sameSize(0, 1))
1019 .widenScalarIf(smallerThan(0, 1),
1020 [](const LegalityQuery &Query) {
1021 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1022 })
1023 .narrowScalarIf(
1024 largerThan(0, 1),
1025 [](const LegalityQuery &Query) {
1026 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1027 });
1028
1029 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1030 .scalarize(0)
1031 .custom();
1032
1033 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1034 bool IsLoad) -> bool {
1035 const LLT DstTy = Query.Types[0];
1036
1037 // Split vector extloads.
1038 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1039 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1040
1041 if (MemSize < DstTy.getSizeInBits())
1042 MemSize = std::max(MemSize, AlignBits);
1043
1044 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1045 return true;
1046
1047 const LLT PtrTy = Query.Types[1];
1048 unsigned AS = PtrTy.getAddressSpace();
1049 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1050 return true;
1051
1052 // Catch weird sized loads that don't evenly divide into the access sizes
1053 // TODO: May be able to widen depending on alignment etc.
1054 unsigned NumRegs = (MemSize + 31) / 32;
1055 if (NumRegs == 3) {
1056 if (!ST.hasDwordx3LoadStores())
1057 return true;
1058 } else {
1059 // If the alignment allows, these should have been widened.
1060 if (!isPowerOf2_32(NumRegs))
1061 return true;
1062 }
1063
1064 if (AlignBits < MemSize) {
1065 const SITargetLowering *TLI = ST.getTargetLowering();
1066 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1067 Align(AlignBits / 8));
1068 }
1069
1070 return false;
1071 };
1072
1073 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1074 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1075 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1076
1077 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1078 // LDS
1079 // TODO: Unsupported flat for SI.
1080
1081 for (unsigned Op : {G_LOAD, G_STORE}) {
1082 const bool IsStore = Op == G_STORE;
1083
1084 auto &Actions = getActionDefinitionsBuilder(Op);
1085 // Explicitly list some common cases.
1086 // TODO: Does this help compile time at all?
1087 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1088 {V2S32, GlobalPtr, 64, GlobalAlign32},
1089 {V4S32, GlobalPtr, 128, GlobalAlign32},
1090 {S64, GlobalPtr, 64, GlobalAlign32},
1091 {V2S64, GlobalPtr, 128, GlobalAlign32},
1092 {V2S16, GlobalPtr, 32, GlobalAlign32},
1093 {S32, GlobalPtr, 8, GlobalAlign8},
1094 {S32, GlobalPtr, 16, GlobalAlign16},
1095
1096 {S32, LocalPtr, 32, 32},
1097 {S64, LocalPtr, 64, 32},
1098 {V2S32, LocalPtr, 64, 32},
1099 {S32, LocalPtr, 8, 8},
1100 {S32, LocalPtr, 16, 16},
1101 {V2S16, LocalPtr, 32, 32},
1102
1103 {S32, PrivatePtr, 32, 32},
1104 {S32, PrivatePtr, 8, 8},
1105 {S32, PrivatePtr, 16, 16},
1106 {V2S16, PrivatePtr, 32, 32},
1107
1108 {S32, ConstantPtr, 32, GlobalAlign32},
1109 {V2S32, ConstantPtr, 64, GlobalAlign32},
1110 {V4S32, ConstantPtr, 128, GlobalAlign32},
1111 {S64, ConstantPtr, 64, GlobalAlign32},
1112 {V2S32, ConstantPtr, 32, GlobalAlign32}});
1113 Actions.legalIf(
1114 [=](const LegalityQuery &Query) -> bool {
1115 return isLoadStoreLegal(ST, Query, Op);
1116 });
1117
1118 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1119 // 64-bits.
1120 //
1121 // TODO: Should generalize bitcast action into coerce, which will also cover
1122 // inserting addrspacecasts.
1123 Actions.customIf(typeIs(1, Constant32Ptr));
1124
1125 // Turn any illegal element vectors into something easier to deal
1126 // with. These will ultimately produce 32-bit scalar shifts to extract the
1127 // parts anyway.
1128 //
1129 // For odd 16-bit element vectors, prefer to split those into pieces with
1130 // 16-bit vector parts.
1131 Actions.bitcastIf(
1132 [=](const LegalityQuery &Query) -> bool {
1133 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1134 Query.MMODescrs[0].SizeInBits);
1135 }, bitcastToRegisterType(0));
1136
1137 if (!IsStore) {
1138 // Widen suitably aligned loads by loading extra bytes. The standard
1139 // legalization actions can't properly express widening memory operands.
1140 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1141 return shouldWidenLoad(ST, Query, G_LOAD);
1142 });
1143 }
1144
1145 // FIXME: load/store narrowing should be moved to lower action
1146 Actions
1147 .narrowScalarIf(
1148 [=](const LegalityQuery &Query) -> bool {
1149 return !Query.Types[0].isVector() &&
1150 needToSplitMemOp(Query, Op == G_LOAD);
1151 },
1152 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1153 const LLT DstTy = Query.Types[0];
1154 const LLT PtrTy = Query.Types[1];
1155
1156 const unsigned DstSize = DstTy.getSizeInBits();
1157 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1158
1159 // Split extloads.
1160 if (DstSize > MemSize)
1161 return std::make_pair(0, LLT::scalar(MemSize));
1162
1163 if (!isPowerOf2_32(DstSize)) {
1164 // We're probably decomposing an odd sized store. Try to split
1165 // to the widest type. TODO: Account for alignment. As-is it
1166 // should be OK, since the new parts will be further legalized.
1167 unsigned FloorSize = PowerOf2Floor(DstSize);
1168 return std::make_pair(0, LLT::scalar(FloorSize));
1169 }
1170
1171 if (DstSize > 32 && (DstSize % 32 != 0)) {
1172 // FIXME: Need a way to specify non-extload of larger size if
1173 // suitably aligned.
1174 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1175 }
1176
1177 unsigned MaxSize = maxSizeForAddrSpace(ST,
1178 PtrTy.getAddressSpace(),
1179 Op == G_LOAD);
1180 if (MemSize > MaxSize)
1181 return std::make_pair(0, LLT::scalar(MaxSize));
1182
1183 unsigned Align = Query.MMODescrs[0].AlignInBits;
1184 return std::make_pair(0, LLT::scalar(Align));
1185 })
1186 .fewerElementsIf(
1187 [=](const LegalityQuery &Query) -> bool {
1188 return Query.Types[0].isVector() &&
1189 needToSplitMemOp(Query, Op == G_LOAD);
1190 },
1191 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1192 const LLT DstTy = Query.Types[0];
1193 const LLT PtrTy = Query.Types[1];
1194
1195 LLT EltTy = DstTy.getElementType();
1196 unsigned MaxSize = maxSizeForAddrSpace(ST,
1197 PtrTy.getAddressSpace(),
1198 Op == G_LOAD);
1199
1200 // FIXME: Handle widened to power of 2 results better. This ends
1201 // up scalarizing.
1202 // FIXME: 3 element stores scalarized on SI
1203
1204 // Split if it's too large for the address space.
1205 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1206 unsigned NumElts = DstTy.getNumElements();
1207 unsigned EltSize = EltTy.getSizeInBits();
1208
1209 if (MaxSize % EltSize == 0) {
1210 return std::make_pair(
1211 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1212 }
1213
1214 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1215
1216 // FIXME: Refine when odd breakdowns handled
1217 // The scalars will need to be re-legalized.
1218 if (NumPieces == 1 || NumPieces >= NumElts ||
1219 NumElts % NumPieces != 0)
1220 return std::make_pair(0, EltTy);
1221
1222 return std::make_pair(0,
1223 LLT::vector(NumElts / NumPieces, EltTy));
1224 }
1225
1226 // FIXME: We could probably handle weird extending loads better.
1227 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1228 if (DstTy.getSizeInBits() > MemSize)
1229 return std::make_pair(0, EltTy);
1230
1231 unsigned EltSize = EltTy.getSizeInBits();
1232 unsigned DstSize = DstTy.getSizeInBits();
1233 if (!isPowerOf2_32(DstSize)) {
1234 // We're probably decomposing an odd sized store. Try to split
1235 // to the widest type. TODO: Account for alignment. As-is it
1236 // should be OK, since the new parts will be further legalized.
1237 unsigned FloorSize = PowerOf2Floor(DstSize);
1238 return std::make_pair(
1239 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1240 }
1241
1242 // Need to split because of alignment.
1243 unsigned Align = Query.MMODescrs[0].AlignInBits;
1244 if (EltSize > Align &&
1245 (EltSize / Align < DstTy.getNumElements())) {
1246 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1247 }
1248
1249 // May need relegalization for the scalars.
1250 return std::make_pair(0, EltTy);
1251 })
1252 .lowerIfMemSizeNotPow2()
1253 .minScalar(0, S32);
1254
1255 if (IsStore)
1256 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1257
1258 Actions
1259 .widenScalarToNextPow2(0)
1260 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1261 .lower();
1262 }
1263
1264 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1265 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1266 {S32, GlobalPtr, 16, 2 * 8},
1267 {S32, LocalPtr, 8, 8},
1268 {S32, LocalPtr, 16, 16},
1269 {S32, PrivatePtr, 8, 8},
1270 {S32, PrivatePtr, 16, 16},
1271 {S32, ConstantPtr, 8, 8},
1272 {S32, ConstantPtr, 16, 2 * 8}});
1273 if (ST.hasFlatAddressSpace()) {
1274 ExtLoads.legalForTypesWithMemDesc(
1275 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1276 }
1277
1278 ExtLoads.clampScalar(0, S32, S32)
1279 .widenScalarToNextPow2(0)
1280 .unsupportedIfMemSizeNotPow2()
1281 .lower();
1282
1283 auto &Atomics = getActionDefinitionsBuilder(
1284 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287 G_ATOMICRMW_UMIN})
1288 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289 {S64, GlobalPtr}, {S64, LocalPtr},
1290 {S32, RegionPtr}, {S64, RegionPtr}});
1291 if (ST.hasFlatAddressSpace()) {
1292 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293 }
1294
1295 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296 if (ST.hasLDSFPAtomics()) {
1297 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298 if (ST.hasGFX90AInsts())
1299 Atomic.legalFor({{S64, LocalPtr}});
1300 }
1301 if (ST.hasAtomicFaddInsts())
1302 Atomic.legalFor({{S32, GlobalPtr}});
1303
1304 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1305 // demarshalling
1306 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1307 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1308 {S32, FlatPtr}, {S64, FlatPtr}})
1309 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1310 {S32, RegionPtr}, {S64, RegionPtr}});
1311 // TODO: Pointer types, any 32-bit or 64-bit vector
1312
1313 // Condition should be s32 for scalar, s1 for vector.
1314 getActionDefinitionsBuilder(G_SELECT)
1315 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1316 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1317 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1318 .clampScalar(0, S16, S64)
1319 .scalarize(1)
1320 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1321 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1322 .clampMaxNumElements(0, S32, 2)
1323 .clampMaxNumElements(0, LocalPtr, 2)
1324 .clampMaxNumElements(0, PrivatePtr, 2)
1325 .scalarize(0)
1326 .widenScalarToNextPow2(0)
1327 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1328
1329 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1330 // be more flexible with the shift amount type.
1331 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1332 .legalFor({{S32, S32}, {S64, S32}});
1333 if (ST.has16BitInsts()) {
1334 if (ST.hasVOP3PInsts()) {
1335 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1336 .clampMaxNumElements(0, S16, 2);
1337 } else
1338 Shifts.legalFor({{S16, S16}});
1339
1340 // TODO: Support 16-bit shift amounts for all types
1341 Shifts.widenScalarIf(
1342 [=](const LegalityQuery &Query) {
1343 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1344 // 32-bit amount.
1345 const LLT ValTy = Query.Types[0];
1346 const LLT AmountTy = Query.Types[1];
1347 return ValTy.getSizeInBits() <= 16 &&
1348 AmountTy.getSizeInBits() < 16;
1349 }, changeTo(1, S16));
1350 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1351 Shifts.clampScalar(1, S32, S32);
1352 Shifts.clampScalar(0, S16, S64);
1353 Shifts.widenScalarToNextPow2(0, 16);
1354
1355 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1356 .minScalar(0, S16)
1357 .scalarize(0)
1358 .lower();
1359 } else {
1360 // Make sure we legalize the shift amount type first, as the general
1361 // expansion for the shifted type will produce much worse code if it hasn't
1362 // been truncated already.
1363 Shifts.clampScalar(1, S32, S32);
1364 Shifts.clampScalar(0, S32, S64);
1365 Shifts.widenScalarToNextPow2(0, 32);
1366
1367 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1368 .minScalar(0, S32)
1369 .scalarize(0)
1370 .lower();
1371 }
1372 Shifts.scalarize(0);
1373
1374 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1375 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1376 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1377 unsigned IdxTypeIdx = 2;
1378
1379 getActionDefinitionsBuilder(Op)
1380 .customIf([=](const LegalityQuery &Query) {
1381 const LLT EltTy = Query.Types[EltTypeIdx];
1382 const LLT VecTy = Query.Types[VecTypeIdx];
1383 const LLT IdxTy = Query.Types[IdxTypeIdx];
1384 const unsigned EltSize = EltTy.getSizeInBits();
1385 return (EltSize == 32 || EltSize == 64) &&
1386 VecTy.getSizeInBits() % 32 == 0 &&
1387 VecTy.getSizeInBits() <= MaxRegisterSize &&
1388 IdxTy.getSizeInBits() == 32;
1389 })
1390 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1391 bitcastToVectorElement32(VecTypeIdx))
1392 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1393 .bitcastIf(
1394 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1395 [=](const LegalityQuery &Query) {
1396 // For > 64-bit element types, try to turn this into a 64-bit
1397 // element vector since we may be able to do better indexing
1398 // if this is scalar. If not, fall back to 32.
1399 const LLT EltTy = Query.Types[EltTypeIdx];
1400 const LLT VecTy = Query.Types[VecTypeIdx];
1401 const unsigned DstEltSize = EltTy.getSizeInBits();
1402 const unsigned VecSize = VecTy.getSizeInBits();
1403
1404 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1405 return std::make_pair(
1406 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1407 })
1408 .clampScalar(EltTypeIdx, S32, S64)
1409 .clampScalar(VecTypeIdx, S32, S64)
1410 .clampScalar(IdxTypeIdx, S32, S32)
1411 .clampMaxNumElements(VecTypeIdx, S32, 32)
1412 // TODO: Clamp elements for 64-bit vectors?
1413 // It should only be necessary with variable indexes.
1414 // As a last resort, lower to the stack
1415 .lower();
1416 }
1417
1418 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1419 .unsupportedIf([=](const LegalityQuery &Query) {
1420 const LLT &EltTy = Query.Types[1].getElementType();
1421 return Query.Types[0] != EltTy;
1422 });
1423
1424 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1425 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1426 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1427
1428 // FIXME: Doesn't handle extract of illegal sizes.
1429 getActionDefinitionsBuilder(Op)
1430 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1431 // FIXME: Multiples of 16 should not be legal.
1432 .legalIf([=](const LegalityQuery &Query) {
1433 const LLT BigTy = Query.Types[BigTyIdx];
1434 const LLT LitTy = Query.Types[LitTyIdx];
1435 return (BigTy.getSizeInBits() % 32 == 0) &&
1436 (LitTy.getSizeInBits() % 16 == 0);
1437 })
1438 .widenScalarIf(
1439 [=](const LegalityQuery &Query) {
1440 const LLT BigTy = Query.Types[BigTyIdx];
1441 return (BigTy.getScalarSizeInBits() < 16);
1442 },
1443 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1444 .widenScalarIf(
1445 [=](const LegalityQuery &Query) {
1446 const LLT LitTy = Query.Types[LitTyIdx];
1447 return (LitTy.getScalarSizeInBits() < 16);
1448 },
1449 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1450 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1451 .widenScalarToNextPow2(BigTyIdx, 32);
1452
1453 }
1454
1455 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1456 .legalForCartesianProduct(AllS32Vectors, {S32})
1457 .legalForCartesianProduct(AllS64Vectors, {S64})
1458 .clampNumElements(0, V16S32, V32S32)
1459 .clampNumElements(0, V2S64, V16S64)
1460 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1461
1462 if (ST.hasScalarPackInsts()) {
1463 BuildVector
1464 // FIXME: Should probably widen s1 vectors straight to s32
1465 .minScalarOrElt(0, S16)
1466 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1467 .minScalar(1, S32);
1468
1469 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1470 .legalFor({V2S16, S32})
1471 .lower();
1472 BuildVector.minScalarOrElt(0, S32);
1473 } else {
1474 BuildVector.customFor({V2S16, S16});
1475 BuildVector.minScalarOrElt(0, S32);
1476
1477 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1478 .customFor({V2S16, S32})
1479 .lower();
1480 }
1481
1482 BuildVector.legalIf(isRegisterType(0));
1483
1484 // FIXME: Clamp maximum size
1485 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1486 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1487 .clampMaxNumElements(0, S32, 32)
1488 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1489 .clampMaxNumElements(0, S16, 64);
1490
1491 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1492 // pre-legalize.
1493 if (ST.hasVOP3PInsts()) {
1494 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1495 .customFor({V2S16, V2S16})
1496 .lower();
1497 } else
1498 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1499
1500 // Merge/Unmerge
1501 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1502 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1503 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1504
1505 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1506 const LLT Ty = Query.Types[TypeIdx];
1507 if (Ty.isVector()) {
1508 const LLT &EltTy = Ty.getElementType();
1509 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1510 return true;
1511 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1512 return true;
1513 }
1514 return false;
1515 };
1516
1517 auto &Builder = getActionDefinitionsBuilder(Op)
1518 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1519 .lowerFor({{S16, V2S16}})
1520 .lowerIf([=](const LegalityQuery &Query) {
1521 const LLT BigTy = Query.Types[BigTyIdx];
1522 return BigTy.getSizeInBits() == 32;
1523 })
1524 // Try to widen to s16 first for small types.
1525 // TODO: Only do this on targets with legal s16 shifts
1526 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1527 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1528 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1529 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1530 elementTypeIs(1, S16)),
1531 changeTo(1, V2S16))
1532 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1533 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1534 // valid.
1535 .clampScalar(LitTyIdx, S32, S512)
1536 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1537 // Break up vectors with weird elements into scalars
1538 .fewerElementsIf(
1539 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1540 scalarize(0))
1541 .fewerElementsIf(
1542 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1543 scalarize(1))
1544 .clampScalar(BigTyIdx, S32, MaxScalar);
1545
1546 if (Op == G_MERGE_VALUES) {
1547 Builder.widenScalarIf(
1548 // TODO: Use 16-bit shifts if legal for 8-bit values?
1549 [=](const LegalityQuery &Query) {
1550 const LLT Ty = Query.Types[LitTyIdx];
1551 return Ty.getSizeInBits() < 32;
1552 },
1553 changeTo(LitTyIdx, S32));
1554 }
1555
1556 Builder.widenScalarIf(
1557 [=](const LegalityQuery &Query) {
1558 const LLT Ty = Query.Types[BigTyIdx];
1559 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1560 Ty.getSizeInBits() % 16 != 0;
1561 },
1562 [=](const LegalityQuery &Query) {
1563 // Pick the next power of 2, or a multiple of 64 over 128,
1564 // whichever is smaller.
1565 const LLT &Ty = Query.Types[BigTyIdx];
1566 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1567 if (NewSizeInBits >= 256) {
1568 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1569 if (RoundedTo < NewSizeInBits)
1570 NewSizeInBits = RoundedTo;
1571 }
1572 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1573 })
1574 // Any vectors left are the wrong size. Scalarize them.
1575 .scalarize(0)
1576 .scalarize(1);
1577 }
1578
1579 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1580 // RegBankSelect.
1581 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1582 .legalFor({{S32}, {S64}});
1583
1584 if (ST.hasVOP3PInsts()) {
1585 SextInReg.lowerFor({{V2S16}})
1586 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1587 // get more vector shift opportunities, since we'll get those when
1588 // expanded.
1589 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1590 } else if (ST.has16BitInsts()) {
1591 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1592 } else {
1593 // Prefer to promote to s32 before lowering if we don't have 16-bit
1594 // shifts. This avoids a lot of intermediate truncate and extend operations.
1595 SextInReg.lowerFor({{S32}, {S64}});
1596 }
1597
1598 SextInReg
1599 .scalarize(0)
1600 .clampScalar(0, S32, S64)
1601 .lower();
1602
1603   // TODO: Only try to form v2s16 with legal packed instructions.
1604 getActionDefinitionsBuilder(G_FSHR)
1605 .legalFor({{S32, S32}})
1606 .lowerFor({{V2S16, V2S16}})
1607 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1608 .scalarize(0)
1609 .lower();
1610
1611 if (ST.hasVOP3PInsts()) {
1612 getActionDefinitionsBuilder(G_FSHL)
1613 .lowerFor({{V2S16, V2S16}})
1614 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1615 .scalarize(0)
1616 .lower();
1617 } else {
1618 getActionDefinitionsBuilder(G_FSHL)
1619 .scalarize(0)
1620 .lower();
1621 }
1622
1623 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1624 .legalFor({S64});
1625
1626 getActionDefinitionsBuilder(G_FENCE)
1627 .alwaysLegal();
1628
1629 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1630 .scalarize(0)
1631 .minScalar(0, S32)
1632 .lower();
1633
1634 getActionDefinitionsBuilder({
1635 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1636 G_FCOPYSIGN,
1637
1638 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1639 G_ATOMICRMW_NAND,
1640 G_ATOMICRMW_FSUB,
1641 G_READ_REGISTER,
1642 G_WRITE_REGISTER,
1643
1644 G_SADDO, G_SSUBO,
1645
1646 // TODO: Implement
1647 G_FMINIMUM, G_FMAXIMUM}).lower();
1648
1649 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1650 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1651 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1652 .unsupported();
1653
1654 computeTables();
1655 verify(*ST.getInstrInfo());
1656 }
1657
1658 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1659 MachineInstr &MI) const {
1660 MachineIRBuilder &B = Helper.MIRBuilder;
1661 MachineRegisterInfo &MRI = *B.getMRI();
1662
1663 switch (MI.getOpcode()) {
1664 case TargetOpcode::G_ADDRSPACE_CAST:
1665 return legalizeAddrSpaceCast(MI, MRI, B);
1666 case TargetOpcode::G_FRINT:
1667 return legalizeFrint(MI, MRI, B);
1668 case TargetOpcode::G_FCEIL:
1669 return legalizeFceil(MI, MRI, B);
1670 case TargetOpcode::G_FREM:
1671 return legalizeFrem(MI, MRI, B);
1672 case TargetOpcode::G_INTRINSIC_TRUNC:
1673 return legalizeIntrinsicTrunc(MI, MRI, B);
1674 case TargetOpcode::G_SITOFP:
1675 return legalizeITOFP(MI, MRI, B, true);
1676 case TargetOpcode::G_UITOFP:
1677 return legalizeITOFP(MI, MRI, B, false);
1678 case TargetOpcode::G_FPTOSI:
1679 return legalizeFPTOI(MI, MRI, B, true);
1680 case TargetOpcode::G_FPTOUI:
1681 return legalizeFPTOI(MI, MRI, B, false);
1682 case TargetOpcode::G_FMINNUM:
1683 case TargetOpcode::G_FMAXNUM:
1684 case TargetOpcode::G_FMINNUM_IEEE:
1685 case TargetOpcode::G_FMAXNUM_IEEE:
1686 return legalizeMinNumMaxNum(Helper, MI);
1687 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1688 return legalizeExtractVectorElt(MI, MRI, B);
1689 case TargetOpcode::G_INSERT_VECTOR_ELT:
1690 return legalizeInsertVectorElt(MI, MRI, B);
1691 case TargetOpcode::G_SHUFFLE_VECTOR:
1692 return legalizeShuffleVector(MI, MRI, B);
1693 case TargetOpcode::G_FSIN:
1694 case TargetOpcode::G_FCOS:
1695 return legalizeSinCos(MI, MRI, B);
1696 case TargetOpcode::G_GLOBAL_VALUE:
1697 return legalizeGlobalValue(MI, MRI, B);
1698 case TargetOpcode::G_LOAD:
1699 return legalizeLoad(Helper, MI);
1700 case TargetOpcode::G_FMAD:
1701 return legalizeFMad(MI, MRI, B);
1702 case TargetOpcode::G_FDIV:
1703 return legalizeFDIV(MI, MRI, B);
1704 case TargetOpcode::G_UDIV:
1705 case TargetOpcode::G_UREM:
1706 return legalizeUDIV_UREM(MI, MRI, B);
1707 case TargetOpcode::G_SDIV:
1708 case TargetOpcode::G_SREM:
1709 return legalizeSDIV_SREM(MI, MRI, B);
1710 case TargetOpcode::G_ATOMIC_CMPXCHG:
1711 return legalizeAtomicCmpXChg(MI, MRI, B);
1712 case TargetOpcode::G_FLOG:
1713 return legalizeFlog(MI, B, numbers::ln2f);
1714 case TargetOpcode::G_FLOG10:
1715 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1716 case TargetOpcode::G_FEXP:
1717 return legalizeFExp(MI, B);
1718 case TargetOpcode::G_FPOW:
1719 return legalizeFPow(MI, B);
1720 case TargetOpcode::G_FFLOOR:
1721 return legalizeFFloor(MI, MRI, B);
1722 case TargetOpcode::G_BUILD_VECTOR:
1723 return legalizeBuildVector(MI, MRI, B);
1724 default:
1725 return false;
1726 }
1727
1728 llvm_unreachable("expected switch to return");
1729 }
1730
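// Return the aperture (the high 32 bits of the corresponding 64-bit flat
// address) for the given LDS or private address space. On subtargets with
// aperture registers the value is read with s_getreg and shifted into
// position; otherwise it is loaded from the queue pointer
// (group_segment_aperture_base_hi / private_segment_aperture_base_hi in
// amd_queue_t).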
1731 Register AMDGPULegalizerInfo::getSegmentAperture(
1732 unsigned AS,
1733 MachineRegisterInfo &MRI,
1734 MachineIRBuilder &B) const {
1735 MachineFunction &MF = B.getMF();
1736 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1737 const LLT S32 = LLT::scalar(32);
1738
1739 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1740
1741 if (ST.hasApertureRegs()) {
1742 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1743 // getreg.
1744 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1745 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1746 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1747 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1748 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1749 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1750 unsigned Encoding =
1751 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1752 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1753 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1754
1755 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1756
1757 B.buildInstr(AMDGPU::S_GETREG_B32)
1758 .addDef(GetReg)
1759 .addImm(Encoding);
1760 MRI.setType(GetReg, S32);
1761
1762 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1763 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1764 }
1765
1766 Register QueuePtr = MRI.createGenericVirtualRegister(
1767 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1768
1769 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1770 return Register();
1771
1772 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1773 // private_segment_aperture_base_hi.
1774 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1775
1776 // TODO: can we be smarter about machine pointer info?
1777 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1778 MachineMemOperand *MMO = MF.getMachineMemOperand(
1779 PtrInfo,
1780 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1781 MachineMemOperand::MOInvariant,
1782 4, commonAlignment(Align(64), StructOffset));
1783
1784 Register LoadAddr;
1785
1786 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1787 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1788 }
1789
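// Lower G_ADDRSPACE_CAST. No-op casts become a G_BITCAST. Casts to or from
// the 32-bit constant address space extract the low half or merge in the
// known high bits. Flat <-> local/private casts compare against the source
// null value and either take the low 32 bits or prepend the segment aperture,
// selecting the destination null value for null inputs.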
1790 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1791 MachineInstr &MI, MachineRegisterInfo &MRI,
1792 MachineIRBuilder &B) const {
1793 MachineFunction &MF = B.getMF();
1794
1795 const LLT S32 = LLT::scalar(32);
1796 Register Dst = MI.getOperand(0).getReg();
1797 Register Src = MI.getOperand(1).getReg();
1798
1799 LLT DstTy = MRI.getType(Dst);
1800 LLT SrcTy = MRI.getType(Src);
1801 unsigned DestAS = DstTy.getAddressSpace();
1802 unsigned SrcAS = SrcTy.getAddressSpace();
1803
1804 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1805 // vector element.
1806 assert(!DstTy.isVector());
1807
1808 const AMDGPUTargetMachine &TM
1809 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1810
1811 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1812 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1813 return true;
1814 }
1815
1816 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1817 // Truncate.
1818 B.buildExtract(Dst, Src, 0);
1819 MI.eraseFromParent();
1820 return true;
1821 }
1822
1823 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1824 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1825 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1826
1827     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1828     // another pointer type. Merge operands are required to be the same type,
1829     // but creating an extra ptrtoint would be kind of pointless.
1830 auto HighAddr = B.buildConstant(
1831 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1832 B.buildMerge(Dst, {Src, HighAddr});
1833 MI.eraseFromParent();
1834 return true;
1835 }
1836
1837 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1838 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1839 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1840 unsigned NullVal = TM.getNullPointerValue(DestAS);
1841
1842 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1843 auto FlatNull = B.buildConstant(SrcTy, 0);
1844
1845 // Extract low 32-bits of the pointer.
1846 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1847
1848 auto CmpRes =
1849 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1850 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1851
1852 MI.eraseFromParent();
1853 return true;
1854 }
1855
1856 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1857 return false;
1858
1859 if (!ST.hasFlatAddressSpace())
1860 return false;
1861
1862 auto SegmentNull =
1863 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1864 auto FlatNull =
1865 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1866
1867 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1868 if (!ApertureReg.isValid())
1869 return false;
1870
1871 auto CmpRes =
1872 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1873
1874 // Coerce the type of the low half of the result so we can use merge_values.
1875 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1876
1877 // TODO: Should we allow mismatched types but matching sizes in merges to
1878 // avoid the ptrtoint?
1879 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1880 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1881
1882 MI.eraseFromParent();
1883 return true;
1884 }
1885
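// Lower G_FRINT for f64 with the usual magic-number trick: adding and then
// subtracting copysign(2^52, x) leaves x rounded to an integer, since values
// of that magnitude cannot hold fractional bits. Inputs already too large to
// have a fraction are passed through unchanged.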
1886 bool AMDGPULegalizerInfo::legalizeFrint(
1887 MachineInstr &MI, MachineRegisterInfo &MRI,
1888 MachineIRBuilder &B) const {
1889 Register Src = MI.getOperand(1).getReg();
1890 LLT Ty = MRI.getType(Src);
1891 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1892
1893 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1894 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1895
1896 auto C1 = B.buildFConstant(Ty, C1Val);
1897 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1898
1899 // TODO: Should this propagate fast-math-flags?
1900 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1901 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1902
1903 auto C2 = B.buildFConstant(Ty, C2Val);
1904 auto Fabs = B.buildFAbs(Ty, Src);
1905
1906 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1907 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1908 MI.eraseFromParent();
1909 return true;
1910 }
1911
1912 bool AMDGPULegalizerInfo::legalizeFceil(
1913 MachineInstr &MI, MachineRegisterInfo &MRI,
1914 MachineIRBuilder &B) const {
1915
1916 const LLT S1 = LLT::scalar(1);
1917 const LLT S64 = LLT::scalar(64);
1918
1919 Register Src = MI.getOperand(1).getReg();
1920 assert(MRI.getType(Src) == S64);
1921
1922 // result = trunc(src)
1923 // if (src > 0.0 && src != result)
1924 // result += 1.0
1925
1926 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1927
1928 const auto Zero = B.buildFConstant(S64, 0.0);
1929 const auto One = B.buildFConstant(S64, 1.0);
1930 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1931 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1932 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1933 auto Add = B.buildSelect(S64, And, One, Zero);
1934
1935 // TODO: Should this propagate fast-math-flags?
1936 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1937   return true;
1938 }
1939
1940 bool AMDGPULegalizerInfo::legalizeFrem(
1941 MachineInstr &MI, MachineRegisterInfo &MRI,
1942 MachineIRBuilder &B) const {
1943 Register DstReg = MI.getOperand(0).getReg();
1944 Register Src0Reg = MI.getOperand(1).getReg();
1945 Register Src1Reg = MI.getOperand(2).getReg();
1946 auto Flags = MI.getFlags();
1947 LLT Ty = MRI.getType(DstReg);
1948
1949 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1950 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1951 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1952 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1953 MI.eraseFromParent();
1954 return true;
1955 }
1956
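// Extract the unbiased exponent of an f64 given the high 32 bits of its bit
// pattern: pull out the 11 exponent bits with ubfe and subtract the bias
// (1023).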
1957 static MachineInstrBuilder extractF64Exponent(Register Hi,
1958 MachineIRBuilder &B) {
1959 const unsigned FractBits = 52;
1960 const unsigned ExpBits = 11;
1961 LLT S32 = LLT::scalar(32);
1962
1963 auto Const0 = B.buildConstant(S32, FractBits - 32);
1964 auto Const1 = B.buildConstant(S32, ExpBits);
1965
1966 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1967 .addUse(Hi)
1968 .addUse(Const0.getReg(0))
1969 .addUse(Const1.getReg(0));
1970
1971 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1972 }
1973
1974 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1975 MachineInstr &MI, MachineRegisterInfo &MRI,
1976 MachineIRBuilder &B) const {
1977 const LLT S1 = LLT::scalar(1);
1978 const LLT S32 = LLT::scalar(32);
1979 const LLT S64 = LLT::scalar(64);
1980
1981 Register Src = MI.getOperand(1).getReg();
1982 assert(MRI.getType(Src) == S64);
1983
1984 // TODO: Should this use extract since the low half is unused?
1985 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1986 Register Hi = Unmerge.getReg(1);
1987
1988 // Extract the upper half, since this is where we will find the sign and
1989 // exponent.
1990 auto Exp = extractF64Exponent(Hi, B);
1991
1992 const unsigned FractBits = 52;
1993
1994 // Extract the sign bit.
1995 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1996 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1997
1998 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1999
2000 const auto Zero32 = B.buildConstant(S32, 0);
2001
2002 // Extend back to 64-bits.
2003 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2004
2005 auto Shr = B.buildAShr(S64, FractMask, Exp);
2006 auto Not = B.buildNot(S64, Shr);
2007 auto Tmp0 = B.buildAnd(S64, Src, Not);
2008 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2009
2010 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2011 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2012
2013 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2014 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2015 MI.eraseFromParent();
2016 return true;
2017 }
2018
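// Expand 64-bit G_SITOFP/G_UITOFP producing f64 in terms of 32-bit
// conversions. Roughly: result = ldexp((double)hi, 32) + (double)(uint32_t)lo,
// where the high half uses a signed or unsigned convert depending on the
// opcode.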
2019 bool AMDGPULegalizerInfo::legalizeITOFP(
2020 MachineInstr &MI, MachineRegisterInfo &MRI,
2021 MachineIRBuilder &B, bool Signed) const {
2022
2023 Register Dst = MI.getOperand(0).getReg();
2024 Register Src = MI.getOperand(1).getReg();
2025
2026 const LLT S64 = LLT::scalar(64);
2027 const LLT S32 = LLT::scalar(32);
2028
2029 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2030
2031 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2032
2033 auto CvtHi = Signed ?
2034 B.buildSITOFP(S64, Unmerge.getReg(1)) :
2035 B.buildUITOFP(S64, Unmerge.getReg(1));
2036
2037 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2038
2039 auto ThirtyTwo = B.buildConstant(S32, 32);
2040 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2041 .addUse(CvtHi.getReg(0))
2042 .addUse(ThirtyTwo.getReg(0));
2043
2044 // TODO: Should this propagate fast-math-flags?
2045 B.buildFAdd(Dst, LdExp, CvtLo);
2046 MI.eraseFromParent();
2047 return true;
2048 }
2049
2050 // TODO: Copied from DAG implementation. Verify logic and document how this
2051 // actually works.
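// Roughly: the truncated value is split into 32-bit halves as
//   hi = floor(trunc(x) * 2^-32)
//   lo = fma(hi, -2^32, trunc(x))
// with hi converted via fptosi/fptoui and lo always via fptoui, and the two
// halves merged into the 64-bit result.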
2052 bool AMDGPULegalizerInfo::legalizeFPTOI(
2053 MachineInstr &MI, MachineRegisterInfo &MRI,
2054 MachineIRBuilder &B, bool Signed) const {
2055
2056 Register Dst = MI.getOperand(0).getReg();
2057 Register Src = MI.getOperand(1).getReg();
2058
2059 const LLT S64 = LLT::scalar(64);
2060 const LLT S32 = LLT::scalar(32);
2061
2062 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2063
2064 unsigned Flags = MI.getFlags();
2065
2066 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2067 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2068 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2069
2070 auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2071 auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2072 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2073
2074 auto Hi = Signed ?
2075 B.buildFPTOSI(S32, FloorMul) :
2076 B.buildFPTOUI(S32, FloorMul);
2077 auto Lo = B.buildFPTOUI(S32, Fma);
2078
2079 B.buildMerge(Dst, { Lo, Hi });
2080 MI.eraseFromParent();
2081
2082 return true;
2083 }
2084
2085 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2086 MachineInstr &MI) const {
2087 MachineFunction &MF = Helper.MIRBuilder.getMF();
2088 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2089
2090 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2091 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2092
2093 // With ieee_mode disabled, the instructions have the correct behavior
2094 // already for G_FMINNUM/G_FMAXNUM
2095 if (!MFI->getMode().IEEE)
2096 return !IsIEEEOp;
2097
2098 if (IsIEEEOp)
2099 return true;
2100
2101 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2102 }
2103
2104 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2105 MachineInstr &MI, MachineRegisterInfo &MRI,
2106 MachineIRBuilder &B) const {
2107 // TODO: Should move some of this into LegalizerHelper.
2108
2109 // TODO: Promote dynamic indexing of s16 to s32
2110
2111 // FIXME: Artifact combiner probably should have replaced the truncated
2112 // constant before this, so we shouldn't need
2113 // getConstantVRegValWithLookThrough.
2114 Optional<ValueAndVReg> MaybeIdxVal =
2115 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2116 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2117 return true;
2118 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2119
2120 Register Dst = MI.getOperand(0).getReg();
2121 Register Vec = MI.getOperand(1).getReg();
2122
2123 LLT VecTy = MRI.getType(Vec);
2124 LLT EltTy = VecTy.getElementType();
2125 assert(EltTy == MRI.getType(Dst));
2126
2127 if (IdxVal < VecTy.getNumElements())
2128 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2129 else
2130 B.buildUndef(Dst);
2131
2132 MI.eraseFromParent();
2133 return true;
2134 }
2135
2136 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2137 MachineInstr &MI, MachineRegisterInfo &MRI,
2138 MachineIRBuilder &B) const {
2139 // TODO: Should move some of this into LegalizerHelper.
2140
2141 // TODO: Promote dynamic indexing of s16 to s32
2142
2143 // FIXME: Artifact combiner probably should have replaced the truncated
2144 // constant before this, so we shouldn't need
2145 // getConstantVRegValWithLookThrough.
2146 Optional<ValueAndVReg> MaybeIdxVal =
2147 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2148 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2149 return true;
2150
2151 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2152 Register Dst = MI.getOperand(0).getReg();
2153 Register Vec = MI.getOperand(1).getReg();
2154 Register Ins = MI.getOperand(2).getReg();
2155
2156 LLT VecTy = MRI.getType(Vec);
2157 LLT EltTy = VecTy.getElementType();
2158 assert(EltTy == MRI.getType(Ins));
2159
2160 if (IdxVal < VecTy.getNumElements())
2161 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2162 else
2163 B.buildUndef(Dst);
2164
2165 MI.eraseFromParent();
2166 return true;
2167 }
2168
2169 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2170 MachineInstr &MI, MachineRegisterInfo &MRI,
2171 MachineIRBuilder &B) const {
2172 const LLT V2S16 = LLT::vector(2, 16);
2173
2174 Register Dst = MI.getOperand(0).getReg();
2175 Register Src0 = MI.getOperand(1).getReg();
2176 LLT DstTy = MRI.getType(Dst);
2177 LLT SrcTy = MRI.getType(Src0);
2178
2179 if (SrcTy == V2S16 && DstTy == V2S16 &&
2180 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2181 return true;
2182
2183 MachineIRBuilder HelperBuilder(MI);
2184 GISelObserverWrapper DummyObserver;
2185 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2186 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2187 }
2188
2189 bool AMDGPULegalizerInfo::legalizeSinCos(
2190 MachineInstr &MI, MachineRegisterInfo &MRI,
2191 MachineIRBuilder &B) const {
2192
2193 Register DstReg = MI.getOperand(0).getReg();
2194 Register SrcReg = MI.getOperand(1).getReg();
2195 LLT Ty = MRI.getType(DstReg);
2196 unsigned Flags = MI.getFlags();
2197
2198 Register TrigVal;
2199 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2200 if (ST.hasTrigReducedRange()) {
2201 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2202 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2203 .addUse(MulVal.getReg(0))
2204 .setMIFlags(Flags).getReg(0);
2205 } else
2206 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2207
2208 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2209 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2210 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2211 .addUse(TrigVal)
2212 .setMIFlags(Flags);
2213 MI.eraseFromParent();
2214 return true;
2215 }
2216
2217 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2218 MachineIRBuilder &B,
2219 const GlobalValue *GV,
2220 int64_t Offset,
2221 unsigned GAFlags) const {
2222 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2223 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2224 // to the following code sequence:
2225 //
2226 // For constant address space:
2227 // s_getpc_b64 s[0:1]
2228 // s_add_u32 s0, s0, $symbol
2229 // s_addc_u32 s1, s1, 0
2230 //
2231 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2232 // a fixup or relocation is emitted to replace $symbol with a literal
2233 // constant, which is a pc-relative offset from the encoding of the $symbol
2234 // operand to the global variable.
2235 //
2236 // For global address space:
2237 // s_getpc_b64 s[0:1]
2238 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2239 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2240 //
2241 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2242 // fixups or relocations are emitted to replace $symbol@*@lo and
2243 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2244 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2245 // operand to the global variable.
2246 //
2247 // What we want here is an offset from the value returned by s_getpc
2248 // (which is the address of the s_add_u32 instruction) to the global
2249 // variable, but since the encoding of $symbol starts 4 bytes after the start
2250 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2251 // small. This requires us to add 4 to the global variable offset in order to
2252 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2253 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2254 // instruction.
2255
2256 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2257
2258 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2259 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2260
2261 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2262 .addDef(PCReg);
2263
2264 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2265 if (GAFlags == SIInstrInfo::MO_NONE)
2266 MIB.addImm(0);
2267 else
2268 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2269
2270 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2271
2272 if (PtrTy.getSizeInBits() == 32)
2273 B.buildExtract(DstReg, PCReg, 0);
2274 return true;
2275 }
2276
2277 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2278 MachineInstr &MI, MachineRegisterInfo &MRI,
2279 MachineIRBuilder &B) const {
2280 Register DstReg = MI.getOperand(0).getReg();
2281 LLT Ty = MRI.getType(DstReg);
2282 unsigned AS = Ty.getAddressSpace();
2283
2284 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2285 MachineFunction &MF = B.getMF();
2286 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2287
2288 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2289 if (!MFI->isModuleEntryFunction() &&
2290 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2291 const Function &Fn = MF.getFunction();
2292 DiagnosticInfoUnsupported BadLDSDecl(
2293 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2294 DS_Warning);
2295 Fn.getContext().diagnose(BadLDSDecl);
2296
2297 // We currently don't have a way to correctly allocate LDS objects that
2298 // aren't directly associated with a kernel. We do force inlining of
2299 // functions that use local objects. However, if these dead functions are
2300 // not eliminated, we don't want a compile time error. Just emit a warning
2301 // and a trap, since there should be no callable path here.
2302 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2303 B.buildUndef(DstReg);
2304 MI.eraseFromParent();
2305 return true;
2306 }
2307
2308 // TODO: We could emit code to handle the initialization somewhere.
2309 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2310 const SITargetLowering *TLI = ST.getTargetLowering();
2311 if (!TLI->shouldUseLDSConstAddress(GV)) {
2312 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2313 return true; // Leave in place;
2314 }
2315
2316 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2317 Type *Ty = GV->getValueType();
2318 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2319 // zero-sized type in other languages to declare the dynamic shared
2320       // memory whose size is not known at compile time. These arrays are
2321       // allocated by the runtime and placed directly after the statically
2322       // allocated ones. They all share the same offset.
2323 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2324 // Adjust alignment for that dynamic shared memory array.
2325 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2326 LLT S32 = LLT::scalar(32);
2327 auto Sz =
2328 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2329 B.buildIntToPtr(DstReg, Sz);
2330 MI.eraseFromParent();
2331 return true;
2332 }
2333 }
2334
2335 B.buildConstant(
2336 DstReg,
2337 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2338 MI.eraseFromParent();
2339 return true;
2340 }
2341
2342 const Function &Fn = MF.getFunction();
2343 DiagnosticInfoUnsupported BadInit(
2344 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2345 Fn.getContext().diagnose(BadInit);
2346 return true;
2347 }
2348
2349 const SITargetLowering *TLI = ST.getTargetLowering();
2350
2351 if (TLI->shouldEmitFixup(GV)) {
2352 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2353 MI.eraseFromParent();
2354 return true;
2355 }
2356
2357 if (TLI->shouldEmitPCReloc(GV)) {
2358 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2359 MI.eraseFromParent();
2360 return true;
2361 }
2362
2363 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2364 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2365
2366 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2367 MachinePointerInfo::getGOT(MF),
2368 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2369 MachineMemOperand::MOInvariant,
2370 8 /*Size*/, Align(8));
2371
2372 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2373
2374 if (Ty.getSizeInBits() == 32) {
2375     // Truncate if this is a 32-bit constant address.
2376 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2377 B.buildExtract(DstReg, Load, 0);
2378 } else
2379 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2380
2381 MI.eraseFromParent();
2382 return true;
2383 }
2384
2385 static LLT widenToNextPowerOf2(LLT Ty) {
2386 if (Ty.isVector())
2387 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2388 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2389 }
2390
2391 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2392 MachineInstr &MI) const {
2393 MachineIRBuilder &B = Helper.MIRBuilder;
2394 MachineRegisterInfo &MRI = *B.getMRI();
2395 GISelChangeObserver &Observer = Helper.Observer;
2396
2397 Register PtrReg = MI.getOperand(1).getReg();
2398 LLT PtrTy = MRI.getType(PtrReg);
2399 unsigned AddrSpace = PtrTy.getAddressSpace();
2400
2401 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2402 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2403 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2404 Observer.changingInstr(MI);
2405 MI.getOperand(1).setReg(Cast.getReg(0));
2406 Observer.changedInstr(MI);
2407 return true;
2408 }
2409
2410 Register ValReg = MI.getOperand(0).getReg();
2411 LLT ValTy = MRI.getType(ValReg);
2412
2413 MachineMemOperand *MMO = *MI.memoperands_begin();
2414 const unsigned ValSize = ValTy.getSizeInBits();
2415 const unsigned MemSize = 8 * MMO->getSize();
2416 const Align MemAlign = MMO->getAlign();
2417 const unsigned AlignInBits = 8 * MemAlign.value();
2418
2419 // Widen non-power-of-2 loads to the alignment if needed
2420 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2421 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2422
2423 // This was already the correct extending load result type, so just adjust
2424 // the memory type.
2425 if (WideMemSize == ValSize) {
2426 MachineFunction &MF = B.getMF();
2427
2428 MachineMemOperand *WideMMO =
2429 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2430 Observer.changingInstr(MI);
2431 MI.setMemRefs(MF, {WideMMO});
2432 Observer.changedInstr(MI);
2433 return true;
2434 }
2435
2436 // Don't bother handling edge case that should probably never be produced.
2437 if (ValSize > WideMemSize)
2438 return false;
2439
2440 LLT WideTy = widenToNextPowerOf2(ValTy);
2441
2442 Register WideLoad;
2443 if (!WideTy.isVector()) {
2444 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2445 B.buildTrunc(ValReg, WideLoad).getReg(0);
2446 } else {
2447 // Extract the subvector.
2448
2449 if (isRegisterType(ValTy)) {
2450 // If this a case where G_EXTRACT is legal, use it.
2451 // (e.g. <3 x s32> -> <4 x s32>)
2452 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2453 B.buildExtract(ValReg, WideLoad, 0);
2454 } else {
2455 // For cases where the widened type isn't a nice register value, unmerge
2456 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2457 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2458 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2459 B.setInsertPt(B.getMBB(), MI.getIterator());
2460 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2461 }
2462 }
2463
2464 MI.eraseFromParent();
2465 return true;
2466 }
2467
2468 return false;
2469 }
2470
2471 bool AMDGPULegalizerInfo::legalizeFMad(
2472 MachineInstr &MI, MachineRegisterInfo &MRI,
2473 MachineIRBuilder &B) const {
2474 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2475 assert(Ty.isScalar());
2476
2477 MachineFunction &MF = B.getMF();
2478 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2479
2480 // TODO: Always legal with future ftz flag.
2481 // FIXME: Do we need just output?
2482 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2483 return true;
2484 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2485 return true;
2486
2487 MachineIRBuilder HelperBuilder(MI);
2488 GISelObserverWrapper DummyObserver;
2489 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2490 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2491 }
2492
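// Lower G_ATOMIC_CMPXCHG on flat/global pointers to the target node
// G_AMDGPU_ATOMIC_CMPXCHG, which takes the new value and the compare value
// packed together as a 2-element vector operand.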
2493 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2494 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2495 Register DstReg = MI.getOperand(0).getReg();
2496 Register PtrReg = MI.getOperand(1).getReg();
2497 Register CmpVal = MI.getOperand(2).getReg();
2498 Register NewVal = MI.getOperand(3).getReg();
2499
2500 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2501 "this should not have been custom lowered");
2502
2503 LLT ValTy = MRI.getType(CmpVal);
2504 LLT VecTy = LLT::vector(2, ValTy);
2505
2506 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2507
2508 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2509 .addDef(DstReg)
2510 .addUse(PtrReg)
2511 .addUse(PackedVal)
2512 .setMemRefs(MI.memoperands());
2513
2514 MI.eraseFromParent();
2515 return true;
2516 }
2517
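// Expand G_FLOG/G_FLOG10 as log_b(x) = log2(x) * (1 / log2(b)); the callers
// pass ln(2) for G_FLOG and ln(2)/ln(10) for G_FLOG10.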
2518 bool AMDGPULegalizerInfo::legalizeFlog(
2519 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2520 Register Dst = MI.getOperand(0).getReg();
2521 Register Src = MI.getOperand(1).getReg();
2522 LLT Ty = B.getMRI()->getType(Dst);
2523 unsigned Flags = MI.getFlags();
2524
2525 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2526 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2527
2528 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2529 MI.eraseFromParent();
2530 return true;
2531 }
2532
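// Expand G_FEXP as exp(x) = exp2(x * log2(e)).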
2533 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2534 MachineIRBuilder &B) const {
2535 Register Dst = MI.getOperand(0).getReg();
2536 Register Src = MI.getOperand(1).getReg();
2537 unsigned Flags = MI.getFlags();
2538 LLT Ty = B.getMRI()->getType(Dst);
2539
2540 auto K = B.buildFConstant(Ty, numbers::log2e);
2541 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2542 B.buildFExp2(Dst, Mul, Flags);
2543 MI.eraseFromParent();
2544 return true;
2545 }
2546
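// Expand G_FPOW as pow(x, y) = exp2(y * log2(x)), using the fmul_legacy
// intrinsic for the multiply; f16 is extended to f32 around the multiply
// since there is no f16 fmul_legacy.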
2547 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2548 MachineIRBuilder &B) const {
2549 Register Dst = MI.getOperand(0).getReg();
2550 Register Src0 = MI.getOperand(1).getReg();
2551 Register Src1 = MI.getOperand(2).getReg();
2552 unsigned Flags = MI.getFlags();
2553 LLT Ty = B.getMRI()->getType(Dst);
2554 const LLT S16 = LLT::scalar(16);
2555 const LLT S32 = LLT::scalar(32);
2556
2557 if (Ty == S32) {
2558 auto Log = B.buildFLog2(S32, Src0, Flags);
2559 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2560 .addUse(Log.getReg(0))
2561 .addUse(Src1)
2562 .setMIFlags(Flags);
2563 B.buildFExp2(Dst, Mul, Flags);
2564 } else if (Ty == S16) {
2565 // There's no f16 fmul_legacy, so we need to convert for it.
2566 auto Log = B.buildFLog2(S16, Src0, Flags);
2567 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2568 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2569 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2570 .addUse(Ext0.getReg(0))
2571 .addUse(Ext1.getReg(0))
2572 .setMIFlags(Flags);
2573
2574 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2575 } else
2576 return false;
2577
2578 MI.eraseFromParent();
2579 return true;
2580 }
2581
2582 // Find a source register, ignoring any possible source modifiers.
2583 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2584 Register ModSrc = OrigSrc;
2585 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2586 ModSrc = SrcFNeg->getOperand(1).getReg();
2587 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2588 ModSrc = SrcFAbs->getOperand(1).getReg();
2589 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2590 ModSrc = SrcFAbs->getOperand(1).getReg();
2591 return ModSrc;
2592 }
2593
2594 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2595 MachineRegisterInfo &MRI,
2596 MachineIRBuilder &B) const {
2597
2598 const LLT S1 = LLT::scalar(1);
2599 const LLT S64 = LLT::scalar(64);
2600 Register Dst = MI.getOperand(0).getReg();
2601 Register OrigSrc = MI.getOperand(1).getReg();
2602 unsigned Flags = MI.getFlags();
2603 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2604 "this should not have been custom lowered");
2605
2606 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2607 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2608 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2609 // V_FRACT bug is:
2610 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2611 //
2612 // Convert floor(x) to (x - fract(x))
2613
2614 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2615 .addUse(OrigSrc)
2616 .setMIFlags(Flags);
2617
2618 // Give source modifier matching some assistance before obscuring a foldable
2619 // pattern.
2620
2621 // TODO: We can avoid the neg on the fract? The input sign to fract
2622 // shouldn't matter?
2623 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2624
2625 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2626
2627 Register Min = MRI.createGenericVirtualRegister(S64);
2628
2629 // We don't need to concern ourselves with the snan handling difference, so
2630 // use the one which will directly select.
2631 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2632 if (MFI->getMode().IEEE)
2633 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2634 else
2635 B.buildFMinNum(Min, Fract, Const, Flags);
2636
2637 Register CorrectedFract = Min;
2638 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2639 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2640 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2641 }
2642
2643 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2644 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2645
2646 MI.eraseFromParent();
2647 return true;
2648 }
2649
2650 // Turn an illegal packed v2s16 build vector into bit operations.
2651 // TODO: This should probably be a bitcast action in LegalizerHelper.
2652 bool AMDGPULegalizerInfo::legalizeBuildVector(
2653 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2654 Register Dst = MI.getOperand(0).getReg();
2655 const LLT S32 = LLT::scalar(32);
2656 assert(MRI.getType(Dst) == LLT::vector(2, 16));
2657
2658 Register Src0 = MI.getOperand(1).getReg();
2659 Register Src1 = MI.getOperand(2).getReg();
2660 assert(MRI.getType(Src0) == LLT::scalar(16));
2661
2662 auto Merge = B.buildMerge(S32, {Src0, Src1});
2663 B.buildBitcast(Dst, Merge);
2664
2665 MI.eraseFromParent();
2666 return true;
2667 }
2668
2669 // Check that this is a G_XOR x, -1
2670 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2671 if (MI.getOpcode() != TargetOpcode::G_XOR)
2672 return false;
2673 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2674 return ConstVal && *ConstVal == -1;
2675 }
2676
2677 // Return the use branch instruction, or null if the usage is invalid.
2678 static MachineInstr *
2679 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2680 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2681 Register CondDef = MI.getOperand(0).getReg();
2682 if (!MRI.hasOneNonDBGUse(CondDef))
2683 return nullptr;
2684
2685 MachineBasicBlock *Parent = MI.getParent();
2686 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2687
2688 if (isNot(MRI, *UseMI)) {
2689 Register NegatedCond = UseMI->getOperand(0).getReg();
2690 if (!MRI.hasOneNonDBGUse(NegatedCond))
2691 return nullptr;
2692
2693 // We're deleting the def of this value, so we need to remove it.
2694 UseMI->eraseFromParent();
2695
2696 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2697 Negated = true;
2698 }
2699
2700 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2701 return nullptr;
2702
2703 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2704 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2705 if (Next == Parent->end()) {
2706 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2707 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2708 return nullptr;
2709 UncondBrTarget = &*NextMBB;
2710 } else {
2711 if (Next->getOpcode() != AMDGPU::G_BR)
2712 return nullptr;
2713 Br = &*Next;
2714 UncondBrTarget = Br->getOperand(0).getMBB();
2715 }
2716
2717 return UseMI;
2718 }
2719
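// Copy a preloaded argument from its incoming physical register into DstReg.
// If the argument is masked (packed with other values in one register), shift
// the requested field down and mask it out first.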
2720 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2721 const ArgDescriptor *Arg,
2722 const TargetRegisterClass *ArgRC,
2723 LLT ArgTy) const {
2724 MCRegister SrcReg = Arg->getRegister();
2725 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2726 assert(DstReg.isVirtual() && "Virtual register expected");
2727
2728 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2729 ArgTy);
2730 if (Arg->isMasked()) {
2731 // TODO: Should we try to emit this once in the entry block?
2732 const LLT S32 = LLT::scalar(32);
2733 const unsigned Mask = Arg->getMask();
2734 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2735
2736 Register AndMaskSrc = LiveIn;
2737
2738 if (Shift != 0) {
2739 auto ShiftAmt = B.buildConstant(S32, Shift);
2740 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2741 }
2742
2743 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2744 } else {
2745 B.buildCopy(DstReg, LiveIn);
2746 }
2747
2748 return true;
2749 }
2750
2751 bool AMDGPULegalizerInfo::loadInputValue(
2752 Register DstReg, MachineIRBuilder &B,
2753 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2754 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2755 const ArgDescriptor *Arg;
2756 const TargetRegisterClass *ArgRC;
2757 LLT ArgTy;
2758 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2759
2760 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2761 return false; // TODO: Handle these
2762 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2763 }
2764
2765 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2766 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2767 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2768 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2769 return false;
2770
2771 MI.eraseFromParent();
2772 return true;
2773 }
2774
2775 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2776 MachineRegisterInfo &MRI,
2777 MachineIRBuilder &B) const {
2778 Register Dst = MI.getOperand(0).getReg();
2779 LLT DstTy = MRI.getType(Dst);
2780 LLT S16 = LLT::scalar(16);
2781 LLT S32 = LLT::scalar(32);
2782 LLT S64 = LLT::scalar(64);
2783
2784 if (DstTy == S16)
2785 return legalizeFDIV16(MI, MRI, B);
2786 if (DstTy == S32)
2787 return legalizeFDIV32(MI, MRI, B);
2788 if (DstTy == S64)
2789 return legalizeFDIV64(MI, MRI, B);
2790
2791 return false;
2792 }
2793
2794 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2795 Register DstReg,
2796 Register X,
2797 Register Y,
2798 bool IsDiv) const {
2799 const LLT S1 = LLT::scalar(1);
2800 const LLT S32 = LLT::scalar(32);
2801
2802 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2803 // algorithm used here.
2804
2805 // Initial estimate of inv(y).
2806 auto FloatY = B.buildUITOFP(S32, Y);
2807 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2808 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2809 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2810 auto Z = B.buildFPTOUI(S32, ScaledY);
2811
2812   // One round of UNR (unsigned Newton-Raphson) refinement.
2813 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2814 auto NegYZ = B.buildMul(S32, NegY, Z);
2815 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2816
2817 // Quotient/remainder estimate.
2818 auto Q = B.buildUMulH(S32, X, Z);
2819 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2820
2821 // First quotient/remainder refinement.
2822 auto One = B.buildConstant(S32, 1);
2823 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2824 if (IsDiv)
2825 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2826 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2827
2828 // Second quotient/remainder refinement.
2829 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2830 if (IsDiv)
2831 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2832 else
2833 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2834 }
2835
2836 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2837 //
2838 // Return lo, hi of result
2839 //
2840 // %cvt.lo = G_UITOFP Val.lo
2841 // %cvt.hi = G_UITOFP Val.hi
2842 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2843 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2844 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2845 // %mul2 = G_FMUL %mul1, 2**(-32)
2846 // %trunc = G_INTRINSIC_TRUNC %mul2
2847 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2848 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2849 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2850 Register Val) {
2851 const LLT S32 = LLT::scalar(32);
2852 auto Unmerge = B.buildUnmerge(S32, Val);
2853
2854 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2855 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2856
2857 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2858 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2859
2860 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2861 auto Mul1 =
2862 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2863
2864 // 2**(-32)
2865 auto Mul2 =
2866 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2867 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2868
2869 // -(2**32)
2870 auto Mad2 = B.buildFMAD(S32, Trunc,
2871 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2872
2873 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2874 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2875
2876 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2877 }
2878
2879 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2880 Register DstReg,
2881 Register Numer,
2882 Register Denom,
2883 bool IsDiv) const {
2884 const LLT S32 = LLT::scalar(32);
2885 const LLT S64 = LLT::scalar(64);
2886 const LLT S1 = LLT::scalar(1);
2887 Register RcpLo, RcpHi;
2888
2889 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2890
2891 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2892
2893 auto Zero64 = B.buildConstant(S64, 0);
2894 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2895
2896 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2897 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2898
2899 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2900 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2901 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2902
2903 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2904 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2905 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2906 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2907
2908 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2909 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2910 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2911 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2912 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2913
2914 auto Zero32 = B.buildConstant(S32, 0);
2915 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2916 auto Add2_HiC =
2917 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2918 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2919 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2920
2921 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2922 Register NumerLo = UnmergeNumer.getReg(0);
2923 Register NumerHi = UnmergeNumer.getReg(1);
2924
2925 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2926 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2927 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2928 Register Mul3_Lo = UnmergeMul3.getReg(0);
2929 Register Mul3_Hi = UnmergeMul3.getReg(1);
2930 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2931 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2932 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2933 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2934
2935 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2936 Register DenomLo = UnmergeDenom.getReg(0);
2937 Register DenomHi = UnmergeDenom.getReg(1);
2938
2939 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2940 auto C1 = B.buildSExt(S32, CmpHi);
2941
2942 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2943 auto C2 = B.buildSExt(S32, CmpLo);
2944
2945 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2946 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2947
2948 // TODO: Here and below portions of the code can be enclosed into if/endif.
2949 // Currently control flow is unconditional and we have 4 selects after
2950 // potential endif to substitute PHIs.
2951
2952 // if C3 != 0 ...
2953 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2954 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2955 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2956 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2957
2958 auto One64 = B.buildConstant(S64, 1);
2959 auto Add3 = B.buildAdd(S64, MulHi3, One64);
2960
2961 auto C4 =
2962 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2963 auto C5 =
2964 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2965 auto C6 = B.buildSelect(
2966 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2967
2968 // if (C6 != 0)
2969 auto Add4 = B.buildAdd(S64, Add3, One64);
2970 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2971
2972 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2973 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2974 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2975
2976 // endif C6
2977 // endif C3
2978
2979 if (IsDiv) {
2980 auto Sel1 = B.buildSelect(
2981 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2982 B.buildSelect(DstReg,
2983 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2984 } else {
2985 auto Sel2 = B.buildSelect(
2986 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2987 B.buildSelect(DstReg,
2988 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2989 }
2990 }
2991
2992 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2993 MachineRegisterInfo &MRI,
2994 MachineIRBuilder &B) const {
2995 const LLT S64 = LLT::scalar(64);
2996 const LLT S32 = LLT::scalar(32);
2997 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2998 Register DstReg = MI.getOperand(0).getReg();
2999 Register Num = MI.getOperand(1).getReg();
3000 Register Den = MI.getOperand(2).getReg();
3001 LLT Ty = MRI.getType(DstReg);
3002
3003 if (Ty == S32)
3004 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3005 else if (Ty == S64)
3006 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3007 else
3008 return false;
3009
3010 MI.eraseFromParent();
3011 return true;
3012
3013 }
3014
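// Lower signed division/remainder in terms of the unsigned expansion:
// conditionally negate the operands to their absolute values (add the
// broadcast sign bit, then xor with it), do the unsigned div/rem, then apply
// the result sign the same way (quotient: sign(lhs) ^ sign(rhs); remainder:
// sign(lhs)).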
3015 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3016 MachineRegisterInfo &MRI,
3017 MachineIRBuilder &B) const {
3018 const LLT S64 = LLT::scalar(64);
3019 const LLT S32 = LLT::scalar(32);
3020
3021 Register DstReg = MI.getOperand(0).getReg();
3022 const LLT Ty = MRI.getType(DstReg);
3023 if (Ty != S32 && Ty != S64)
3024 return false;
3025
3026 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3027
3028 Register LHS = MI.getOperand(1).getReg();
3029 Register RHS = MI.getOperand(2).getReg();
3030
3031 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3032 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3033 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3034
3035 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3036 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3037
3038 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3039 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3040
3041 Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3042 if (Ty == S32)
3043 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3044 else
3045 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3046
3047 Register Sign;
3048 if (IsDiv)
3049 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3050 else
3051 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3052
3053 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3054 B.buildSub(DstReg, UDivRem, Sign);
3055
3056 MI.eraseFromParent();
3057 return true;
3058 }
3059
3060 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3061 MachineRegisterInfo &MRI,
3062 MachineIRBuilder &B) const {
3063 Register Res = MI.getOperand(0).getReg();
3064 Register LHS = MI.getOperand(1).getReg();
3065 Register RHS = MI.getOperand(2).getReg();
3066 uint16_t Flags = MI.getFlags();
3067 LLT ResTy = MRI.getType(Res);
3068
3069 const MachineFunction &MF = B.getMF();
3070 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3071 MI.getFlag(MachineInstr::FmAfn);
3072
3073 if (!AllowInaccurateRcp)
3074 return false;
3075
3076 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3077 // 1 / x -> RCP(x)
3078 if (CLHS->isExactlyValue(1.0)) {
3079 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3080 .addUse(RHS)
3081 .setMIFlags(Flags);
3082
3083 MI.eraseFromParent();
3084 return true;
3085 }
3086
3087 // -1 / x -> RCP( FNEG(x) )
3088 if (CLHS->isExactlyValue(-1.0)) {
3089 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3090 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3091 .addUse(FNeg.getReg(0))
3092 .setMIFlags(Flags);
3093
3094 MI.eraseFromParent();
3095 return true;
3096 }
3097 }
3098
3099 // x / y -> x * (1.0 / y)
3100 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3101 .addUse(RHS)
3102 .setMIFlags(Flags);
3103 B.buildFMul(Res, LHS, RCP, Flags);
3104
3105 MI.eraseFromParent();
3106 return true;
3107 }
3108
3109 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3110 MachineRegisterInfo &MRI,
3111 MachineIRBuilder &B) const {
3112 Register Res = MI.getOperand(0).getReg();
3113 Register X = MI.getOperand(1).getReg();
3114 Register Y = MI.getOperand(2).getReg();
3115 uint16_t Flags = MI.getFlags();
3116 LLT ResTy = MRI.getType(Res);
3117
3118 const MachineFunction &MF = B.getMF();
3119 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3120 MI.getFlag(MachineInstr::FmAfn);
3121
3122 if (!AllowInaccurateRcp)
3123 return false;
3124
3125 auto NegY = B.buildFNeg(ResTy, Y);
3126 auto One = B.buildFConstant(ResTy, 1.0);
3127
3128 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3129 .addUse(Y)
3130 .setMIFlags(Flags);
3131
3132 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3133 R = B.buildFMA(ResTy, Tmp0, R, R);
3134
3135 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3136 R = B.buildFMA(ResTy, Tmp1, R, R);
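  // Each fma pair above is one Newton-Raphson step for 1/Y:
  //   e = 1 - Y*R,  R' = R + e*R
  // which roughly doubles the number of correct bits, so two steps refine the
  // hardware rcp estimate toward double precision. The final fma below applies
  // the same style of correction to the quotient X*R.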
3137
3138 auto Ret = B.buildFMul(ResTy, X, R);
3139 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3140
3141 B.buildFMA(Res, Tmp2, R, Ret);
3142 MI.eraseFromParent();
3143 return true;
3144 }
3145
3146 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3147 MachineRegisterInfo &MRI,
3148 MachineIRBuilder &B) const {
3149 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3150 return true;
3151
3152 Register Res = MI.getOperand(0).getReg();
3153 Register LHS = MI.getOperand(1).getReg();
3154 Register RHS = MI.getOperand(2).getReg();
3155
3156 uint16_t Flags = MI.getFlags();
3157
3158 LLT S16 = LLT::scalar(16);
3159 LLT S32 = LLT::scalar(32);
3160
3161 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3162 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3163
3164 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3165 .addUse(RHSExt.getReg(0))
3166 .setMIFlags(Flags);
3167
3168 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3169 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3170
3171 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3172 .addUse(RDst.getReg(0))
3173 .addUse(RHS)
3174 .addUse(LHS)
3175 .setMIFlags(Flags);
3176
3177 MI.eraseFromParent();
3178 return true;
3179 }
3180
3181 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3182 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3183 static void toggleSPDenormMode(bool Enable,
3184 MachineIRBuilder &B,
3185 const GCNSubtarget &ST,
3186 AMDGPU::SIModeRegisterDefaults Mode) {
3187 // Set SP denorm mode to this value.
3188 unsigned SPDenormMode =
3189 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3190
3191 if (ST.hasDenormModeInst()) {
3192 // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3193 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3194
3195 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3196 B.buildInstr(AMDGPU::S_DENORM_MODE)
3197 .addImm(NewDenormModeValue);
3198
3199 } else {
3200 // Select FP32 bit field in mode register.
3201 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3202 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3203 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
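    // This encodes hwreg(HW_REG_MODE, 4, 2): offset 4 with width 2 (WIDTH_M1
    // stores width - 1), i.e. the FP32 denorm control bits of the MODE register.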
3204
3205 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3206 .addImm(SPDenormMode)
3207 .addImm(SPDenormModeBitField);
3208 }
3209 }
3210
3211 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3212 MachineRegisterInfo &MRI,
3213 MachineIRBuilder &B) const {
3214 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3215 return true;
3216
3217 Register Res = MI.getOperand(0).getReg();
3218 Register LHS = MI.getOperand(1).getReg();
3219 Register RHS = MI.getOperand(2).getReg();
3220 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3221 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3222
3223 uint16_t Flags = MI.getFlags();
3224
3225 LLT S32 = LLT::scalar(32);
3226 LLT S1 = LLT::scalar(1);
3227
3228 auto One = B.buildFConstant(S32, 1.0f);
3229
3230 auto DenominatorScaled =
3231 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3232 .addUse(LHS)
3233 .addUse(RHS)
3234 .addImm(0)
3235 .setMIFlags(Flags);
3236 auto NumeratorScaled =
3237 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3238 .addUse(LHS)
3239 .addUse(RHS)
3240 .addImm(1)
3241 .setMIFlags(Flags);
3242
3243 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3244 .addUse(DenominatorScaled.getReg(0))
3245 .setMIFlags(Flags);
3246 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3247
3248 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3249 // aren't modeled as reading it.
3250 if (!Mode.allFP32Denormals())
3251 toggleSPDenormMode(true, B, ST, Mode);
3252
3253 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3254 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3255 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3256 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3257 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3258 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
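  // Fma0/Fma1 are a Newton-Raphson step on the reciprocal of the scaled
  // denominator; Mul through Fma4 refine the scaled quotient. The div_fmas and
  // div_fixup intrinsics below then undo the scaling and patch up the
  // infinity/NaN/denormal special cases.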
3259
3260 if (!Mode.allFP32Denormals())
3261 toggleSPDenormMode(false, B, ST, Mode);
3262
3263 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3264 .addUse(Fma4.getReg(0))
3265 .addUse(Fma1.getReg(0))
3266 .addUse(Fma3.getReg(0))
3267 .addUse(NumeratorScaled.getReg(1))
3268 .setMIFlags(Flags);
3269
3270 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3271 .addUse(Fmas.getReg(0))
3272 .addUse(RHS)
3273 .addUse(LHS)
3274 .setMIFlags(Flags);
3275
3276 MI.eraseFromParent();
3277 return true;
3278 }
3279
3280 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3281 MachineRegisterInfo &MRI,
3282 MachineIRBuilder &B) const {
3283 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3284 return true;
3285
3286 Register Res = MI.getOperand(0).getReg();
3287 Register LHS = MI.getOperand(1).getReg();
3288 Register RHS = MI.getOperand(2).getReg();
3289
3290 uint16_t Flags = MI.getFlags();
3291
3292 LLT S64 = LLT::scalar(64);
3293 LLT S1 = LLT::scalar(1);
3294
3295 auto One = B.buildFConstant(S64, 1.0);
3296
3297 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3298 .addUse(LHS)
3299 .addUse(RHS)
3300 .addImm(0)
3301 .setMIFlags(Flags);
3302
3303 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3304
3305 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3306 .addUse(DivScale0.getReg(0))
3307 .setMIFlags(Flags);
3308
3309 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3310 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3311 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3312
3313 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3314 .addUse(LHS)
3315 .addUse(RHS)
3316 .addImm(1)
3317 .setMIFlags(Flags);
3318
3319 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3320 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3321 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3322
3323 Register Scale;
3324 if (!ST.hasUsableDivScaleConditionOutput()) {
3325 // Workaround a hardware bug on SI where the condition output from div_scale
3326 // is not usable.
3327
3328 LLT S32 = LLT::scalar(32);
3329
3330 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3331 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3332 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3333 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3334
3335 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3336 Scale1Unmerge.getReg(1));
3337 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3338 Scale0Unmerge.getReg(1));
3339 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3340 } else {
3341 Scale = DivScale1.getReg(1);
3342 }
3343
3344 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3345 .addUse(Fma4.getReg(0))
3346 .addUse(Fma3.getReg(0))
3347 .addUse(Mul.getReg(0))
3348 .addUse(Scale)
3349 .setMIFlags(Flags);
3350
3351 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3352 .addUse(Fmas.getReg(0))
3353 .addUse(RHS)
3354 .addUse(LHS)
3355 .setMIFlags(Flags);
3356
3357 MI.eraseFromParent();
3358 return true;
3359 }
3360
3361 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3362 MachineRegisterInfo &MRI,
3363 MachineIRBuilder &B) const {
3364 Register Res = MI.getOperand(0).getReg();
3365 Register LHS = MI.getOperand(2).getReg();
3366 Register RHS = MI.getOperand(3).getReg();
3367 uint16_t Flags = MI.getFlags();
3368
3369 LLT S32 = LLT::scalar(32);
3370 LLT S1 = LLT::scalar(1);
3371
3372 auto Abs = B.buildFAbs(S32, RHS, Flags);
3373 const APFloat C0Val(1.0f);
3374
3375 auto C0 = B.buildConstant(S32, 0x6f800000);
3376 auto C1 = B.buildConstant(S32, 0x2f800000);
3377 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
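  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as IEEE-754 single precision.
  // When |RHS| > 2^96 the denominator is pre-scaled by 2^-32 so the rcp stays
  // in range; the final multiply by Sel below cancels that scaling out of the
  // quotient, and Sel is 1.0 when no scaling was needed.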
3378
3379 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3380 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3381
3382 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3383
3384 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3385 .addUse(Mul0.getReg(0))
3386 .setMIFlags(Flags);
3387
3388 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3389
3390 B.buildFMul(Res, Sel, Mul1, Flags);
3391
3392 MI.eraseFromParent();
3393 return true;
3394 }
3395
3396 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3397 // FIXME: Why do we handle this one but not other removed instructions?
3398 //
3399 // Reciprocal square root. The clamp prevents infinite results, clamping
3400 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3401 // +-max_float.
3402 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3403 MachineRegisterInfo &MRI,
3404 MachineIRBuilder &B) const {
3405 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3406 return true;
3407
3408 Register Dst = MI.getOperand(0).getReg();
3409 Register Src = MI.getOperand(2).getReg();
3410 auto Flags = MI.getFlags();
3411
3412 LLT Ty = MRI.getType(Dst);
3413
3414 const fltSemantics *FltSemantics;
3415 if (Ty == LLT::scalar(32))
3416 FltSemantics = &APFloat::IEEEsingle();
3417 else if (Ty == LLT::scalar(64))
3418 FltSemantics = &APFloat::IEEEdouble();
3419 else
3420 return false;
3421
3422 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3423 .addUse(Src)
3424 .setMIFlags(Flags);
3425
3426 // We don't need to concern ourselves with the snan handling difference, since
3427 // the rsq has already quieted (or not quieted) the input; use the variant which will directly select.
3428 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3429 const bool UseIEEE = MFI->getMode().IEEE;
3430
3431 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3432 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3433 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3434
3435 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
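  // min(rsq(x), +max_float) followed by max(..., -max_float) clamps the result
  // to +/-max_float, matching the behavior of the removed rsq_clamp instruction.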
3436
3437 if (UseIEEE)
3438 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3439 else
3440 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3441 MI.eraseFromParent();
3442 return true;
3443 }
3444
3445 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3446 switch (IID) {
3447 case Intrinsic::amdgcn_ds_fadd:
3448 return AMDGPU::G_ATOMICRMW_FADD;
3449 case Intrinsic::amdgcn_ds_fmin:
3450 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3451 case Intrinsic::amdgcn_ds_fmax:
3452 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3453 default:
3454 llvm_unreachable("not a DS FP intrinsic");
3455 }
3456 }
3457
3458 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3459 MachineInstr &MI,
3460 Intrinsic::ID IID) const {
3461 GISelChangeObserver &Observer = Helper.Observer;
3462 Observer.changingInstr(MI);
3463
3464 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3465
3466 // The remaining operands were used to set fields in the MemOperand on
3467 // construction.
3468 for (int I = 6; I > 3; --I)
3469 MI.RemoveOperand(I);
3470
3471 MI.RemoveOperand(1); // Remove the intrinsic ID.
3472 Observer.changedInstr(MI);
3473 return true;
3474 }
3475
3476 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3477 MachineRegisterInfo &MRI,
3478 MachineIRBuilder &B) const {
3479 uint64_t Offset =
3480 ST.getTargetLowering()->getImplicitParameterOffset(
3481 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3482 LLT DstTy = MRI.getType(DstReg);
3483 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
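  // The implicit arguments live directly after the explicit kernel arguments,
  // so the pointer is simply kernarg_segment_ptr + Offset.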
3484
3485 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3486 if (!loadInputValue(KernargPtrReg, B,
3487 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3488 return false;
3489
3490 // FIXME: This should be nuw
3491 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3492 return true;
3493 }
3494
3495 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3496 MachineRegisterInfo &MRI,
3497 MachineIRBuilder &B) const {
3498 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3499 if (!MFI->isEntryFunction()) {
3500 return legalizePreloadedArgIntrin(MI, MRI, B,
3501 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3502 }
3503
3504 Register DstReg = MI.getOperand(0).getReg();
3505 if (!getImplicitArgPtr(DstReg, MRI, B))
3506 return false;
3507
3508 MI.eraseFromParent();
3509 return true;
3510 }
3511
3512 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3513 MachineRegisterInfo &MRI,
3514 MachineIRBuilder &B,
3515 unsigned AddrSpace) const {
3516 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3517 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3518 Register Hi32 = Unmerge.getReg(1);
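  // A flat pointer belongs to the queried segment iff its high 32 bits match
  // that segment's aperture base, so it suffices to compare the high dword.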
3519
3520 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3521 MI.eraseFromParent();
3522 return true;
3523 }
3524
3525 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3526 // offset (the offset that is included in bounds checking and swizzling, to be
3527 // split between the instruction's voffset and immoffset fields) and soffset
3528 // (the offset that is excluded from bounds checking and swizzling, to go in
3529 // the instruction's soffset field). This function takes the first kind of
3530 // offset and figures out how to split it between voffset and immoffset.
3531 std::tuple<Register, unsigned, unsigned>
3532 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3533 Register OrigOffset) const {
3534 const unsigned MaxImm = 4095;
3535 Register BaseReg;
3536 unsigned TotalConstOffset;
3537 const LLT S32 = LLT::scalar(32);
3538 MachineRegisterInfo &MRI = *B.getMRI();
3539
3540 std::tie(BaseReg, TotalConstOffset) =
3541 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3542
3543 unsigned ImmOffset = TotalConstOffset;
3544
3545 // If BaseReg is a pointer, convert it to int.
3546 if (MRI.getType(BaseReg).isPointer())
3547 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3548
3549 // If the immediate value is too big for the immoffset field, keep only its
3550 // low 12 bits in the immoffset field so that the value that is copied/added
3551 // for the voffset field is a multiple of 4096, and it stands more chance
3552 // of being CSEd with the copy/add for another similar load/store.
3553 // However, do not do that rounding down to a multiple of 4096 if that is a
3554 // negative number, as it appears to be illegal to have a negative offset
3555 // in the vgpr, even if adding the immediate offset makes it positive.
3556 unsigned Overflow = ImmOffset & ~MaxImm;
3557 ImmOffset -= Overflow;
3558 if ((int32_t)Overflow < 0) {
3559 Overflow += ImmOffset;
3560 ImmOffset = 0;
3561 }
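  // For example, a constant offset of 4200 is split into ImmOffset = 104 with
  // Overflow = 4096 added into the voffset register below.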
3562
3563 if (Overflow != 0) {
3564 if (!BaseReg) {
3565 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3566 } else {
3567 auto OverflowVal = B.buildConstant(S32, Overflow);
3568 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3569 }
3570 }
3571
3572 if (!BaseReg)
3573 BaseReg = B.buildConstant(S32, 0).getReg(0);
3574
3575 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3576 }
3577
3578 /// Handle register layout difference for f16 images for some subtargets.
3579 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3580 MachineRegisterInfo &MRI,
3581 Register Reg,
3582 bool ImageStore) const {
3583 const LLT S16 = LLT::scalar(16);
3584 const LLT S32 = LLT::scalar(32);
3585 LLT StoreVT = MRI.getType(Reg);
3586 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3587
3588 if (ST.hasUnpackedD16VMem()) {
3589 auto Unmerge = B.buildUnmerge(S16, Reg);
3590
3591 SmallVector<Register, 4> WideRegs;
3592 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3593 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
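    // Unpacked D16 layout: each s16 element is widened into the low half of
    // its own 32-bit register, so e.g. <4 x s16> becomes <4 x s32>.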
3594
3595 int NumElts = StoreVT.getNumElements();
3596
3597 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3598 }
3599
3600 if (ImageStore && ST.hasImageStoreD16Bug()) {
3601 if (StoreVT.getNumElements() == 2) {
3602 SmallVector<Register, 4> PackedRegs;
3603 Reg = B.buildBitcast(S32, Reg).getReg(0);
3604 PackedRegs.push_back(Reg);
3605 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3606 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3607 }
3608
3609 if (StoreVT.getNumElements() == 3) {
3610 SmallVector<Register, 4> PackedRegs;
3611 auto Unmerge = B.buildUnmerge(S16, Reg);
3612 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3613 PackedRegs.push_back(Unmerge.getReg(I));
3614 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3615 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3616 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3617 }
3618
3619 if (StoreVT.getNumElements() == 4) {
3620 SmallVector<Register, 4> PackedRegs;
3621 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3622 auto Unmerge = B.buildUnmerge(S32, Reg);
3623 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3624 PackedRegs.push_back(Unmerge.getReg(I));
3625 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3626 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3627 }
3628
3629 llvm_unreachable("invalid data type");
3630 }
3631
3632 return Reg;
3633 }
3634
3635 Register AMDGPULegalizerInfo::fixStoreSourceType(
3636 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3637 MachineRegisterInfo *MRI = B.getMRI();
3638 LLT Ty = MRI->getType(VData);
3639
3640 const LLT S16 = LLT::scalar(16);
3641
3642 // Fixup illegal register types for i8 stores.
3643 if (Ty == LLT::scalar(8) || Ty == S16) {
3644 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3645 return AnyExt;
3646 }
3647
3648 if (Ty.isVector()) {
3649 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3650 if (IsFormat)
3651 return handleD16VData(B, *MRI, VData);
3652 }
3653 }
3654
3655 return VData;
3656 }
3657
3658 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3659 MachineRegisterInfo &MRI,
3660 MachineIRBuilder &B,
3661 bool IsTyped,
3662 bool IsFormat) const {
3663 Register VData = MI.getOperand(1).getReg();
3664 LLT Ty = MRI.getType(VData);
3665 LLT EltTy = Ty.getScalarType();
3666 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3667 const LLT S32 = LLT::scalar(32);
3668
3669 VData = fixStoreSourceType(B, VData, IsFormat);
3670 Register RSrc = MI.getOperand(2).getReg();
3671
3672 MachineMemOperand *MMO = *MI.memoperands_begin();
3673 const int MemSize = MMO->getSize();
3674
3675 unsigned ImmOffset;
3676 unsigned TotalOffset;
3677
3678 // The typed intrinsics add an immediate after the registers.
3679 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3680
3681 // The struct intrinsic variants add one additional operand over raw.
3682 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
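  // Operand layout after the intrinsic ID is: vdata, rsrc, [vindex for struct
  // forms], voffset, soffset, [format for typed forms], aux; the struct forms
  // are recognized purely by the extra operand count.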
3683 Register VIndex;
3684 int OpOffset = 0;
3685 if (HasVIndex) {
3686 VIndex = MI.getOperand(3).getReg();
3687 OpOffset = 1;
3688 }
3689
3690 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3691 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3692
3693 unsigned Format = 0;
3694 if (IsTyped) {
3695 Format = MI.getOperand(5 + OpOffset).getImm();
3696 ++OpOffset;
3697 }
3698
3699 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3700
3701 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3702 if (TotalOffset != 0)
3703 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3704
3705 unsigned Opc;
3706 if (IsTyped) {
3707 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3708 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3709 } else if (IsFormat) {
3710 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3711 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3712 } else {
3713 switch (MemSize) {
3714 case 1:
3715 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3716 break;
3717 case 2:
3718 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3719 break;
3720 default:
3721 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3722 break;
3723 }
3724 }
3725
3726 if (!VIndex)
3727 VIndex = B.buildConstant(S32, 0).getReg(0);
3728
3729 auto MIB = B.buildInstr(Opc)
3730 .addUse(VData) // vdata
3731 .addUse(RSrc) // rsrc
3732 .addUse(VIndex) // vindex
3733 .addUse(VOffset) // voffset
3734 .addUse(SOffset) // soffset
3735 .addImm(ImmOffset); // offset(imm)
3736
3737 if (IsTyped)
3738 MIB.addImm(Format);
3739
3740 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3741 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3742 .addMemOperand(MMO);
3743
3744 MI.eraseFromParent();
3745 return true;
3746 }
3747
3748 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3749 MachineRegisterInfo &MRI,
3750 MachineIRBuilder &B,
3751 bool IsFormat,
3752 bool IsTyped) const {
3753 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3754 MachineMemOperand *MMO = *MI.memoperands_begin();
3755 const int MemSize = MMO->getSize();
3756 const LLT S32 = LLT::scalar(32);
3757
3758 Register Dst = MI.getOperand(0).getReg();
3759 Register RSrc = MI.getOperand(2).getReg();
3760
3761 // The typed intrinsics add an immediate after the registers.
3762 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3763
3764 // The struct intrinsic variants add one additional operand over raw.
3765 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3766 Register VIndex;
3767 int OpOffset = 0;
3768 if (HasVIndex) {
3769 VIndex = MI.getOperand(3).getReg();
3770 OpOffset = 1;
3771 }
3772
3773 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3774 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3775
3776 unsigned Format = 0;
3777 if (IsTyped) {
3778 Format = MI.getOperand(5 + OpOffset).getImm();
3779 ++OpOffset;
3780 }
3781
3782 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3783 unsigned ImmOffset;
3784 unsigned TotalOffset;
3785
3786 LLT Ty = MRI.getType(Dst);
3787 LLT EltTy = Ty.getScalarType();
3788 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3789 const bool Unpacked = ST.hasUnpackedD16VMem();
3790
3791 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3792 if (TotalOffset != 0)
3793 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3794
3795 unsigned Opc;
3796
3797 if (IsTyped) {
3798 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3799 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3800 } else if (IsFormat) {
3801 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3802 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3803 } else {
3804 switch (MemSize) {
3805 case 1:
3806 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3807 break;
3808 case 2:
3809 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3810 break;
3811 default:
3812 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3813 break;
3814 }
3815 }
3816
3817 Register LoadDstReg;
3818
3819 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3820 LLT UnpackedTy = Ty.changeElementSize(32);
3821
3822 if (IsExtLoad)
3823 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3824 else if (Unpacked && IsD16 && Ty.isVector())
3825 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3826 else
3827 LoadDstReg = Dst;
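  // Sub-dword results and scalar d16 results are loaded into a 32-bit register
  // and truncated afterwards; unpacked d16 vectors are loaded as <N x s32> and
  // repacked to 16-bit elements once the load has been emitted.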
3828
3829 if (!VIndex)
3830 VIndex = B.buildConstant(S32, 0).getReg(0);
3831
3832 auto MIB = B.buildInstr(Opc)
3833 .addDef(LoadDstReg) // vdata
3834 .addUse(RSrc) // rsrc
3835 .addUse(VIndex) // vindex
3836 .addUse(VOffset) // voffset
3837 .addUse(SOffset) // soffset
3838 .addImm(ImmOffset); // offset(imm)
3839
3840 if (IsTyped)
3841 MIB.addImm(Format);
3842
3843 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3844 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3845 .addMemOperand(MMO);
3846
3847 if (LoadDstReg != Dst) {
3848 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3849
3850 // The result of an extending load was widened; truncate back to the original type.
3851 if (IsExtLoad)
3852 B.buildTrunc(Dst, LoadDstReg);
3853 else {
3854 // Repack to original 16-bit vector result
3855 // FIXME: G_TRUNC should work, but legalization currently fails
3856 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3857 SmallVector<Register, 4> Repack;
3858 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3859 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3860 B.buildMerge(Dst, Repack);
3861 }
3862 }
3863
3864 MI.eraseFromParent();
3865 return true;
3866 }
3867
3868 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3869 MachineIRBuilder &B,
3870 bool IsInc) const {
3871 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3872 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3873 B.buildInstr(Opc)
3874 .addDef(MI.getOperand(0).getReg())
3875 .addUse(MI.getOperand(2).getReg())
3876 .addUse(MI.getOperand(3).getReg())
3877 .cloneMemRefs(MI);
3878 MI.eraseFromParent();
3879 return true;
3880 }
3881
3882 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3883 switch (IntrID) {
3884 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3885 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3886 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3887 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3888 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3889 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3890 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3891 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3893 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3894 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3896 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3897 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3898 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3899 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3900 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3901 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3902 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3903 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3904 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3905 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3906 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3908 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3909 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3911 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3912 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3913 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3914 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3915 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3916 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3917 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3918 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3919 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3920 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3921 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3922 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3923 case Intrinsic::amdgcn_buffer_atomic_fadd:
3924 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3925 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3926 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3927 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3928 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3929 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3930 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3931 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3932 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3933 default:
3934 llvm_unreachable("unhandled atomic opcode");
3935 }
3936 }
3937
3938 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3939 MachineIRBuilder &B,
3940 Intrinsic::ID IID) const {
3941 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3942 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3943 const bool HasReturn = MI.getNumExplicitDefs() != 0;
3944
3945 Register Dst;
3946
3947 int OpOffset = 0;
3948 if (HasReturn) {
3949 // A few FP atomics do not support return values.
3950 Dst = MI.getOperand(0).getReg();
3951 } else {
3952 OpOffset = -1;
3953 }
3954
3955 Register VData = MI.getOperand(2 + OpOffset).getReg();
3956 Register CmpVal;
3957
3958 if (IsCmpSwap) {
3959 CmpVal = MI.getOperand(3 + OpOffset).getReg();
3960 ++OpOffset;
3961 }
3962
3963 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3964 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3965
3966 // The struct intrinsic variants add one additional operand over raw.
3967 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3968 Register VIndex;
3969 if (HasVIndex) {
3970 VIndex = MI.getOperand(4 + OpOffset).getReg();
3971 ++OpOffset;
3972 }
3973
3974 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3975 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3976 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3977
3978 MachineMemOperand *MMO = *MI.memoperands_begin();
3979
3980 unsigned ImmOffset;
3981 unsigned TotalOffset;
3982 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3983 if (TotalOffset != 0)
3984 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3985
3986 if (!VIndex)
3987 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3988
3989 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3990
3991 if (HasReturn)
3992 MIB.addDef(Dst);
3993
3994 MIB.addUse(VData); // vdata
3995
3996 if (IsCmpSwap)
3997 MIB.addReg(CmpVal);
3998
3999 MIB.addUse(RSrc) // rsrc
4000 .addUse(VIndex) // vindex
4001 .addUse(VOffset) // voffset
4002 .addUse(SOffset) // soffset
4003 .addImm(ImmOffset) // offset(imm)
4004 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4005 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4006 .addMemOperand(MMO);
4007
4008 MI.eraseFromParent();
4009 return true;
4010 }
4011
4012 /// Pack the s16 typed address operands of \p MI into dword sized registers
4013 /// of two s16 elements each, appending the results to \p PackedAddrs.
4014 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4015 SmallVectorImpl<Register> &PackedAddrs,
4016 unsigned ArgOffset,
4017 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4018 bool IsA16, bool IsG16) {
4019 const LLT S16 = LLT::scalar(16);
4020 const LLT V2S16 = LLT::vector(2, 16);
4021 auto EndIdx = Intr->VAddrEnd;
4022
4023 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4024 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4025 if (!SrcOp.isReg())
4026 continue; // _L to _LZ may have eliminated this.
4027
4028 Register AddrReg = SrcOp.getReg();
4029
4030 if (I < Intr->GradientStart) {
4031 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4032 PackedAddrs.push_back(AddrReg);
4033 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4034 (I >= Intr->CoordStart && !IsA16)) {
4035 // Handle any gradient or coordinate operands that should not be packed
4036 PackedAddrs.push_back(AddrReg);
4037 } else {
4038 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4039 // derivatives dx/dh and dx/dv are packed with undef.
4040 if (((I + 1) >= EndIdx) ||
4041 ((Intr->NumGradients / 2) % 2 == 1 &&
4042 (I == static_cast<unsigned>(Intr->GradientStart +
4043 (Intr->NumGradients / 2) - 1) ||
4044 I == static_cast<unsigned>(Intr->GradientStart +
4045 Intr->NumGradients - 1))) ||
4046 // Check for _L to _LZ optimization
4047 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4048 PackedAddrs.push_back(
4049 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4050 .getReg(0));
4051 } else {
4052 PackedAddrs.push_back(
4053 B.buildBuildVector(
4054 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4055 .getReg(0));
4056 ++I;
4057 }
4058 }
4059 }
4060 }
4061
4062 /// Convert from separate vaddr components to a single vector address register,
4063 /// and replace the remaining operands with $noreg.
4064 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4065 int DimIdx, int NumVAddrs) {
4066 const LLT S32 = LLT::scalar(32);
4067
4068 SmallVector<Register, 8> AddrRegs;
4069 for (int I = 0; I != NumVAddrs; ++I) {
4070 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4071 if (SrcOp.isReg()) {
4072 AddrRegs.push_back(SrcOp.getReg());
4073 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4074 }
4075 }
4076
4077 int NumAddrRegs = AddrRegs.size();
4078 if (NumAddrRegs != 1) {
4079 // Round up to 8 elements for v5-v7
4080 // FIXME: Missing intermediate sized register classes and instructions.
4081 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4082 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4083 auto Undef = B.buildUndef(S32);
4084 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4085 NumAddrRegs = RoundedNumRegs;
4086 }
4087
4088 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4089 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4090 }
4091
4092 for (int I = 1; I != NumVAddrs; ++I) {
4093 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4094 if (SrcOp.isReg())
4095 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4096 }
4097 }
4098
4099 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4100 ///
4101 /// Depending on the subtarget, load/store with 16-bit element data need to be
4102 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4103 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4104 /// registers.
4105 ///
4106 /// We don't want to directly select image instructions just yet, but also want
4107 /// to expose all register repacking to the legalizer/combiners. We also don't
4108 /// want a selected instruction entering RegBankSelect. In order to avoid
4109 /// defining a multitude of intermediate image instructions, directly hack on
4110 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4111 /// now unnecessary arguments with $noreg.
4112 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4113 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4114 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4115
4116 const unsigned NumDefs = MI.getNumExplicitDefs();
4117 const unsigned ArgOffset = NumDefs + 1;
4118 bool IsTFE = NumDefs == 2;
4119 // We are only processing the operands of d16 image operations on subtargets
4120 // that use the unpacked register layout, or need to repack the TFE result.
4121
4122 // TODO: Do we need to guard against already legalized intrinsics?
4123 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4124 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4125
4126 MachineRegisterInfo *MRI = B.getMRI();
4127 const LLT S32 = LLT::scalar(32);
4128 const LLT S16 = LLT::scalar(16);
4129 const LLT V2S16 = LLT::vector(2, 16);
4130
4131 unsigned DMask = 0;
4132
4133 // Check for 16 bit addresses and pack if true.
4134 LLT GradTy =
4135 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4136 LLT AddrTy =
4137 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4138 const bool IsG16 = GradTy == S16;
4139 const bool IsA16 = AddrTy == S16;
4140
4141 int DMaskLanes = 0;
4142 if (!BaseOpcode->Atomic) {
4143 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4144 if (BaseOpcode->Gather4) {
4145 DMaskLanes = 4;
4146 } else if (DMask != 0) {
4147 DMaskLanes = countPopulation(DMask);
4148 } else if (!IsTFE && !BaseOpcode->Store) {
4149 // If dmask is 0, this is a no-op load. This can be eliminated.
4150 B.buildUndef(MI.getOperand(0));
4151 MI.eraseFromParent();
4152 return true;
4153 }
4154 }
4155
4156 Observer.changingInstr(MI);
4157 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4158
4159 unsigned NewOpcode = NumDefs == 0 ?
4160 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4161
4162 // Track that we legalized this
4163 MI.setDesc(B.getTII().get(NewOpcode));
4164
4165 // We expect to get an error flag since TFE is on and dmask is 0. Force
4166 // dmask to be at least 1, otherwise the instruction will fail.
4167 if (IsTFE && DMask == 0) {
4168 DMask = 0x1;
4169 DMaskLanes = 1;
4170 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4171 }
4172
4173 if (BaseOpcode->Atomic) {
4174 Register VData0 = MI.getOperand(2).getReg();
4175 LLT Ty = MRI->getType(VData0);
4176
4177 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4178 if (Ty.isVector())
4179 return false;
4180
4181 if (BaseOpcode->AtomicX2) {
4182 Register VData1 = MI.getOperand(3).getReg();
4183 // The two values are packed in one register.
4184 LLT PackedTy = LLT::vector(2, Ty);
4185 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4186 MI.getOperand(2).setReg(Concat.getReg(0));
4187 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4188 }
4189 }
4190
4191 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4192
4193 // Optimize _L to _LZ when _L is zero
4194 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4195 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4196 const ConstantFP *ConstantLod;
4197
4198 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4199 m_GFCst(ConstantLod))) {
4200 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4201 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4202 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4203 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4204 Intr->Dim);
4205
4206 // The starting indexes should remain in the same place.
4207 --CorrectedNumVAddrs;
4208
4209 MI.getOperand(MI.getNumExplicitDefs())
4210 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4211 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4212 Intr = NewImageDimIntr;
4213 }
4214 }
4215 }
4216
4217 // Optimize _mip away, when 'lod' is zero
4218 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4219 int64_t ConstantLod;
4220 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4221 m_ICst(ConstantLod))) {
4222 if (ConstantLod == 0) {
4223 // TODO: Change intrinsic opcode and remove operand instead of replacing
4224 // it with 0, as the _L to _LZ handling is done above.
4225 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4226 --CorrectedNumVAddrs;
4227 }
4228 }
4229 }
4230
4231 // Rewrite the addressing register layout before doing anything else.
4232 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4233 // 16 bit gradients are supported, but are tied to the A16 control
4234 // so both gradients and addresses must be 16 bit
4235 return false;
4236 }
4237
4238 if (IsA16 && !ST.hasA16()) {
4239 // A16 not supported
4240 return false;
4241 }
4242
4243 if (IsA16 || IsG16) {
4244 if (Intr->NumVAddrs > 1) {
4245 SmallVector<Register, 4> PackedRegs;
4246
4247 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4248 IsG16);
4249
4250 // See also below in the non-a16 branch
4251 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4252
4253 if (!UseNSA && PackedRegs.size() > 1) {
4254 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4255 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4256 PackedRegs[0] = Concat.getReg(0);
4257 PackedRegs.resize(1);
4258 }
4259
4260 const unsigned NumPacked = PackedRegs.size();
4261 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4262 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4263 if (!SrcOp.isReg()) {
4264 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4265 continue;
4266 }
4267
4268 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4269
4270 if (I - Intr->VAddrStart < NumPacked)
4271 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4272 else
4273 SrcOp.setReg(AMDGPU::NoRegister);
4274 }
4275 }
4276 } else {
4277 // If the register allocator cannot place the address registers contiguously
4278 // without introducing moves, then using the non-sequential address encoding
4279 // is always preferable, since it saves VALU instructions and is usually a
4280 // wash in terms of code size or even better.
4281 //
4282 // However, we currently have no way of hinting to the register allocator
4283 // that MIMG addresses should be placed contiguously when it is possible to
4284 // do so, so force non-NSA for the common 2-address case as a heuristic.
4285 //
4286 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4287 // allocation when possible.
4288 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4289
4290 if (!UseNSA && Intr->NumVAddrs > 1)
4291 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4292 Intr->NumVAddrs);
4293 }
4294
4295 int Flags = 0;
4296 if (IsA16)
4297 Flags |= 1;
4298 if (IsG16)
4299 Flags |= 2;
4300 MI.addOperand(MachineOperand::CreateImm(Flags));
4301
4302 if (BaseOpcode->Store) { // No TFE for stores?
4303 // TODO: Handle dmask trim
4304 Register VData = MI.getOperand(1).getReg();
4305 LLT Ty = MRI->getType(VData);
4306 if (!Ty.isVector() || Ty.getElementType() != S16)
4307 return true;
4308
4309 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4310 if (RepackedReg != VData) {
4311 MI.getOperand(1).setReg(RepackedReg);
4312 }
4313
4314 return true;
4315 }
4316
4317 Register DstReg = MI.getOperand(0).getReg();
4318 LLT Ty = MRI->getType(DstReg);
4319 const LLT EltTy = Ty.getScalarType();
4320 const bool IsD16 = Ty.getScalarType() == S16;
4321 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4322
4323 // Confirm that the return type is large enough for the dmask specified
4324 if (NumElts < DMaskLanes)
4325 return false;
4326
4327 if (NumElts > 4 || DMaskLanes > 4)
4328 return false;
4329
4330 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4331 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4332
4333 // The raw dword aligned data component of the load. The only legal cases
4334 // where this matters should be when using the packed D16 format, for
4335 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4336 LLT RoundedTy;
4337
4338 // S32 vector to cover all data, plus the TFE result element.
4339 LLT TFETy;
4340
4341 // Register type to use for each loaded component. Will be S32 or V2S16.
4342 LLT RegTy;
4343
4344 if (IsD16 && ST.hasUnpackedD16VMem()) {
4345 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4346 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4347 RegTy = S32;
4348 } else {
4349 unsigned EltSize = EltTy.getSizeInBits();
4350 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4351 unsigned RoundedSize = 32 * RoundedElts;
4352 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4353 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4354 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
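    // For example, a d16 load with 3 dmask lanes on a packed subtarget gives
    // RoundedTy = <4 x s16>, TFETy = <3 x s32>, and (without TFE) V2S16 pieces.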
4355 }
4356
4357 // The return type does not need adjustment.
4358 // TODO: Should we change s16 case to s32 or <2 x s16>?
4359 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4360 return true;
4361
4362 Register Dst1Reg;
4363
4364 // Insert after the instruction.
4365 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4366
4367 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4368 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4369 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4370 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4371
4372 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4373
4374 MI.getOperand(0).setReg(NewResultReg);
4375
4376 // In the IR, TFE is supposed to be used with a 2 element struct return
4377 // type. The instruction really returns these two values in one contiguous
4378 // register, with one additional dword beyond the loaded data. Rewrite the
4379 // return type to use a single register result.
4380
4381 if (IsTFE) {
4382 Dst1Reg = MI.getOperand(1).getReg();
4383 if (MRI->getType(Dst1Reg) != S32)
4384 return false;
4385
4386 // TODO: Make sure the TFE operand bit is set.
4387 MI.RemoveOperand(1);
4388
4389 // Handle the easy case that requires no repack instructions.
4390 if (Ty == S32) {
4391 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4392 return true;
4393 }
4394 }
4395
4396 // Now figure out how to copy the new result register back into the old
4397 // result.
4398 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4399
4400 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4401
4402 if (ResultNumRegs == 1) {
4403 assert(!IsTFE);
4404 ResultRegs[0] = NewResultReg;
4405 } else {
4406 // We have to repack into a new vector of some kind.
4407 for (int I = 0; I != NumDataRegs; ++I)
4408 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4409 B.buildUnmerge(ResultRegs, NewResultReg);
4410
4411 // Drop the final TFE element to get the data part. The TFE result is
4412 // directly written to the right place already.
4413 if (IsTFE)
4414 ResultRegs.resize(NumDataRegs);
4415 }
4416
4417 // For an s16 scalar result, we form an s32 result with a truncate regardless
4418 // of packed vs. unpacked.
4419 if (IsD16 && !Ty.isVector()) {
4420 B.buildTrunc(DstReg, ResultRegs[0]);
4421 return true;
4422 }
4423
4424 // Avoid a build/concat_vector of 1 entry.
4425 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4426 B.buildBitcast(DstReg, ResultRegs[0]);
4427 return true;
4428 }
4429
4430 assert(Ty.isVector());
4431
4432 if (IsD16) {
4433 // For packed D16 results with TFE enabled, all the data components are
4434 // S32. Cast back to the expected type.
4435 //
4436 // TODO: We don't really need to use load s32 elements. We would only need one
4437 // cast for the TFE result if a multiple of v2s16 was used.
4438 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4439 for (Register &Reg : ResultRegs)
4440 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4441 } else if (ST.hasUnpackedD16VMem()) {
4442 for (Register &Reg : ResultRegs)
4443 Reg = B.buildTrunc(S16, Reg).getReg(0);
4444 }
4445 }
4446
4447 auto padWithUndef = [&](LLT Ty, int NumElts) {
4448 if (NumElts == 0)
4449 return;
4450 Register Undef = B.buildUndef(Ty).getReg(0);
4451 for (int I = 0; I != NumElts; ++I)
4452 ResultRegs.push_back(Undef);
4453 };
4454
4455 // Pad out any elements eliminated due to the dmask.
4456 LLT ResTy = MRI->getType(ResultRegs[0]);
4457 if (!ResTy.isVector()) {
4458 padWithUndef(ResTy, NumElts - ResultRegs.size());
4459 B.buildBuildVector(DstReg, ResultRegs);
4460 return true;
4461 }
4462
4463 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4464 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4465
4466 // Deal with the one annoying legal case.
4467 const LLT V3S16 = LLT::vector(3, 16);
4468 if (Ty == V3S16) {
4469 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4470 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4471 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4472 return true;
4473 }
4474
4475 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4476 B.buildConcatVectors(DstReg, ResultRegs);
4477 return true;
4478 }
4479
4480 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4481 LegalizerHelper &Helper, MachineInstr &MI) const {
4482 MachineIRBuilder &B = Helper.MIRBuilder;
4483 GISelChangeObserver &Observer = Helper.Observer;
4484
4485 Register Dst = MI.getOperand(0).getReg();
4486 LLT Ty = B.getMRI()->getType(Dst);
4487 unsigned Size = Ty.getSizeInBits();
4488 MachineFunction &MF = B.getMF();
4489
4490 Observer.changingInstr(MI);
4491
4492 if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4493 Ty = getBitcastRegisterType(Ty);
4494 Helper.bitcastDst(MI, Ty, 0);
4495 Dst = MI.getOperand(0).getReg();
4496 B.setInsertPt(B.getMBB(), MI);
4497 }
4498
4499 // FIXME: We don't really need this intermediate instruction. The intrinsic
4500 // should be fixed to have a memory operand. Since it's readnone, we're not
4501 // allowed to add one.
4502 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4503 MI.RemoveOperand(1); // Remove intrinsic ID
4504
4505 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4506 // TODO: Should this use datalayout alignment?
4507 const unsigned MemSize = (Size + 7) / 8;
4508 const Align MemAlign(4);
4509 MachineMemOperand *MMO = MF.getMachineMemOperand(
4510 MachinePointerInfo(),
4511 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4512 MachineMemOperand::MOInvariant,
4513 MemSize, MemAlign);
4514 MI.addMemOperand(MF, MMO);
4515
4516 // There are no 96-bit result scalar loads, but widening to 128-bit should
4517 // always be legal. We may need to restore this to a 96-bit result if it turns
4518 // out this needs to be converted to a vector load during RegBankSelect.
4519 if (!isPowerOf2_32(Size)) {
4520 if (Ty.isVector())
4521 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4522 else
4523 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4524 }
4525
4526 Observer.changedInstr(MI);
4527 return true;
4528 }
4529
4530 // TODO: Move to selection
4531 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4532 MachineRegisterInfo &MRI,
4533 MachineIRBuilder &B) const {
4534 if (!ST.isTrapHandlerEnabled() ||
4535 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4536 return legalizeTrapEndpgm(MI, MRI, B);
4537
4538 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4539 switch (*HsaAbiVer) {
4540 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4541 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4542 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4543 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4544 return ST.supportsGetDoorbellID() ?
4545 legalizeTrapHsa(MI, MRI, B) :
4546 legalizeTrapHsaQueuePtr(MI, MRI, B);
4547 }
4548 }
4549
4550 llvm_unreachable("Unknown trap handler");
4551 }
4552
4553 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4554 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4555 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4556 MI.eraseFromParent();
4557 return true;
4558 }
4559
4560 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4561 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4562 // Pass queue pointer to trap handler as input, and insert trap instruction
4563 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4564 Register LiveIn =
4565 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4566 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4567 return false;
4568
4569 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4570 B.buildCopy(SGPR01, LiveIn);
4571 B.buildInstr(AMDGPU::S_TRAP)
4572 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4573 .addReg(SGPR01, RegState::Implicit);
4574
4575 MI.eraseFromParent();
4576 return true;
4577 }
4578
4579 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4580 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4581 B.buildInstr(AMDGPU::S_TRAP)
4582 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4583 MI.eraseFromParent();
4584 return true;
4585 }
4586
4587 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4588 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4589 // If this is a non-HSA path or the trap handler is disabled, report a
4590 // warning accordingly.
4591 if (!ST.isTrapHandlerEnabled() ||
4592 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4593 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4594 "debugtrap handler not supported",
4595 MI.getDebugLoc(), DS_Warning);
4596 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4597 Ctx.diagnose(NoTrap);
4598 } else {
4599 // Insert debug-trap instruction
4600 B.buildInstr(AMDGPU::S_TRAP)
4601 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4602 }
4603
4604 MI.eraseFromParent();
4605 return true;
4606 }
4607
4608 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4609 MachineIRBuilder &B) const {
4610 MachineRegisterInfo &MRI = *B.getMRI();
4611 const LLT S16 = LLT::scalar(16);
4612 const LLT S32 = LLT::scalar(32);
4613
  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
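  // Select the NSA form of the intersect-ray instruction that matches the node
  // pointer width (32 vs 64 bit) and whether the ray data is 16-bit (a16).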
  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
                          : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;

  SmallVector<Register, 12> Ops;
  if (Is64) {
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
  } else {
    Ops.push_back(NodePtr);
  }
  Ops.push_back(RayExtent);

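  // Unmerge a <4 x s32> source and push only the first three lanes; the fourth
  // lane is not consumed by the instruction.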
  auto packLanes = [&Ops, &S32, &B] (Register Src) {
    auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
  };

  packLanes(RayOrigin);
  if (IsA16) {
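    // With 16-bit ray data, the three direction and three inverse-direction
    // components are repacked into three 32-bit registers.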
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

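  // Emit the target pseudo. The selected image opcode is carried as an
  // immediate operand, followed by the packed ray operands, the texture
  // descriptor and the a16 flag.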
  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace control-flow intrinsics that use G_BRCOND with the exec-mask
  // manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

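      // Emit SI_IF or SI_ELSE at the position of the original conditional
      // branch; the pseudo updates exec and branches to the unconditional
      // target.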
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

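      // Replace the G_BRCOND with SI_LOOP, which consumes the loop mask
      // register and branches to the unconditional target.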
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
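  // All raw and struct buffer atomics share a single legalization path that is
  // keyed on the intrinsic ID.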
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}