//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
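// For example, <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>.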
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the scalar size up to the next power-of-two number of bits.
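// For example, s17 becomes s32 and s48 becomes s64.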
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \return true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
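/// For example, <3 x s16> matches, while <4 x s16>, <3 x s32>, and <3 x s1>
/// do not.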
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

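// Predicate matching types whose total size is a whole multiple of 32 bits.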
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

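// Predicate matching 16-bit element vectors with more than two elements,
// e.g. <4 x s16>.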
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

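// Mutation that appends a single element to the vector type,
// e.g. <3 x s16> -> <4 x s16>.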
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

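// Mutation used on vectors wider than 64 bits to reduce them to roughly
// 64-bit pieces, e.g. <4 x s32> -> <2 x s32>.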
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
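// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits).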
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

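// Pick the register-like type a value of the given type should be bitcast to:
// a scalar for sizes up to 32 bits, otherwise a vector of 32-bit elements
// (e.g. <8 x s8> -> <2 x s32>).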
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

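// True if \p Size is a multiple of 32 bits and fits in the maximum register
// size (1024 bits).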
static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

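// Match scalar truncating stores, where the value register is wider than
// 32 bits and wider than the stored memory size (e.g. storing s64 as 16 bits).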
static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
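// Maximum memory access size (in bits) that can be handled as a single
// operation for the given address space.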
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

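// Check whether a load or store with the queried register type, memory size,
// and alignment is directly legal for the queried address space.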
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}


/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                 .legalFor({S32})
                 .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
    .legalFor({S32})
      // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

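  // Decide whether a load or store must be split into multiple memory
  // operations: it exceeds the address space's maximum access size, covers an
  // odd (non-power-of-2, non-3) number of 32-bit registers, or is
  // insufficiently aligned.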
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

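  // Minimum alignments (in bits) required for global/constant accesses. When
  // the subtarget allows unaligned buffer access, use 0, i.e. no restriction.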
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 once the size is
        // above 128 bits, whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit shifts.
    // This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

1630 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1631 MachineInstr &MI) const {
1632 MachineIRBuilder &B = Helper.MIRBuilder;
1633 MachineRegisterInfo &MRI = *B.getMRI();
1634
1635 switch (MI.getOpcode()) {
1636 case TargetOpcode::G_ADDRSPACE_CAST:
1637 return legalizeAddrSpaceCast(MI, MRI, B);
1638 case TargetOpcode::G_FRINT:
1639 return legalizeFrint(MI, MRI, B);
1640 case TargetOpcode::G_FCEIL:
1641 return legalizeFceil(MI, MRI, B);
1642 case TargetOpcode::G_FREM:
1643 return legalizeFrem(MI, MRI, B);
1644 case TargetOpcode::G_INTRINSIC_TRUNC:
1645 return legalizeIntrinsicTrunc(MI, MRI, B);
1646 case TargetOpcode::G_SITOFP:
1647 return legalizeITOFP(MI, MRI, B, true);
1648 case TargetOpcode::G_UITOFP:
1649 return legalizeITOFP(MI, MRI, B, false);
1650 case TargetOpcode::G_FPTOSI:
1651 return legalizeFPTOI(MI, MRI, B, true);
1652 case TargetOpcode::G_FPTOUI:
1653 return legalizeFPTOI(MI, MRI, B, false);
1654 case TargetOpcode::G_FMINNUM:
1655 case TargetOpcode::G_FMAXNUM:
1656 case TargetOpcode::G_FMINNUM_IEEE:
1657 case TargetOpcode::G_FMAXNUM_IEEE:
1658 return legalizeMinNumMaxNum(Helper, MI);
1659 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1660 return legalizeExtractVectorElt(MI, MRI, B);
1661 case TargetOpcode::G_INSERT_VECTOR_ELT:
1662 return legalizeInsertVectorElt(MI, MRI, B);
1663 case TargetOpcode::G_SHUFFLE_VECTOR:
1664 return legalizeShuffleVector(MI, MRI, B);
1665 case TargetOpcode::G_FSIN:
1666 case TargetOpcode::G_FCOS:
1667 return legalizeSinCos(MI, MRI, B);
1668 case TargetOpcode::G_GLOBAL_VALUE:
1669 return legalizeGlobalValue(MI, MRI, B);
1670 case TargetOpcode::G_LOAD:
1671 return legalizeLoad(Helper, MI);
1672 case TargetOpcode::G_FMAD:
1673 return legalizeFMad(MI, MRI, B);
1674 case TargetOpcode::G_FDIV:
1675 return legalizeFDIV(MI, MRI, B);
1676 case TargetOpcode::G_UDIV:
1677 case TargetOpcode::G_UREM:
1678 return legalizeUDIV_UREM(MI, MRI, B);
1679 case TargetOpcode::G_SDIV:
1680 case TargetOpcode::G_SREM:
1681 return legalizeSDIV_SREM(MI, MRI, B);
1682 case TargetOpcode::G_ATOMIC_CMPXCHG:
1683 return legalizeAtomicCmpXChg(MI, MRI, B);
1684 case TargetOpcode::G_FLOG:
1685 return legalizeFlog(MI, B, numbers::ln2f);
1686 case TargetOpcode::G_FLOG10:
1687 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1688 case TargetOpcode::G_FEXP:
1689 return legalizeFExp(MI, B);
1690 case TargetOpcode::G_FPOW:
1691 return legalizeFPow(MI, B);
1692 case TargetOpcode::G_FFLOOR:
1693 return legalizeFFloor(MI, MRI, B);
1694 case TargetOpcode::G_BUILD_VECTOR:
1695 return legalizeBuildVector(MI, MRI, B);
1696 default:
1697 return false;
1698 }
1699
1700 llvm_unreachable("expected switch to return");
1701 }
1702
1703 Register AMDGPULegalizerInfo::getSegmentAperture(
1704 unsigned AS,
1705 MachineRegisterInfo &MRI,
1706 MachineIRBuilder &B) const {
1707 MachineFunction &MF = B.getMF();
1708 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1709 const LLT S32 = LLT::scalar(32);
1710
1711 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1712
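  // The "aperture" is the high 32 bits of the 64-bit flat address range the
  // LDS or scratch segment is mapped into. Targets with aperture registers
  // read it with s_getreg below; otherwise it is loaded from the queue
  // descriptor passed in by the runtime.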
1713 if (ST.hasApertureRegs()) {
1714 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1715 // getreg.
1716 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1717 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1718 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1719 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1720 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1721 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1722 unsigned Encoding =
1723 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1724 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1725 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1726
1727 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1728
1729 B.buildInstr(AMDGPU::S_GETREG_B32)
1730 .addDef(GetReg)
1731 .addImm(Encoding);
1732 MRI.setType(GetReg, S32);
1733
1734 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1735 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1736 }
1737
1738 Register QueuePtr = MRI.createGenericVirtualRegister(
1739 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1740
1741 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1742 return Register();
1743
1744 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1745 // private_segment_aperture_base_hi.
1746 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1747
1748 // TODO: can we be smarter about machine pointer info?
1749 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1750 MachineMemOperand *MMO = MF.getMachineMemOperand(
1751 PtrInfo,
1752 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1753 MachineMemOperand::MOInvariant,
1754 4, commonAlignment(Align(64), StructOffset));
1755
1756 Register LoadAddr;
1757
1758 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1759 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1760 }
1761
1762 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1763 MachineInstr &MI, MachineRegisterInfo &MRI,
1764 MachineIRBuilder &B) const {
1765 MachineFunction &MF = B.getMF();
1766
1767 const LLT S32 = LLT::scalar(32);
1768 Register Dst = MI.getOperand(0).getReg();
1769 Register Src = MI.getOperand(1).getReg();
1770
1771 LLT DstTy = MRI.getType(Dst);
1772 LLT SrcTy = MRI.getType(Src);
1773 unsigned DestAS = DstTy.getAddressSpace();
1774 unsigned SrcAS = SrcTy.getAddressSpace();
1775
1776 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1777 // vector element.
1778 assert(!DstTy.isVector());
1779
1780 const AMDGPUTargetMachine &TM
1781 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1782
1783 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1784 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1785 return true;
1786 }
1787
1788 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1789 // Truncate.
1790 B.buildExtract(Dst, Src, 0);
1791 MI.eraseFromParent();
1792 return true;
1793 }
1794
1795 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1796 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1797 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1798
1799     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1800     // another pointer. Merge operands are required to be the same type, but
1801     // creating an extra ptrtoint would be kind of pointless.
1802 auto HighAddr = B.buildConstant(
1803 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1804 B.buildMerge(Dst, {Src, HighAddr});
1805 MI.eraseFromParent();
1806 return true;
1807 }
1808
1809 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1810 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1811 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1812 unsigned NullVal = TM.getNullPointerValue(DestAS);
1813
1814 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1815 auto FlatNull = B.buildConstant(SrcTy, 0);
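    // Flat pointers into LDS/private carry the segment offset in their low 32
    // bits, so the cast is an extract of the low half; the only wrinkle is
    // that the flat null pointer must map to the segment's null value, which
    // is not necessarily 0 for these address spaces.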
1816
1817 // Extract low 32-bits of the pointer.
1818 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1819
1820 auto CmpRes =
1821 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1822 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1823
1824 MI.eraseFromParent();
1825 return true;
1826 }
1827
1828 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1829 return false;
1830
1831 if (!ST.hasFlatAddressSpace())
1832 return false;
1833
1834 auto SegmentNull =
1835 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1836 auto FlatNull =
1837 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
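  // The reverse direction builds a 64-bit flat pointer by pairing the 32-bit
  // segment offset with the aperture base in the high half, again mapping the
  // segment null value to the flat null pointer.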
1838
1839 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1840 if (!ApertureReg.isValid())
1841 return false;
1842
1843 auto CmpRes =
1844 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1845
1846 // Coerce the type of the low half of the result so we can use merge_values.
1847 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1848
1849 // TODO: Should we allow mismatched types but matching sizes in merges to
1850 // avoid the ptrtoint?
1851 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1852 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1853
1854 MI.eraseFromParent();
1855 return true;
1856 }
1857
1858 bool AMDGPULegalizerInfo::legalizeFrint(
1859 MachineInstr &MI, MachineRegisterInfo &MRI,
1860 MachineIRBuilder &B) const {
1861 Register Src = MI.getOperand(1).getReg();
1862 LLT Ty = MRI.getType(Src);
1863 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1864
1865 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1866 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
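  // This is the usual round-to-integer trick: adding and then subtracting
  // 2^52 (with the sign of the input) leaves no fractional bits, so the FPU
  // rounds the value to an integer. Inputs whose magnitude already exceeds
  // the largest value below 2^52 are integral and are passed through by the
  // final select.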
1867
1868 auto C1 = B.buildFConstant(Ty, C1Val);
1869 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1870
1871 // TODO: Should this propagate fast-math-flags?
1872 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1873 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1874
1875 auto C2 = B.buildFConstant(Ty, C2Val);
1876 auto Fabs = B.buildFAbs(Ty, Src);
1877
1878 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1879 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1880 MI.eraseFromParent();
1881 return true;
1882 }
1883
1884 bool AMDGPULegalizerInfo::legalizeFceil(
1885 MachineInstr &MI, MachineRegisterInfo &MRI,
1886 MachineIRBuilder &B) const {
1887
1888 const LLT S1 = LLT::scalar(1);
1889 const LLT S64 = LLT::scalar(64);
1890
1891 Register Src = MI.getOperand(1).getReg();
1892 assert(MRI.getType(Src) == S64);
1893
1894 // result = trunc(src)
1895 // if (src > 0.0 && src != result)
1896 // result += 1.0
1897
1898 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1899
1900 const auto Zero = B.buildFConstant(S64, 0.0);
1901 const auto One = B.buildFConstant(S64, 1.0);
1902 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1903 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1904 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1905 auto Add = B.buildSelect(S64, And, One, Zero);
1906
1907 // TODO: Should this propagate fast-math-flags?
1908   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
       MI.eraseFromParent();
1909   return true;
1910 }
1911
1912 bool AMDGPULegalizerInfo::legalizeFrem(
1913 MachineInstr &MI, MachineRegisterInfo &MRI,
1914 MachineIRBuilder &B) const {
1915 Register DstReg = MI.getOperand(0).getReg();
1916 Register Src0Reg = MI.getOperand(1).getReg();
1917 Register Src1Reg = MI.getOperand(2).getReg();
1918 auto Flags = MI.getFlags();
1919 LLT Ty = MRI.getType(DstReg);
1920
1921 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1922 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1923 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1924 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1925 MI.eraseFromParent();
1926 return true;
1927 }
1928
1929 static MachineInstrBuilder extractF64Exponent(Register Hi,
1930 MachineIRBuilder &B) {
1931 const unsigned FractBits = 52;
1932 const unsigned ExpBits = 11;
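  // An IEEE-754 double is sign(1) | exponent(11) | fraction(52), so the high
  // 32-bit word holds the biased exponent in bits [20, 30]. The ubfe below
  // extracts those bits and the subtract removes the 1023 bias.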
1933 LLT S32 = LLT::scalar(32);
1934
1935 auto Const0 = B.buildConstant(S32, FractBits - 32);
1936 auto Const1 = B.buildConstant(S32, ExpBits);
1937
1938 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1939 .addUse(Hi)
1940 .addUse(Const0.getReg(0))
1941 .addUse(Const1.getReg(0));
1942
1943 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1944 }
1945
1946 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1947 MachineInstr &MI, MachineRegisterInfo &MRI,
1948 MachineIRBuilder &B) const {
1949 const LLT S1 = LLT::scalar(1);
1950 const LLT S32 = LLT::scalar(32);
1951 const LLT S64 = LLT::scalar(64);
1952
1953 Register Src = MI.getOperand(1).getReg();
1954 assert(MRI.getType(Src) == S64);
1955
1956 // TODO: Should this use extract since the low half is unused?
1957 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1958 Register Hi = Unmerge.getReg(1);
1959
1960 // Extract the upper half, since this is where we will find the sign and
1961 // exponent.
1962 auto Exp = extractF64Exponent(Hi, B);
1963
1964 const unsigned FractBits = 52;
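  // Truncation is done by clearing the fraction bits below the binary point:
  // for an unbiased exponent E there are 52 - E such bits. If E < 0 the
  // magnitude is below 1 and only the sign survives; if E > 51 the value is
  // already an integer and the source is returned unchanged.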
1965
1966 // Extract the sign bit.
1967 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1968 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1969
1970 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1971
1972 const auto Zero32 = B.buildConstant(S32, 0);
1973
1974 // Extend back to 64-bits.
1975 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1976
1977 auto Shr = B.buildAShr(S64, FractMask, Exp);
1978 auto Not = B.buildNot(S64, Shr);
1979 auto Tmp0 = B.buildAnd(S64, Src, Not);
1980 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1981
1982 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1983 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1984
1985 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1986 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1987 MI.eraseFromParent();
1988 return true;
1989 }
1990
1991 bool AMDGPULegalizerInfo::legalizeITOFP(
1992 MachineInstr &MI, MachineRegisterInfo &MRI,
1993 MachineIRBuilder &B, bool Signed) const {
1994
1995 Register Dst = MI.getOperand(0).getReg();
1996 Register Src = MI.getOperand(1).getReg();
1997
1998 const LLT S64 = LLT::scalar(64);
1999 const LLT S32 = LLT::scalar(32);
2000
2001 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2002
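  // Convert the halves separately and recombine: fp(x) = fp(hi) * 2^32 +
  // fp(lo), where only the high half is treated as signed for G_SITOFP.
  // The ldexp below performs the exact scale by 2^32.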
2003 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2004
2005 auto CvtHi = Signed ?
2006 B.buildSITOFP(S64, Unmerge.getReg(1)) :
2007 B.buildUITOFP(S64, Unmerge.getReg(1));
2008
2009 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2010
2011 auto ThirtyTwo = B.buildConstant(S32, 32);
2012 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2013 .addUse(CvtHi.getReg(0))
2014 .addUse(ThirtyTwo.getReg(0));
2015
2016 // TODO: Should this propagate fast-math-flags?
2017 B.buildFAdd(Dst, LdExp, CvtLo);
2018 MI.eraseFromParent();
2019 return true;
2020 }
2021
2022 // TODO: Copied from DAG implementation. Verify logic and document how this
2023 // actually works.
2024 bool AMDGPULegalizerInfo::legalizeFPTOI(
2025 MachineInstr &MI, MachineRegisterInfo &MRI,
2026 MachineIRBuilder &B, bool Signed) const {
2027
2028 Register Dst = MI.getOperand(0).getReg();
2029 Register Src = MI.getOperand(1).getReg();
2030
2031 const LLT S64 = LLT::scalar(64);
2032 const LLT S32 = LLT::scalar(32);
2033
2034 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2035
2036 unsigned Flags = MI.getFlags();
2037
2038 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2039 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2040 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
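  // K0 is 2^-32 and K1 is -(2^32): Hi = floor(Trunc * 2^-32) and
  // Lo = fma(Hi, -(2^32), Trunc) split the truncated value into two halves
  // that each fit the 32-bit hardware fp-to-int conversions.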
2041
2042 auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2043 auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2044 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2045
2046 auto Hi = Signed ?
2047 B.buildFPTOSI(S32, FloorMul) :
2048 B.buildFPTOUI(S32, FloorMul);
2049 auto Lo = B.buildFPTOUI(S32, Fma);
2050
2051 B.buildMerge(Dst, { Lo, Hi });
2052 MI.eraseFromParent();
2053
2054 return true;
2055 }
2056
2057 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2058 MachineInstr &MI) const {
2059 MachineFunction &MF = Helper.MIRBuilder.getMF();
2060 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2061
2062 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2063 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2064
2065 // With ieee_mode disabled, the instructions have the correct behavior
2066 // already for G_FMINNUM/G_FMAXNUM
2067 if (!MFI->getMode().IEEE)
2068 return !IsIEEEOp;
2069
2070 if (IsIEEEOp)
2071 return true;
2072
2073 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2074 }
2075
2076 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2077 MachineInstr &MI, MachineRegisterInfo &MRI,
2078 MachineIRBuilder &B) const {
2079 // TODO: Should move some of this into LegalizerHelper.
2080
2081 // TODO: Promote dynamic indexing of s16 to s32
2082
2083 // FIXME: Artifact combiner probably should have replaced the truncated
2084 // constant before this, so we shouldn't need
2085 // getConstantVRegValWithLookThrough.
2086 Optional<ValueAndVReg> MaybeIdxVal =
2087 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2088 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2089 return true;
2090 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2091
2092 Register Dst = MI.getOperand(0).getReg();
2093 Register Vec = MI.getOperand(1).getReg();
2094
2095 LLT VecTy = MRI.getType(Vec);
2096 LLT EltTy = VecTy.getElementType();
2097 assert(EltTy == MRI.getType(Dst));
2098
2099 if (IdxVal < VecTy.getNumElements())
2100 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2101 else
2102 B.buildUndef(Dst);
2103
2104 MI.eraseFromParent();
2105 return true;
2106 }
2107
2108 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2109 MachineInstr &MI, MachineRegisterInfo &MRI,
2110 MachineIRBuilder &B) const {
2111 // TODO: Should move some of this into LegalizerHelper.
2112
2113 // TODO: Promote dynamic indexing of s16 to s32
2114
2115 // FIXME: Artifact combiner probably should have replaced the truncated
2116 // constant before this, so we shouldn't need
2117 // getConstantVRegValWithLookThrough.
2118 Optional<ValueAndVReg> MaybeIdxVal =
2119 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2120 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2121 return true;
2122
2123 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2124 Register Dst = MI.getOperand(0).getReg();
2125 Register Vec = MI.getOperand(1).getReg();
2126 Register Ins = MI.getOperand(2).getReg();
2127
2128 LLT VecTy = MRI.getType(Vec);
2129 LLT EltTy = VecTy.getElementType();
2130 assert(EltTy == MRI.getType(Ins));
2131
2132 if (IdxVal < VecTy.getNumElements())
2133 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2134 else
2135 B.buildUndef(Dst);
2136
2137 MI.eraseFromParent();
2138 return true;
2139 }
2140
2141 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2142 MachineInstr &MI, MachineRegisterInfo &MRI,
2143 MachineIRBuilder &B) const {
2144 const LLT V2S16 = LLT::vector(2, 16);
2145
2146 Register Dst = MI.getOperand(0).getReg();
2147 Register Src0 = MI.getOperand(1).getReg();
2148 LLT DstTy = MRI.getType(Dst);
2149 LLT SrcTy = MRI.getType(Src0);
2150
2151 if (SrcTy == V2S16 && DstTy == V2S16 &&
2152 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2153 return true;
2154
2155 MachineIRBuilder HelperBuilder(MI);
2156 GISelObserverWrapper DummyObserver;
2157 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2158 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2159 }
2160
2161 bool AMDGPULegalizerInfo::legalizeSinCos(
2162 MachineInstr &MI, MachineRegisterInfo &MRI,
2163 MachineIRBuilder &B) const {
2164
2165 Register DstReg = MI.getOperand(0).getReg();
2166 Register SrcReg = MI.getOperand(1).getReg();
2167 LLT Ty = MRI.getType(DstReg);
2168 unsigned Flags = MI.getFlags();
2169
2170 Register TrigVal;
2171 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
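  // The hardware sin/cos intrinsics take an argument pre-scaled by 1/(2*pi),
  // i.e. they compute sin/cos of (2*pi * x). Targets with the reduced trig
  // input range additionally need the scaled value wrapped into [0, 1) with
  // amdgcn.fract first.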
2172 if (ST.hasTrigReducedRange()) {
2173 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2174 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2175 .addUse(MulVal.getReg(0))
2176 .setMIFlags(Flags).getReg(0);
2177 } else
2178 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2179
2180 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2181 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2182 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2183 .addUse(TrigVal)
2184 .setMIFlags(Flags);
2185 MI.eraseFromParent();
2186 return true;
2187 }
2188
2189 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2190 MachineIRBuilder &B,
2191 const GlobalValue *GV,
2192 int64_t Offset,
2193 unsigned GAFlags) const {
2194 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2195 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2196 // to the following code sequence:
2197 //
2198 // For constant address space:
2199 // s_getpc_b64 s[0:1]
2200 // s_add_u32 s0, s0, $symbol
2201 // s_addc_u32 s1, s1, 0
2202 //
2203 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2204 // a fixup or relocation is emitted to replace $symbol with a literal
2205 // constant, which is a pc-relative offset from the encoding of the $symbol
2206 // operand to the global variable.
2207 //
2208 // For global address space:
2209 // s_getpc_b64 s[0:1]
2210 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2211 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2212 //
2213 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2214 // fixups or relocations are emitted to replace $symbol@*@lo and
2215 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2216 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2217 // operand to the global variable.
2218 //
2219 // What we want here is an offset from the value returned by s_getpc
2220 // (which is the address of the s_add_u32 instruction) to the global
2221 // variable, but since the encoding of $symbol starts 4 bytes after the start
2222 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2223 // small. This requires us to add 4 to the global variable offset in order to
2224 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2225 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2226 // instruction.
2227
2228 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2229
2230 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2231 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2232
2233 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2234 .addDef(PCReg);
2235
2236 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2237 if (GAFlags == SIInstrInfo::MO_NONE)
2238 MIB.addImm(0);
2239 else
2240 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2241
2242 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2243
2244 if (PtrTy.getSizeInBits() == 32)
2245 B.buildExtract(DstReg, PCReg, 0);
2246 return true;
2247 }
2248
2249 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2250 MachineInstr &MI, MachineRegisterInfo &MRI,
2251 MachineIRBuilder &B) const {
2252 Register DstReg = MI.getOperand(0).getReg();
2253 LLT Ty = MRI.getType(DstReg);
2254 unsigned AS = Ty.getAddressSpace();
2255
2256 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2257 MachineFunction &MF = B.getMF();
2258 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2259
2260 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2261 if (!MFI->isModuleEntryFunction()) {
2262 const Function &Fn = MF.getFunction();
2263 DiagnosticInfoUnsupported BadLDSDecl(
2264 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2265 DS_Warning);
2266 Fn.getContext().diagnose(BadLDSDecl);
2267
2268 // We currently don't have a way to correctly allocate LDS objects that
2269 // aren't directly associated with a kernel. We do force inlining of
2270 // functions that use local objects. However, if these dead functions are
2271 // not eliminated, we don't want a compile time error. Just emit a warning
2272 // and a trap, since there should be no callable path here.
2273 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2274 B.buildUndef(DstReg);
2275 MI.eraseFromParent();
2276 return true;
2277 }
2278
2279 // TODO: We could emit code to handle the initialization somewhere.
2280 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2281 const SITargetLowering *TLI = ST.getTargetLowering();
2282 if (!TLI->shouldUseLDSConstAddress(GV)) {
2283 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2284 return true; // Leave in place;
2285 }
2286
2287 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2288 Type *Ty = GV->getValueType();
2289 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2290 // zero-sized type in other languages to declare the dynamic shared
2291         // memory whose size is not known at compile time. Such arrays are
2292         // allocated by the runtime and placed directly after the statically
2293         // allocated ones, so they all share the same offset.
2294 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2295 // Adjust alignment for that dynamic shared memory array.
2296 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2297 LLT S32 = LLT::scalar(32);
2298 auto Sz =
2299 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2300 B.buildIntToPtr(DstReg, Sz);
2301 MI.eraseFromParent();
2302 return true;
2303 }
2304 }
2305
2306 B.buildConstant(
2307 DstReg,
2308 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2309 MI.eraseFromParent();
2310 return true;
2311 }
2312
2313 const Function &Fn = MF.getFunction();
2314 DiagnosticInfoUnsupported BadInit(
2315 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2316 Fn.getContext().diagnose(BadInit);
2317 return true;
2318 }
2319
2320 const SITargetLowering *TLI = ST.getTargetLowering();
2321
2322 if (TLI->shouldEmitFixup(GV)) {
2323 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2324 MI.eraseFromParent();
2325 return true;
2326 }
2327
2328 if (TLI->shouldEmitPCReloc(GV)) {
2329 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2330 MI.eraseFromParent();
2331 return true;
2332 }
2333
2334 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2335 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2336
2337 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2338 MachinePointerInfo::getGOT(MF),
2339 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2340 MachineMemOperand::MOInvariant,
2341 8 /*Size*/, Align(8));
2342
2343 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2344
2345 if (Ty.getSizeInBits() == 32) {
2346     // Truncate if this is a 32-bit constant address.
2347 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2348 B.buildExtract(DstReg, Load, 0);
2349 } else
2350 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2351
2352 MI.eraseFromParent();
2353 return true;
2354 }
2355
2356 static LLT widenToNextPowerOf2(LLT Ty) {
2357 if (Ty.isVector())
2358 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2359 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2360 }
2361
2362 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2363 MachineInstr &MI) const {
2364 MachineIRBuilder &B = Helper.MIRBuilder;
2365 MachineRegisterInfo &MRI = *B.getMRI();
2366 GISelChangeObserver &Observer = Helper.Observer;
2367
2368 Register PtrReg = MI.getOperand(1).getReg();
2369 LLT PtrTy = MRI.getType(PtrReg);
2370 unsigned AddrSpace = PtrTy.getAddressSpace();
2371
2372 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2373 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2374 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2375 Observer.changingInstr(MI);
2376 MI.getOperand(1).setReg(Cast.getReg(0));
2377 Observer.changedInstr(MI);
2378 return true;
2379 }
2380
2381 Register ValReg = MI.getOperand(0).getReg();
2382 LLT ValTy = MRI.getType(ValReg);
2383
2384 MachineMemOperand *MMO = *MI.memoperands_begin();
2385 const unsigned ValSize = ValTy.getSizeInBits();
2386 const unsigned MemSize = 8 * MMO->getSize();
2387 const Align MemAlign = MMO->getAlign();
2388 const unsigned AlignInBits = 8 * MemAlign.value();
2389
2390 // Widen non-power-of-2 loads to the alignment if needed
2391 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2392 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2393
2394 // This was already the correct extending load result type, so just adjust
2395 // the memory type.
2396 if (WideMemSize == ValSize) {
2397 MachineFunction &MF = B.getMF();
2398
2399 MachineMemOperand *WideMMO =
2400 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2401 Observer.changingInstr(MI);
2402 MI.setMemRefs(MF, {WideMMO});
2403 Observer.changedInstr(MI);
2404 return true;
2405 }
2406
2407 // Don't bother handling edge case that should probably never be produced.
2408 if (ValSize > WideMemSize)
2409 return false;
2410
2411 LLT WideTy = widenToNextPowerOf2(ValTy);
2412
2413 Register WideLoad;
2414 if (!WideTy.isVector()) {
2415 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2416 B.buildTrunc(ValReg, WideLoad).getReg(0);
2417 } else {
2418 // Extract the subvector.
2419
2420 if (isRegisterType(ValTy)) {
2421 // If this a case where G_EXTRACT is legal, use it.
2422 // (e.g. <3 x s32> -> <4 x s32>)
2423 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2424 B.buildExtract(ValReg, WideLoad, 0);
2425 } else {
2426 // For cases where the widened type isn't a nice register value, unmerge
2427 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2428 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2429 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2430 B.setInsertPt(B.getMBB(), MI.getIterator());
2431 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2432 }
2433 }
2434
2435 MI.eraseFromParent();
2436 return true;
2437 }
2438
2439 return false;
2440 }
2441
2442 bool AMDGPULegalizerInfo::legalizeFMad(
2443 MachineInstr &MI, MachineRegisterInfo &MRI,
2444 MachineIRBuilder &B) const {
2445 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2446 assert(Ty.isScalar());
2447
2448 MachineFunction &MF = B.getMF();
2449 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2450
2451 // TODO: Always legal with future ftz flag.
2452 // FIXME: Do we need just output?
2453 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2454 return true;
2455 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2456 return true;
2457
2458 MachineIRBuilder HelperBuilder(MI);
2459 GISelObserverWrapper DummyObserver;
2460 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2461 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2462 }
2463
2464 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2465 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2466 Register DstReg = MI.getOperand(0).getReg();
2467 Register PtrReg = MI.getOperand(1).getReg();
2468 Register CmpVal = MI.getOperand(2).getReg();
2469 Register NewVal = MI.getOperand(3).getReg();
2470
2471 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2472 "this should not have been custom lowered");
2473
2474 LLT ValTy = MRI.getType(CmpVal);
2475 LLT VecTy = LLT::vector(2, ValTy);
2476
2477 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2478
2479 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2480 .addDef(DstReg)
2481 .addUse(PtrReg)
2482 .addUse(PackedVal)
2483 .setMemRefs(MI.memoperands());
2484
2485 MI.eraseFromParent();
2486 return true;
2487 }
2488
2489 bool AMDGPULegalizerInfo::legalizeFlog(
2490 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2491 Register Dst = MI.getOperand(0).getReg();
2492 Register Src = MI.getOperand(1).getReg();
2493 LLT Ty = B.getMRI()->getType(Dst);
2494 unsigned Flags = MI.getFlags();
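  // Uses log_b(x) = log2(x) * log_b(2): the callers pass ln(2) for G_FLOG and
  // ln(2)/ln(10) for G_FLOG10, which are log_e(2) and log_10(2) respectively.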
2495
2496 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2497 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2498
2499 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2500 MI.eraseFromParent();
2501 return true;
2502 }
2503
2504 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2505 MachineIRBuilder &B) const {
2506 Register Dst = MI.getOperand(0).getReg();
2507 Register Src = MI.getOperand(1).getReg();
2508 unsigned Flags = MI.getFlags();
2509 LLT Ty = B.getMRI()->getType(Dst);
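  // exp(x) is expanded as exp2(x * log2(e)), since only the base-2
  // exponential is available natively.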
2510
2511 auto K = B.buildFConstant(Ty, numbers::log2e);
2512 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2513 B.buildFExp2(Dst, Mul, Flags);
2514 MI.eraseFromParent();
2515 return true;
2516 }
2517
2518 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2519 MachineIRBuilder &B) const {
2520 Register Dst = MI.getOperand(0).getReg();
2521 Register Src0 = MI.getOperand(1).getReg();
2522 Register Src1 = MI.getOperand(2).getReg();
2523 unsigned Flags = MI.getFlags();
2524 LLT Ty = B.getMRI()->getType(Dst);
2525 const LLT S16 = LLT::scalar(16);
2526 const LLT S32 = LLT::scalar(32);
2527
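  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply presumably uses
  // fmul_legacy so that 0 * inf style corner cases fold to 0, matching the
  // DAG expansion.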
2528 if (Ty == S32) {
2529 auto Log = B.buildFLog2(S32, Src0, Flags);
2530 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2531 .addUse(Log.getReg(0))
2532 .addUse(Src1)
2533 .setMIFlags(Flags);
2534 B.buildFExp2(Dst, Mul, Flags);
2535 } else if (Ty == S16) {
2536 // There's no f16 fmul_legacy, so we need to convert for it.
2537 auto Log = B.buildFLog2(S16, Src0, Flags);
2538 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2539 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2540 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2541 .addUse(Ext0.getReg(0))
2542 .addUse(Ext1.getReg(0))
2543 .setMIFlags(Flags);
2544
2545 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2546 } else
2547 return false;
2548
2549 MI.eraseFromParent();
2550 return true;
2551 }
2552
2553 // Find a source register, ignoring any possible source modifiers.
2554 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2555 Register ModSrc = OrigSrc;
2556 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2557 ModSrc = SrcFNeg->getOperand(1).getReg();
2558 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2559 ModSrc = SrcFAbs->getOperand(1).getReg();
2560 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2561 ModSrc = SrcFAbs->getOperand(1).getReg();
2562 return ModSrc;
2563 }
2564
2565 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2566 MachineRegisterInfo &MRI,
2567 MachineIRBuilder &B) const {
2568
2569 const LLT S1 = LLT::scalar(1);
2570 const LLT S64 = LLT::scalar(64);
2571 Register Dst = MI.getOperand(0).getReg();
2572 Register OrigSrc = MI.getOperand(1).getReg();
2573 unsigned Flags = MI.getFlags();
2574 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2575 "this should not have been custom lowered");
2576
2577 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2578 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2579 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2580 // V_FRACT bug is:
2581 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2582 //
2583 // Convert floor(x) to (x - fract(x))
2584
2585 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2586 .addUse(OrigSrc)
2587 .setMIFlags(Flags);
2588
2589 // Give source modifier matching some assistance before obscuring a foldable
2590 // pattern.
2591
2592 // TODO: We can avoid the neg on the fract? The input sign to fract
2593 // shouldn't matter?
2594 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2595
2596 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2597
2598 Register Min = MRI.createGenericVirtualRegister(S64);
2599
2600 // We don't need to concern ourselves with the snan handling difference, so
2601 // use the one which will directly select.
2602 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2603 if (MFI->getMode().IEEE)
2604 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2605 else
2606 B.buildFMinNum(Min, Fract, Const, Flags);
2607
2608 Register CorrectedFract = Min;
2609 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2610 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2611 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2612 }
2613
2614 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2615 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2616
2617 MI.eraseFromParent();
2618 return true;
2619 }
2620
2621 // Turn an illegal packed v2s16 build vector into bit operations.
2622 // TODO: This should probably be a bitcast action in LegalizerHelper.
2623 bool AMDGPULegalizerInfo::legalizeBuildVector(
2624 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2625 Register Dst = MI.getOperand(0).getReg();
2626 const LLT S32 = LLT::scalar(32);
2627 assert(MRI.getType(Dst) == LLT::vector(2, 16));
2628
2629 Register Src0 = MI.getOperand(1).getReg();
2630 Register Src1 = MI.getOperand(2).getReg();
2631 assert(MRI.getType(Src0) == LLT::scalar(16));
2632
2633 auto Merge = B.buildMerge(S32, {Src0, Src1});
2634 B.buildBitcast(Dst, Merge);
2635
2636 MI.eraseFromParent();
2637 return true;
2638 }
2639
2640 // Check that this is a G_XOR x, -1
2641 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2642 if (MI.getOpcode() != TargetOpcode::G_XOR)
2643 return false;
2644 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2645 return ConstVal && *ConstVal == -1;
2646 }
2647
2648 // Return the use branch instruction, otherwise null if the usage is invalid.
2649 static MachineInstr *
2650 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2651 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2652 Register CondDef = MI.getOperand(0).getReg();
2653 if (!MRI.hasOneNonDBGUse(CondDef))
2654 return nullptr;
2655
2656 MachineBasicBlock *Parent = MI.getParent();
2657 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2658
2659 if (isNot(MRI, *UseMI)) {
2660 Register NegatedCond = UseMI->getOperand(0).getReg();
2661 if (!MRI.hasOneNonDBGUse(NegatedCond))
2662 return nullptr;
2663
2664 // We're deleting the def of this value, so we need to remove it.
2665 UseMI->eraseFromParent();
2666
2667 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2668 Negated = true;
2669 }
2670
2671 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2672 return nullptr;
2673
2674 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2675 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2676 if (Next == Parent->end()) {
2677 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2678 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2679 return nullptr;
2680 UncondBrTarget = &*NextMBB;
2681 } else {
2682 if (Next->getOpcode() != AMDGPU::G_BR)
2683 return nullptr;
2684 Br = &*Next;
2685 UncondBrTarget = Br->getOperand(0).getMBB();
2686 }
2687
2688 return UseMI;
2689 }
2690
2691 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2692 const ArgDescriptor *Arg,
2693 const TargetRegisterClass *ArgRC,
2694 LLT ArgTy) const {
2695 MCRegister SrcReg = Arg->getRegister();
2696 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2697 assert(DstReg.isVirtual() && "Virtual register expected");
2698
2699 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2700 ArgTy);
2701 if (Arg->isMasked()) {
2702 // TODO: Should we try to emit this once in the entry block?
2703 const LLT S32 = LLT::scalar(32);
2704 const unsigned Mask = Arg->getMask();
2705 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
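    // Some inputs (e.g. the packed work-item IDs) share one register; the
    // mask selects the field and the shift moves it down to bit 0, giving
    // (LiveIn >> Shift) & (Mask >> Shift).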
2706
2707 Register AndMaskSrc = LiveIn;
2708
2709 if (Shift != 0) {
2710 auto ShiftAmt = B.buildConstant(S32, Shift);
2711 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2712 }
2713
2714 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2715 } else {
2716 B.buildCopy(DstReg, LiveIn);
2717 }
2718
2719 return true;
2720 }
2721
2722 bool AMDGPULegalizerInfo::loadInputValue(
2723 Register DstReg, MachineIRBuilder &B,
2724 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2725 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2726 const ArgDescriptor *Arg;
2727 const TargetRegisterClass *ArgRC;
2728 LLT ArgTy;
2729 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2730
2731 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2732 return false; // TODO: Handle these
2733 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2734 }
2735
2736 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2737 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2738 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2739 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2740 return false;
2741
2742 MI.eraseFromParent();
2743 return true;
2744 }
2745
2746 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2747 MachineRegisterInfo &MRI,
2748 MachineIRBuilder &B) const {
2749 Register Dst = MI.getOperand(0).getReg();
2750 LLT DstTy = MRI.getType(Dst);
2751 LLT S16 = LLT::scalar(16);
2752 LLT S32 = LLT::scalar(32);
2753 LLT S64 = LLT::scalar(64);
2754
2755 if (DstTy == S16)
2756 return legalizeFDIV16(MI, MRI, B);
2757 if (DstTy == S32)
2758 return legalizeFDIV32(MI, MRI, B);
2759 if (DstTy == S64)
2760 return legalizeFDIV64(MI, MRI, B);
2761
2762 return false;
2763 }
2764
2765 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2766 Register DstReg,
2767 Register X,
2768 Register Y,
2769 bool IsDiv) const {
2770 const LLT S1 = LLT::scalar(1);
2771 const LLT S32 = LLT::scalar(32);
2772
2773 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2774 // algorithm used here.
2775
2776 // Initial estimate of inv(y).
2777 auto FloatY = B.buildUITOFP(S32, Y);
2778 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2779 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2780 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2781 auto Z = B.buildFPTOUI(S32, ScaledY);
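  // Z now approximates 2^32 / Y as a 32-bit fixed-point reciprocal; the
  // 0x4f7ffffe scale keeps the estimate slightly below 2^32 so the reciprocal
  // under-approximates. The step below refines it as Z += umulh(Z, -Y * Z),
  // a Newton-Raphson style correction.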
2782
2783 // One round of UNR.
2784 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2785 auto NegYZ = B.buildMul(S32, NegY, Z);
2786 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2787
2788 // Quotient/remainder estimate.
2789 auto Q = B.buildUMulH(S32, X, Z);
2790 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2791
2792 // First quotient/remainder refinement.
2793 auto One = B.buildConstant(S32, 1);
2794 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2795 if (IsDiv)
2796 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2797 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2798
2799 // Second quotient/remainder refinement.
2800 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2801 if (IsDiv)
2802 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2803 else
2804 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2805 }
2806
2807 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2808 MachineRegisterInfo &MRI,
2809 MachineIRBuilder &B) const {
2810 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2811 Register DstReg = MI.getOperand(0).getReg();
2812 Register Num = MI.getOperand(1).getReg();
2813 Register Den = MI.getOperand(2).getReg();
2814 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2815 MI.eraseFromParent();
2816 return true;
2817 }
2818
2819 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2820 //
2821 // Return lo, hi of result
2822 //
2823 // %cvt.lo = G_UITOFP Val.lo
2824 // %cvt.hi = G_UITOFP Val.hi
2825 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2826 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2827 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2828 // %mul2 = G_FMUL %mul1, 2**(-32)
2829 // %trunc = G_INTRINSIC_TRUNC %mul2
2830 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2831 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2832 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2833 Register Val) {
2834 const LLT S32 = LLT::scalar(32);
2835 auto Unmerge = B.buildUnmerge(S32, Val);
2836
2837 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2838 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2839
2840 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2841 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2842
2843 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2844 auto Mul1 =
2845 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2846
2847 // 2**(-32)
2848 auto Mul2 =
2849 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2850 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2851
2852 // -(2**32)
2853 auto Mad2 = B.buildFMAD(S32, Trunc,
2854 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2855
2856 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2857 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2858
2859 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2860 }
2861
2862 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2863 Register DstReg,
2864 Register Numer,
2865 Register Denom,
2866 bool IsDiv) const {
2867 const LLT S32 = LLT::scalar(32);
2868 const LLT S64 = LLT::scalar(64);
2869 const LLT S1 = LLT::scalar(1);
2870 Register RcpLo, RcpHi;
2871
2872 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2873
2874 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
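  // 64-bit variant of the scheme above: Rcp approximates 2^64 / Denom, two
  // refinement rounds use 64x64 high multiplies, and the quotient/remainder
  // estimate is corrected by at most two conditional subtractions of Denom
  // (the C3/C6 selects at the end).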
2875
2876 auto Zero64 = B.buildConstant(S64, 0);
2877 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2878
2879 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2880 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2881
2882 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2883 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2884 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2885
2886 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2887 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2888 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2889 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2890
2891 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2892 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2893 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2894 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2895 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2896
2897 auto Zero32 = B.buildConstant(S32, 0);
2898 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2899 auto Add2_HiC =
2900 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2901 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2902 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2903
2904 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2905 Register NumerLo = UnmergeNumer.getReg(0);
2906 Register NumerHi = UnmergeNumer.getReg(1);
2907
2908 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2909 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2910 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2911 Register Mul3_Lo = UnmergeMul3.getReg(0);
2912 Register Mul3_Hi = UnmergeMul3.getReg(1);
2913 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2914 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2915 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2916 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2917
2918 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2919 Register DenomLo = UnmergeDenom.getReg(0);
2920 Register DenomHi = UnmergeDenom.getReg(1);
2921
2922 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2923 auto C1 = B.buildSExt(S32, CmpHi);
2924
2925 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2926 auto C2 = B.buildSExt(S32, CmpLo);
2927
2928 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2929 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2930
2931   // TODO: Here and below, portions of the code could be enclosed in if/endif
2932   // blocks. Currently the control flow is unconditional and we use 4 selects
2933   // after the potential endif in place of PHIs.
2934
2935 // if C3 != 0 ...
2936 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2937 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2938 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2939 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2940
2941 auto One64 = B.buildConstant(S64, 1);
2942 auto Add3 = B.buildAdd(S64, MulHi3, One64);
2943
2944 auto C4 =
2945 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2946 auto C5 =
2947 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2948 auto C6 = B.buildSelect(
2949 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2950
2951 // if (C6 != 0)
2952 auto Add4 = B.buildAdd(S64, Add3, One64);
2953 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2954
2955 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2956 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2957 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2958
2959 // endif C6
2960 // endif C3
2961
2962 if (IsDiv) {
2963 auto Sel1 = B.buildSelect(
2964 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2965 B.buildSelect(DstReg,
2966 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2967 } else {
2968 auto Sel2 = B.buildSelect(
2969 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2970 B.buildSelect(DstReg,
2971 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2972 }
2973 }
2974
2975 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2976 MachineRegisterInfo &MRI,
2977 MachineIRBuilder &B) const {
2978 const LLT S64 = LLT::scalar(64);
2979 const LLT S32 = LLT::scalar(32);
2980 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2981 Register DstReg = MI.getOperand(0).getReg();
2982 Register Num = MI.getOperand(1).getReg();
2983 Register Den = MI.getOperand(2).getReg();
2984 LLT Ty = MRI.getType(DstReg);
2985
2986 if (Ty == S32)
2987 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2988 else if (Ty == S64)
2989 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2990 else
2991 return false;
2992
2993 MI.eraseFromParent();
2994 return true;
2995
2996 }
2997
2998 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2999 MachineRegisterInfo &MRI,
3000 MachineIRBuilder &B) const {
3001 const LLT S64 = LLT::scalar(64);
3002 const LLT S32 = LLT::scalar(32);
3003
3004 Register DstReg = MI.getOperand(0).getReg();
3005 const LLT Ty = MRI.getType(DstReg);
3006 if (Ty != S32 && Ty != S64)
3007 return false;
3008
3009 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3010
3011 Register LHS = MI.getOperand(1).getReg();
3012 Register RHS = MI.getOperand(2).getReg();
3013
3014 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3015 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3016 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3017
3018 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3019 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3020
3021 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3022 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
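  // LHSign/RHSign are all-ones for negative inputs, so (x + sign) ^ sign is
  // |x| in two's complement. The unsigned result below is negated the same
  // way when the expected sign (LHSign ^ RHSign for the quotient, LHSign for
  // the remainder) is negative.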
3023
3024 Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3025 if (Ty == S32)
3026 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3027 else
3028 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3029
3030 Register Sign;
3031 if (IsDiv)
3032 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3033 else
3034 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3035
3036 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3037 B.buildSub(DstReg, UDivRem, Sign);
3038
3039 MI.eraseFromParent();
3040 return true;
3041 }
3042
3043 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3044 MachineRegisterInfo &MRI,
3045 MachineIRBuilder &B) const {
3046 Register Res = MI.getOperand(0).getReg();
3047 Register LHS = MI.getOperand(1).getReg();
3048 Register RHS = MI.getOperand(2).getReg();
3049 uint16_t Flags = MI.getFlags();
3050 LLT ResTy = MRI.getType(Res);
3051
3052 const MachineFunction &MF = B.getMF();
3053 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3054 MI.getFlag(MachineInstr::FmAfn);
3055
3056 if (!AllowInaccurateRcp)
3057 return false;
3058
3059 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3060 // 1 / x -> RCP(x)
3061 if (CLHS->isExactlyValue(1.0)) {
3062 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3063 .addUse(RHS)
3064 .setMIFlags(Flags);
3065
3066 MI.eraseFromParent();
3067 return true;
3068 }
3069
3070 // -1 / x -> RCP( FNEG(x) )
3071 if (CLHS->isExactlyValue(-1.0)) {
3072 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3073 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3074 .addUse(FNeg.getReg(0))
3075 .setMIFlags(Flags);
3076
3077 MI.eraseFromParent();
3078 return true;
3079 }
3080 }
3081
3082 // x / y -> x * (1.0 / y)
3083 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3084 .addUse(RHS)
3085 .setMIFlags(Flags);
3086 B.buildFMul(Res, LHS, RCP, Flags);
3087
3088 MI.eraseFromParent();
3089 return true;
3090 }
3091
3092 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3093 MachineRegisterInfo &MRI,
3094 MachineIRBuilder &B) const {
3095 Register Res = MI.getOperand(0).getReg();
3096 Register X = MI.getOperand(1).getReg();
3097 Register Y = MI.getOperand(2).getReg();
3098 uint16_t Flags = MI.getFlags();
3099 LLT ResTy = MRI.getType(Res);
3100
3101 const MachineFunction &MF = B.getMF();
3102 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3103 MI.getFlag(MachineInstr::FmAfn);
3104
3105 if (!AllowInaccurateRcp)
3106 return false;
3107
3108 auto NegY = B.buildFNeg(ResTy, Y);
3109 auto One = B.buildFConstant(ResTy, 1.0);
3110
3111 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3112 .addUse(Y)
3113 .setMIFlags(Flags);
3114
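// The FMA chain below is Newton-Raphson refinement of the rcp estimate:
// each step forms the error e = 1 - y*r and then r' = r + r*e = r*(2 - y*r),
// roughly doubling the number of correct bits. After two refinements the
// quotient x*r is corrected once more using the residual x - y*(x*r).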
3115 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3116 R = B.buildFMA(ResTy, Tmp0, R, R);
3117
3118 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3119 R = B.buildFMA(ResTy, Tmp1, R, R);
3120
3121 auto Ret = B.buildFMul(ResTy, X, R);
3122 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3123
3124 B.buildFMA(Res, Tmp2, R, Ret);
3125 MI.eraseFromParent();
3126 return true;
3127 }
3128
3129 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3130 MachineRegisterInfo &MRI,
3131 MachineIRBuilder &B) const {
3132 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3133 return true;
3134
3135 Register Res = MI.getOperand(0).getReg();
3136 Register LHS = MI.getOperand(1).getReg();
3137 Register RHS = MI.getOperand(2).getReg();
3138
3139 uint16_t Flags = MI.getFlags();
3140
3141 LLT S16 = LLT::scalar(16);
3142 LLT S32 = LLT::scalar(32);
3143
3144 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3145 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3146
3147 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3148 .addUse(RHSExt.getReg(0))
3149 .setMIFlags(Flags);
3150
3151 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3152 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3153
3154 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3155 .addUse(RDst.getReg(0))
3156 .addUse(RHS)
3157 .addUse(LHS)
3158 .setMIFlags(Flags);
3159
3160 MI.eraseFromParent();
3161 return true;
3162 }
3163
3164 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3165 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3166 static void toggleSPDenormMode(bool Enable,
3167 MachineIRBuilder &B,
3168 const GCNSubtarget &ST,
3169 AMDGPU::SIModeRegisterDefaults Mode) {
3170 // Set SP denorm mode to this value.
3171 unsigned SPDenormMode =
3172 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3173
3174 if (ST.hasDenormModeInst()) {
3175 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3176 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3177
3178 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3179 B.buildInstr(AMDGPU::S_DENORM_MODE)
3180 .addImm(NewDenormModeValue);
3181
3182 } else {
3183 // Select FP32 bit field in mode register.
3184 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3185 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3186 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
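// This immediate is the s_setreg hwreg(HW_REG_MODE, 4, 2) encoding: start at
// bit 4 of the MODE register with a width of 2, which should select the
// FP_DENORM bits controlling single-precision denormal handling.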
3187
3188 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3189 .addImm(SPDenormMode)
3190 .addImm(SPDenormModeBitField);
3191 }
3192 }
3193
3194 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3195 MachineRegisterInfo &MRI,
3196 MachineIRBuilder &B) const {
3197 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3198 return true;
3199
3200 Register Res = MI.getOperand(0).getReg();
3201 Register LHS = MI.getOperand(1).getReg();
3202 Register RHS = MI.getOperand(2).getReg();
3203 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3204 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3205
3206 uint16_t Flags = MI.getFlags();
3207
3208 LLT S32 = LLT::scalar(32);
3209 LLT S1 = LLT::scalar(1);
3210
3211 auto One = B.buildFConstant(S32, 1.0f);
3212
3213 auto DenominatorScaled =
3214 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3215 .addUse(LHS)
3216 .addUse(RHS)
3217 .addImm(0)
3218 .setMIFlags(Flags);
3219 auto NumeratorScaled =
3220 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3221 .addUse(LHS)
3222 .addUse(RHS)
3223 .addImm(1)
3224 .setMIFlags(Flags);
3225
3226 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3227 .addUse(DenominatorScaled.getReg(0))
3228 .setMIFlags(Flags);
3229 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3230
3231 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3232 // aren't modeled as reading it.
3233 if (!Mode.allFP32Denormals())
3234 toggleSPDenormMode(true, B, ST, Mode);
3235
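// The sequence below is the usual scaled Newton-Raphson division: Fma0 is
// the rcp error 1 - d*r, Fma1 the refined reciprocal, Mul the initial
// quotient n*r, Fma2 the residual n - d*q, Fma3 the refined quotient, and
// Fma4 the final residual handed to div_fmas for the last rounding step.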
3236 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3237 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3238 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3239 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3240 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3241 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3242
3243 if (!Mode.allFP32Denormals())
3244 toggleSPDenormMode(false, B, ST, Mode);
3245
3246 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3247 .addUse(Fma4.getReg(0))
3248 .addUse(Fma1.getReg(0))
3249 .addUse(Fma3.getReg(0))
3250 .addUse(NumeratorScaled.getReg(1))
3251 .setMIFlags(Flags);
3252
3253 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3254 .addUse(Fmas.getReg(0))
3255 .addUse(RHS)
3256 .addUse(LHS)
3257 .setMIFlags(Flags);
3258
3259 MI.eraseFromParent();
3260 return true;
3261 }
3262
3263 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3264 MachineRegisterInfo &MRI,
3265 MachineIRBuilder &B) const {
3266 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3267 return true;
3268
3269 Register Res = MI.getOperand(0).getReg();
3270 Register LHS = MI.getOperand(1).getReg();
3271 Register RHS = MI.getOperand(2).getReg();
3272
3273 uint16_t Flags = MI.getFlags();
3274
3275 LLT S64 = LLT::scalar(64);
3276 LLT S1 = LLT::scalar(1);
3277
3278 auto One = B.buildFConstant(S64, 1.0);
3279
3280 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3281 .addUse(LHS)
3282 .addUse(RHS)
3283 .addImm(0)
3284 .setMIFlags(Flags);
3285
3286 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3287
3288 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3289 .addUse(DivScale0.getReg(0))
3290 .setMIFlags(Flags);
3291
3292 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3293 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3294 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3295
3296 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3297 .addUse(LHS)
3298 .addUse(RHS)
3299 .addImm(1)
3300 .setMIFlags(Flags);
3301
3302 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3303 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3304 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3305
3306 Register Scale;
3307 if (!ST.hasUsableDivScaleConditionOutput()) {
3308 // Work around a hardware bug on SI where the condition output from div_scale
3309 // is not usable.
3310
3311 LLT S32 = LLT::scalar(32);
3312
3313 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3314 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3315 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3316 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3317
3318 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3319 Scale1Unmerge.getReg(1));
3320 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3321 Scale0Unmerge.getReg(1));
3322 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3323 } else {
3324 Scale = DivScale1.getReg(1);
3325 }
3326
3327 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3328 .addUse(Fma4.getReg(0))
3329 .addUse(Fma3.getReg(0))
3330 .addUse(Mul.getReg(0))
3331 .addUse(Scale)
3332 .setMIFlags(Flags);
3333
3334 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3335 .addUse(Fmas.getReg(0))
3336 .addUse(RHS)
3337 .addUse(LHS)
3338 .setMIFlags(Flags);
3339
3340 MI.eraseFromParent();
3341 return true;
3342 }
3343
3344 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3345 MachineRegisterInfo &MRI,
3346 MachineIRBuilder &B) const {
3347 Register Res = MI.getOperand(0).getReg();
3348 Register LHS = MI.getOperand(2).getReg();
3349 Register RHS = MI.getOperand(3).getReg();
3350 uint16_t Flags = MI.getFlags();
3351
3352 LLT S32 = LLT::scalar(32);
3353 LLT S1 = LLT::scalar(1);
3354
3355 auto Abs = B.buildFAbs(S32, RHS, Flags);
3356 const APFloat C0Val(1.0f);
3357
3358 auto C0 = B.buildConstant(S32, 0x6f800000);
3359 auto C1 = B.buildConstant(S32, 0x2f800000);
3360 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
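// The magic constants are IEEE single bit patterns: 0x6f800000 is 0x1.0p+96
// and 0x2f800000 is 0x1.0p-32. When |RHS| exceeds 2^96 the divisor is
// pre-scaled by 2^-32 so the rcp stays in range, and the same scale factor
// is reapplied to the final product below.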
3361
3362 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3363 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3364
3365 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3366
3367 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3368 .addUse(Mul0.getReg(0))
3369 .setMIFlags(Flags);
3370
3371 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3372
3373 B.buildFMul(Res, Sel, Mul1, Flags);
3374
3375 MI.eraseFromParent();
3376 return true;
3377 }
3378
3379 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3380 // FIXME: Why do we handle this one but not other removed instructions?
3381 //
3382 // Reciprocal square root. The clamp prevents infinite results, clamping
3383 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3384 // +-max_float.
3385 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3386 MachineRegisterInfo &MRI,
3387 MachineIRBuilder &B) const {
3388 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3389 return true;
3390
3391 Register Dst = MI.getOperand(0).getReg();
3392 Register Src = MI.getOperand(2).getReg();
3393 auto Flags = MI.getFlags();
3394
3395 LLT Ty = MRI.getType(Dst);
3396
3397 const fltSemantics *FltSemantics;
3398 if (Ty == LLT::scalar(32))
3399 FltSemantics = &APFloat::IEEEsingle();
3400 else if (Ty == LLT::scalar(64))
3401 FltSemantics = &APFloat::IEEEdouble();
3402 else
3403 return false;
3404
3405 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3406 .addUse(Src)
3407 .setMIFlags(Flags);
3408
3409 // We don't need to concern ourselves with the snan handling difference, since
3410 // the rsq already quieted (or not); use the min/max form that selects directly.
3411 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3412 const bool UseIEEE = MFI->getMode().IEEE;
3413
3414 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3415 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3416 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3417
3418 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3419
3420 if (UseIEEE)
3421 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3422 else
3423 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3424 MI.eraseFromParent();
3425 return true;
3426 }
3427
3428 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3429 switch (IID) {
3430 case Intrinsic::amdgcn_ds_fadd:
3431 return AMDGPU::G_ATOMICRMW_FADD;
3432 case Intrinsic::amdgcn_ds_fmin:
3433 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3434 case Intrinsic::amdgcn_ds_fmax:
3435 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3436 default:
3437 llvm_unreachable("not a DS FP intrinsic");
3438 }
3439 }
3440
3441 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3442 MachineInstr &MI,
3443 Intrinsic::ID IID) const {
3444 GISelChangeObserver &Observer = Helper.Observer;
3445 Observer.changingInstr(MI);
3446
3447 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3448
3449 // The remaining operands were used to set fields in the MemOperand on
3450 // construction.
3451 for (int I = 6; I > 3; --I)
3452 MI.RemoveOperand(I);
3453
3454 MI.RemoveOperand(1); // Remove the intrinsic ID.
3455 Observer.changedInstr(MI);
3456 return true;
3457 }
3458
3459 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3460 MachineRegisterInfo &MRI,
3461 MachineIRBuilder &B) const {
3462 uint64_t Offset =
3463 ST.getTargetLowering()->getImplicitParameterOffset(
3464 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3465 LLT DstTy = MRI.getType(DstReg);
3466 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3467
3468 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3469 if (!loadInputValue(KernargPtrReg, B,
3470 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3471 return false;
3472
3473 // FIXME: This should be nuw
3474 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3475 return true;
3476 }
3477
3478 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3479 MachineRegisterInfo &MRI,
3480 MachineIRBuilder &B) const {
3481 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3482 if (!MFI->isEntryFunction()) {
3483 return legalizePreloadedArgIntrin(MI, MRI, B,
3484 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3485 }
3486
3487 Register DstReg = MI.getOperand(0).getReg();
3488 if (!getImplicitArgPtr(DstReg, MRI, B))
3489 return false;
3490
3491 MI.eraseFromParent();
3492 return true;
3493 }
3494
3495 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3496 MachineRegisterInfo &MRI,
3497 MachineIRBuilder &B,
3498 unsigned AddrSpace) const {
3499 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3500 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3501 Register Hi32 = Unmerge.getReg(1);
3502
3503 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3504 MI.eraseFromParent();
3505 return true;
3506 }
3507
3508 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3509 // offset (the offset that is included in bounds checking and swizzling, to be
3510 // split between the instruction's voffset and immoffset fields) and soffset
3511 // (the offset that is excluded from bounds checking and swizzling, to go in
3512 // the instruction's soffset field). This function takes the first kind of
3513 // offset and figures out how to split it between voffset and immoffset.
3514 std::tuple<Register, unsigned, unsigned>
3515 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3516 Register OrigOffset) const {
3517 const unsigned MaxImm = 4095;
3518 Register BaseReg;
3519 unsigned TotalConstOffset;
3520 const LLT S32 = LLT::scalar(32);
3521
3522 std::tie(BaseReg, TotalConstOffset) =
3523 AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3524
3525 unsigned ImmOffset = TotalConstOffset;
3526
3527 // If the immediate value is too big for the immoffset field, put only its low
3528 // 12 bits (value & 4095) into the immoffset field so that the value that is
3529 // copied/added for the voffset field is a multiple of 4096, and it stands more
3530 // chance of being CSEd with the copy/add for another similar load/store.
3531 // However, do not do that rounding down to a multiple of 4096 if that is a
3532 // negative number, as it appears to be illegal to have a negative offset
3533 // in the vgpr, even if adding the immediate offset makes it positive.
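// Worked example: a constant offset of 4100 splits into ImmOffset = 4 with
// 4096 carried over into the voffset register; if the split would leave a
// negative multiple of 4096, everything is instead carried in voffset.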
3534 unsigned Overflow = ImmOffset & ~MaxImm;
3535 ImmOffset -= Overflow;
3536 if ((int32_t)Overflow < 0) {
3537 Overflow += ImmOffset;
3538 ImmOffset = 0;
3539 }
3540
3541 if (Overflow != 0) {
3542 if (!BaseReg) {
3543 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3544 } else {
3545 auto OverflowVal = B.buildConstant(S32, Overflow);
3546 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3547 }
3548 }
3549
3550 if (!BaseReg)
3551 BaseReg = B.buildConstant(S32, 0).getReg(0);
3552
3553 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3554 }
3555
3556 /// Handle register layout difference for f16 images for some subtargets.
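/// For example, on subtargets with unpacked D16 VMEM a <4 x s16> value is
/// rewritten as <4 x s32>, each element any-extended into the low 16 bits of
/// a 32-bit register, while packed subtargets can keep the value as-is.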
3557 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3558 MachineRegisterInfo &MRI,
3559 Register Reg,
3560 bool ImageStore) const {
3561 const LLT S16 = LLT::scalar(16);
3562 const LLT S32 = LLT::scalar(32);
3563 LLT StoreVT = MRI.getType(Reg);
3564 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3565
3566 if (ST.hasUnpackedD16VMem()) {
3567 auto Unmerge = B.buildUnmerge(S16, Reg);
3568
3569 SmallVector<Register, 4> WideRegs;
3570 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3571 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3572
3573 int NumElts = StoreVT.getNumElements();
3574
3575 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3576 }
3577
3578 if (ImageStore && ST.hasImageStoreD16Bug()) {
3579 if (StoreVT.getNumElements() == 2) {
3580 SmallVector<Register, 4> PackedRegs;
3581 Reg = B.buildBitcast(S32, Reg).getReg(0);
3582 PackedRegs.push_back(Reg);
3583 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3584 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3585 }
3586
3587 if (StoreVT.getNumElements() == 3) {
3588 SmallVector<Register, 4> PackedRegs;
3589 auto Unmerge = B.buildUnmerge(S16, Reg);
3590 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3591 PackedRegs.push_back(Unmerge.getReg(I));
3592 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3593 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3594 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3595 }
3596
3597 if (StoreVT.getNumElements() == 4) {
3598 SmallVector<Register, 4> PackedRegs;
3599 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3600 auto Unmerge = B.buildUnmerge(S32, Reg);
3601 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3602 PackedRegs.push_back(Unmerge.getReg(I));
3603 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3604 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3605 }
3606
3607 llvm_unreachable("invalid data type");
3608 }
3609
3610 return Reg;
3611 }
3612
3613 Register AMDGPULegalizerInfo::fixStoreSourceType(
3614 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3615 MachineRegisterInfo *MRI = B.getMRI();
3616 LLT Ty = MRI->getType(VData);
3617
3618 const LLT S16 = LLT::scalar(16);
3619
3620 // Fixup illegal register types for i8 stores.
3621 if (Ty == LLT::scalar(8) || Ty == S16) {
3622 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3623 return AnyExt;
3624 }
3625
3626 if (Ty.isVector()) {
3627 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3628 if (IsFormat)
3629 return handleD16VData(B, *MRI, VData);
3630 }
3631 }
3632
3633 return VData;
3634 }
3635
3636 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3637 MachineRegisterInfo &MRI,
3638 MachineIRBuilder &B,
3639 bool IsTyped,
3640 bool IsFormat) const {
3641 Register VData = MI.getOperand(1).getReg();
3642 LLT Ty = MRI.getType(VData);
3643 LLT EltTy = Ty.getScalarType();
3644 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3645 const LLT S32 = LLT::scalar(32);
3646
3647 VData = fixStoreSourceType(B, VData, IsFormat);
3648 Register RSrc = MI.getOperand(2).getReg();
3649
3650 MachineMemOperand *MMO = *MI.memoperands_begin();
3651 const int MemSize = MMO->getSize();
3652
3653 unsigned ImmOffset;
3654 unsigned TotalOffset;
3655
3656 // The typed intrinsics add an immediate after the registers.
3657 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3658
3659 // The struct intrinsic variants add one additional operand over raw.
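// Concretely, after the intrinsic ID the raw layout is (vdata, rsrc, voffset,
// soffset, [format,] aux), and the struct layout inserts vindex between rsrc
// and voffset.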
3660 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3661 Register VIndex;
3662 int OpOffset = 0;
3663 if (HasVIndex) {
3664 VIndex = MI.getOperand(3).getReg();
3665 OpOffset = 1;
3666 }
3667
3668 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3669 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3670
3671 unsigned Format = 0;
3672 if (IsTyped) {
3673 Format = MI.getOperand(5 + OpOffset).getImm();
3674 ++OpOffset;
3675 }
3676
3677 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3678
3679 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3680 if (TotalOffset != 0)
3681 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3682
3683 unsigned Opc;
3684 if (IsTyped) {
3685 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3686 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3687 } else if (IsFormat) {
3688 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3689 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3690 } else {
3691 switch (MemSize) {
3692 case 1:
3693 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3694 break;
3695 case 2:
3696 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3697 break;
3698 default:
3699 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3700 break;
3701 }
3702 }
3703
3704 if (!VIndex)
3705 VIndex = B.buildConstant(S32, 0).getReg(0);
3706
3707 auto MIB = B.buildInstr(Opc)
3708 .addUse(VData) // vdata
3709 .addUse(RSrc) // rsrc
3710 .addUse(VIndex) // vindex
3711 .addUse(VOffset) // voffset
3712 .addUse(SOffset) // soffset
3713 .addImm(ImmOffset); // offset(imm)
3714
3715 if (IsTyped)
3716 MIB.addImm(Format);
3717
3718 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3719 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3720 .addMemOperand(MMO);
3721
3722 MI.eraseFromParent();
3723 return true;
3724 }
3725
3726 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3727 MachineRegisterInfo &MRI,
3728 MachineIRBuilder &B,
3729 bool IsFormat,
3730 bool IsTyped) const {
3731 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3732 MachineMemOperand *MMO = *MI.memoperands_begin();
3733 const int MemSize = MMO->getSize();
3734 const LLT S32 = LLT::scalar(32);
3735
3736 Register Dst = MI.getOperand(0).getReg();
3737 Register RSrc = MI.getOperand(2).getReg();
3738
3739 // The typed intrinsics add an immediate after the registers.
3740 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3741
3742 // The struct intrinsic variants add one additional operand over raw.
3743 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3744 Register VIndex;
3745 int OpOffset = 0;
3746 if (HasVIndex) {
3747 VIndex = MI.getOperand(3).getReg();
3748 OpOffset = 1;
3749 }
3750
3751 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3752 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3753
3754 unsigned Format = 0;
3755 if (IsTyped) {
3756 Format = MI.getOperand(5 + OpOffset).getImm();
3757 ++OpOffset;
3758 }
3759
3760 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3761 unsigned ImmOffset;
3762 unsigned TotalOffset;
3763
3764 LLT Ty = MRI.getType(Dst);
3765 LLT EltTy = Ty.getScalarType();
3766 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3767 const bool Unpacked = ST.hasUnpackedD16VMem();
3768
3769 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3770 if (TotalOffset != 0)
3771 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3772
3773 unsigned Opc;
3774
3775 if (IsTyped) {
3776 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3777 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3778 } else if (IsFormat) {
3779 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3780 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3781 } else {
3782 switch (MemSize) {
3783 case 1:
3784 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3785 break;
3786 case 2:
3787 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3788 break;
3789 default:
3790 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3791 break;
3792 }
3793 }
3794
3795 Register LoadDstReg;
3796
3797 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3798 LLT UnpackedTy = Ty.changeElementSize(32);
3799
3800 if (IsExtLoad)
3801 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3802 else if (Unpacked && IsD16 && Ty.isVector())
3803 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3804 else
3805 LoadDstReg = Dst;
3806
3807 if (!VIndex)
3808 VIndex = B.buildConstant(S32, 0).getReg(0);
3809
3810 auto MIB = B.buildInstr(Opc)
3811 .addDef(LoadDstReg) // vdata
3812 .addUse(RSrc) // rsrc
3813 .addUse(VIndex) // vindex
3814 .addUse(VOffset) // voffset
3815 .addUse(SOffset) // soffset
3816 .addImm(ImmOffset); // offset(imm)
3817
3818 if (IsTyped)
3819 MIB.addImm(Format);
3820
3821 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3822 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3823 .addMemOperand(MMO);
3824
3825 if (LoadDstReg != Dst) {
3826 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3827
3828 // The result register was widened for the extending load; truncate it back.
3829 if (IsExtLoad)
3830 B.buildTrunc(Dst, LoadDstReg);
3831 else {
3832 // Repack to original 16-bit vector result
3833 // FIXME: G_TRUNC should work, but legalization currently fails
3834 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3835 SmallVector<Register, 4> Repack;
3836 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3837 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3838 B.buildMerge(Dst, Repack);
3839 }
3840 }
3841
3842 MI.eraseFromParent();
3843 return true;
3844 }
3845
3846 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3847 MachineIRBuilder &B,
3848 bool IsInc) const {
3849 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3850 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3851 B.buildInstr(Opc)
3852 .addDef(MI.getOperand(0).getReg())
3853 .addUse(MI.getOperand(2).getReg())
3854 .addUse(MI.getOperand(3).getReg())
3855 .cloneMemRefs(MI);
3856 MI.eraseFromParent();
3857 return true;
3858 }
3859
3860 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3861 switch (IntrID) {
3862 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3863 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3864 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3865 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3866 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3867 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3868 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3869 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3870 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3871 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3872 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3873 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3874 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3875 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3876 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3877 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3878 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3879 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3880 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3881 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3882 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3883 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3884 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3886 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3887 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3888 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3889 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3890 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3891 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3892 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3893 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3894 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3895 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3896 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3898 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3899 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3901 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3902 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3903 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3904 default:
3905 llvm_unreachable("unhandled atomic opcode");
3906 }
3907 }
3908
3909 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3910 MachineIRBuilder &B,
3911 Intrinsic::ID IID) const {
3912 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3913 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3914 const bool HasReturn = MI.getNumExplicitDefs() != 0;
3915
3916 Register Dst;
3917
3918 int OpOffset = 0;
3919 if (HasReturn) {
3920 // A few FP atomics do not support return values.
3921 Dst = MI.getOperand(0).getReg();
3922 } else {
3923 OpOffset = -1;
3924 }
3925
3926 Register VData = MI.getOperand(2 + OpOffset).getReg();
3927 Register CmpVal;
3928
3929 if (IsCmpSwap) {
3930 CmpVal = MI.getOperand(3 + OpOffset).getReg();
3931 ++OpOffset;
3932 }
3933
3934 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3935 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3936
3937 // The struct intrinsic variants add one additional operand over raw.
3938 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3939 Register VIndex;
3940 if (HasVIndex) {
3941 VIndex = MI.getOperand(4 + OpOffset).getReg();
3942 ++OpOffset;
3943 }
3944
3945 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3946 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3947 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3948
3949 MachineMemOperand *MMO = *MI.memoperands_begin();
3950
3951 unsigned ImmOffset;
3952 unsigned TotalOffset;
3953 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3954 if (TotalOffset != 0)
3955 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3956
3957 if (!VIndex)
3958 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3959
3960 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3961
3962 if (HasReturn)
3963 MIB.addDef(Dst);
3964
3965 MIB.addUse(VData); // vdata
3966
3967 if (IsCmpSwap)
3968 MIB.addReg(CmpVal);
3969
3970 MIB.addUse(RSrc) // rsrc
3971 .addUse(VIndex) // vindex
3972 .addUse(VOffset) // voffset
3973 .addUse(SOffset) // soffset
3974 .addImm(ImmOffset) // offset(imm)
3975 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3976 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3977 .addMemOperand(MMO);
3978
3979 MI.eraseFromParent();
3980 return true;
3981 }
3982
3983 /// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
3984 /// vectors, appending the results to \p PackedAddrs.
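/// For example, two consecutive s16 coordinates (u, v) are packed into one
/// <2 x s16> value, and an odd trailing coordinate or gradient is paired with
/// an undef s16 so that each packed value still fills a full dword.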
3985 static void packImageA16AddressToDwords(
3986 MachineIRBuilder &B, MachineInstr &MI,
3987 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3988 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
3989 const LLT S16 = LLT::scalar(16);
3990 const LLT V2S16 = LLT::vector(2, 16);
3991
3992 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3993 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
3994 if (!SrcOp.isReg())
3995 continue; // _L to _LZ may have eliminated this.
3996
3997 Register AddrReg = SrcOp.getReg();
3998
3999 if (I < Intr->GradientStart) {
4000 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4001 PackedAddrs.push_back(AddrReg);
4002 } else {
4003 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4004 // derivatives dx/dh and dx/dv are packed with undef.
4005 if (((I + 1) >= EndIdx) ||
4006 ((Intr->NumGradients / 2) % 2 == 1 &&
4007 (I == static_cast<unsigned>(Intr->GradientStart +
4008 (Intr->NumGradients / 2) - 1) ||
4009 I == static_cast<unsigned>(Intr->GradientStart +
4010 Intr->NumGradients - 1))) ||
4011 // Check for _L to _LZ optimization
4012 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4013 PackedAddrs.push_back(
4014 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4015 .getReg(0));
4016 } else {
4017 PackedAddrs.push_back(
4018 B.buildBuildVector(
4019 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4020 .getReg(0));
4021 ++I;
4022 }
4023 }
4024 }
4025 }
4026
4027 /// Convert from separate vaddr components to a single vector address register,
4028 /// and replace the remaining operands with $noreg.
4029 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4030 int DimIdx, int NumVAddrs) {
4031 const LLT S32 = LLT::scalar(32);
4032
4033 SmallVector<Register, 8> AddrRegs;
4034 for (int I = 0; I != NumVAddrs; ++I) {
4035 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4036 if (SrcOp.isReg()) {
4037 AddrRegs.push_back(SrcOp.getReg());
4038 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4039 }
4040 }
4041
4042 int NumAddrRegs = AddrRegs.size();
4043 if (NumAddrRegs != 1) {
4044 // Round up to 8 elements for v5-v7
4045 // FIXME: Missing intermediate sized register classes and instructions.
4046 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4047 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4048 auto Undef = B.buildUndef(S32);
4049 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4050 NumAddrRegs = RoundedNumRegs;
4051 }
4052
4053 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4054 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4055 }
4056
4057 for (int I = 1; I != NumVAddrs; ++I) {
4058 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4059 if (SrcOp.isReg())
4060 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4061 }
4062 }
4063
4064 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4065 ///
4066 /// Depending on the subtarget, load/store with 16-bit element data need to be
4067 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4068 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4069 /// registers.
4070 ///
4071 /// We don't want to directly select image instructions just yet, but also want
4072 /// to expose all register repacking to the legalizer/combiners. We also don't
4073 /// want a selected instruction entering RegBankSelect. In order to avoid
4074 /// defining a multitude of intermediate image instructions, directly hack on
4075 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4076 /// now unnecessary arguments with $noreg.
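/// As an illustration, a d16 load returning <4 x s16> on a subtarget with
/// unpacked D16 VMEM is rewritten to produce <4 x s32> and truncated back,
/// while a TFE load grows the result by one dword that is later split off
/// into the status result register.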
4077 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4078 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4079 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4080
4081 const unsigned NumDefs = MI.getNumExplicitDefs();
4082 const unsigned ArgOffset = NumDefs + 1;
4083 bool IsTFE = NumDefs == 2;
4084 // We are only processing the operands of d16 image operations on subtargets
4085 // that use the unpacked register layout, or need to repack the TFE result.
4086
4087 // TODO: Do we need to guard against already legalized intrinsics?
4088 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4089 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4090
4091 MachineRegisterInfo *MRI = B.getMRI();
4092 const LLT S32 = LLT::scalar(32);
4093 const LLT S16 = LLT::scalar(16);
4094 const LLT V2S16 = LLT::vector(2, 16);
4095
4096 unsigned DMask = 0;
4097
4098 // Check for 16 bit addresses and pack if true.
4099 LLT GradTy =
4100 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4101 LLT AddrTy =
4102 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4103 const bool IsG16 = GradTy == S16;
4104 const bool IsA16 = AddrTy == S16;
4105
4106 int DMaskLanes = 0;
4107 if (!BaseOpcode->Atomic) {
4108 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4109 if (BaseOpcode->Gather4) {
4110 DMaskLanes = 4;
4111 } else if (DMask != 0) {
4112 DMaskLanes = countPopulation(DMask);
4113 } else if (!IsTFE && !BaseOpcode->Store) {
4114 // If dmask is 0, this is a no-op load. This can be eliminated.
4115 B.buildUndef(MI.getOperand(0));
4116 MI.eraseFromParent();
4117 return true;
4118 }
4119 }
4120
4121 Observer.changingInstr(MI);
4122 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4123
4124 unsigned NewOpcode = NumDefs == 0 ?
4125 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4126
4127 // Track that we legalized this
4128 MI.setDesc(B.getTII().get(NewOpcode));
4129
4130 // Expecting to get an error flag since TFC is on and dmask is 0. Force
4131 // dmask to be at least 1, otherwise the instruction will fail.
4132 if (IsTFE && DMask == 0) {
4133 DMask = 0x1;
4134 DMaskLanes = 1;
4135 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4136 }
4137
4138 if (BaseOpcode->Atomic) {
4139 Register VData0 = MI.getOperand(2).getReg();
4140 LLT Ty = MRI->getType(VData0);
4141
4142 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4143 if (Ty.isVector())
4144 return false;
4145
4146 if (BaseOpcode->AtomicX2) {
4147 Register VData1 = MI.getOperand(3).getReg();
4148 // The two values are packed in one register.
4149 LLT PackedTy = LLT::vector(2, Ty);
4150 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4151 MI.getOperand(2).setReg(Concat.getReg(0));
4152 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4153 }
4154 }
4155
4156 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4157
4158 // Optimize _L to _LZ when _L is zero
4159 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4160 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4161 const ConstantFP *ConstantLod;
4162
4163 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4164 m_GFCst(ConstantLod))) {
4165 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4166 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4167 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4168 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4169 Intr->Dim);
4170
4171 // The starting indexes should remain in the same place.
4172 --CorrectedNumVAddrs;
4173
4174 MI.getOperand(MI.getNumExplicitDefs())
4175 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4176 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4177 Intr = NewImageDimIntr;
4178 }
4179 }
4180 }
4181
4182 // Optimize _mip away, when 'lod' is zero
4183 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4184 int64_t ConstantLod;
4185 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4186 m_ICst(ConstantLod))) {
4187 if (ConstantLod == 0) {
4188 // TODO: Change intrinsic opcode and remove the operand instead of replacing
4189 // it with 0, as is done in the _L to _LZ handling above.
4190 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4191 --CorrectedNumVAddrs;
4192 }
4193 }
4194 }
4195
4196 // Rewrite the addressing register layout before doing anything else.
4197 if (IsA16 || IsG16) {
4198 if (IsA16) {
4199 // Target must support the feature and gradients need to be 16 bit too
4200 if (!ST.hasA16() || !IsG16)
4201 return false;
4202 } else if (!ST.hasG16())
4203 return false;
4204
4205 if (Intr->NumVAddrs > 1) {
4206 SmallVector<Register, 4> PackedRegs;
4207 // Don't compress addresses for G16
4208 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4209 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4210 PackEndIdx);
4211
4212 if (!IsA16) {
4213 // Add uncompressed address
4214 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4215 Register AddrReg = MI.getOperand(ArgOffset + I).getReg();
4216 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4217 PackedRegs.push_back(AddrReg);
4218 }
4219 }
4220
4221 // See also below in the non-a16 branch
4222 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4223
4224 if (!UseNSA && PackedRegs.size() > 1) {
4225 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4226 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4227 PackedRegs[0] = Concat.getReg(0);
4228 PackedRegs.resize(1);
4229 }
4230
4231 const unsigned NumPacked = PackedRegs.size();
4232 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4233 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4234 if (!SrcOp.isReg()) {
4235 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4236 continue;
4237 }
4238
4239 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4240
4241 if (I - Intr->VAddrStart < NumPacked)
4242 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4243 else
4244 SrcOp.setReg(AMDGPU::NoRegister);
4245 }
4246 }
4247 } else {
4248 // If the register allocator cannot place the address registers contiguously
4249 // without introducing moves, then using the non-sequential address encoding
4250 // is always preferable, since it saves VALU instructions and is usually a
4251 // wash in terms of code size or even better.
4252 //
4253 // However, we currently have no way of hinting to the register allocator
4254 // that MIMG addresses should be placed contiguously when it is possible to
4255 // do so, so force non-NSA for the common 2-address case as a heuristic.
4256 //
4257 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4258 // allocation when possible.
4259 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4260
4261 if (!UseNSA && Intr->NumVAddrs > 1)
4262 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4263 Intr->NumVAddrs);
4264 }
4265
4266 int Flags = 0;
4267 if (IsA16)
4268 Flags |= 1;
4269 if (IsG16)
4270 Flags |= 2;
4271 MI.addOperand(MachineOperand::CreateImm(Flags));
4272
4273 if (BaseOpcode->Store) { // No TFE for stores?
4274 // TODO: Handle dmask trim
4275 Register VData = MI.getOperand(1).getReg();
4276 LLT Ty = MRI->getType(VData);
4277 if (!Ty.isVector() || Ty.getElementType() != S16)
4278 return true;
4279
4280 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4281 if (RepackedReg != VData) {
4282 MI.getOperand(1).setReg(RepackedReg);
4283 }
4284
4285 return true;
4286 }
4287
4288 Register DstReg = MI.getOperand(0).getReg();
4289 LLT Ty = MRI->getType(DstReg);
4290 const LLT EltTy = Ty.getScalarType();
4291 const bool IsD16 = Ty.getScalarType() == S16;
4292 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4293
4294 // Confirm that the return type is large enough for the dmask specified
4295 if (NumElts < DMaskLanes)
4296 return false;
4297
4298 if (NumElts > 4 || DMaskLanes > 4)
4299 return false;
4300
4301 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4302 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4303
4304 // The raw dword aligned data component of the load. The only legal cases
4305 // where this matters should be when using the packed D16 format, for
4306 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4307 LLT RoundedTy;
4308
4309 // S32 vector to cover all data, plus the TFE result element.
4310 LLT TFETy;
4311
4312 // Register type to use for each loaded component. Will be S32 or V2S16.
4313 LLT RegTy;
4314
4315 if (IsD16 && ST.hasUnpackedD16VMem()) {
4316 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4317 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4318 RegTy = S32;
4319 } else {
4320 unsigned EltSize = EltTy.getSizeInBits();
4321 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4322 unsigned RoundedSize = 32 * RoundedElts;
4323 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4324 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4325 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4326 }
4327
4328 // The return type does not need adjustment.
4329 // TODO: Should we change s16 case to s32 or <2 x s16>?
4330 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4331 return true;
4332
4333 Register Dst1Reg;
4334
4335 // Insert after the instruction.
4336 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4337
4338 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4339 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4340 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4341 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4342
4343 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4344
4345 MI.getOperand(0).setReg(NewResultReg);
4346
4347 // In the IR, TFE is supposed to be used with a 2 element struct return
4348 // type. The instruction really returns these two values in one contiguous
4349 // register, with one additional dword beyond the loaded data. Rewrite the
4350 // return type to use a single register result.
4351
4352 if (IsTFE) {
4353 Dst1Reg = MI.getOperand(1).getReg();
4354 if (MRI->getType(Dst1Reg) != S32)
4355 return false;
4356
4357 // TODO: Make sure the TFE operand bit is set.
4358 MI.RemoveOperand(1);
4359
4360 // Handle the easy case that requires no repack instructions.
4361 if (Ty == S32) {
4362 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4363 return true;
4364 }
4365 }
4366
4367 // Now figure out how to copy the new result register back into the old
4368 // result.
4369 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4370
4371 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4372
4373 if (ResultNumRegs == 1) {
4374 assert(!IsTFE);
4375 ResultRegs[0] = NewResultReg;
4376 } else {
4377 // We have to repack into a new vector of some kind.
4378 for (int I = 0; I != NumDataRegs; ++I)
4379 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4380 B.buildUnmerge(ResultRegs, NewResultReg);
4381
4382 // Drop the final TFE element to get the data part. The TFE result is
4383 // directly written to the right place already.
4384 if (IsTFE)
4385 ResultRegs.resize(NumDataRegs);
4386 }
4387
4388 // For an s16 scalar result, we form an s32 result with a truncate regardless
4389 // of packed vs. unpacked.
4390 if (IsD16 && !Ty.isVector()) {
4391 B.buildTrunc(DstReg, ResultRegs[0]);
4392 return true;
4393 }
4394
4395 // Avoid a build/concat_vector of 1 entry.
4396 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4397 B.buildBitcast(DstReg, ResultRegs[0]);
4398 return true;
4399 }
4400
4401 assert(Ty.isVector());
4402
4403 if (IsD16) {
4404 // For packed D16 results with TFE enabled, all the data components are
4405 // S32. Cast back to the expected type.
4406 //
4407 // TODO: We don't really need to load s32 elements. We would only need one
4408 // cast for the TFE result if a multiple of v2s16 was used.
4409 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4410 for (Register &Reg : ResultRegs)
4411 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4412 } else if (ST.hasUnpackedD16VMem()) {
4413 for (Register &Reg : ResultRegs)
4414 Reg = B.buildTrunc(S16, Reg).getReg(0);
4415 }
4416 }
4417
4418 auto padWithUndef = [&](LLT Ty, int NumElts) {
4419 if (NumElts == 0)
4420 return;
4421 Register Undef = B.buildUndef(Ty).getReg(0);
4422 for (int I = 0; I != NumElts; ++I)
4423 ResultRegs.push_back(Undef);
4424 };
4425
4426 // Pad out any elements eliminated due to the dmask.
4427 LLT ResTy = MRI->getType(ResultRegs[0]);
4428 if (!ResTy.isVector()) {
4429 padWithUndef(ResTy, NumElts - ResultRegs.size());
4430 B.buildBuildVector(DstReg, ResultRegs);
4431 return true;
4432 }
4433
4434 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4435 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4436
4437 // Deal with the one annoying legal case.
4438 const LLT V3S16 = LLT::vector(3, 16);
4439 if (Ty == V3S16) {
4440 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4441 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4442 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4443 return true;
4444 }
4445
4446 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4447 B.buildConcatVectors(DstReg, ResultRegs);
4448 return true;
4449 }
4450
4451 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4452 LegalizerHelper &Helper, MachineInstr &MI) const {
4453 MachineIRBuilder &B = Helper.MIRBuilder;
4454 GISelChangeObserver &Observer = Helper.Observer;
4455
4456 Register Dst = MI.getOperand(0).getReg();
4457 LLT Ty = B.getMRI()->getType(Dst);
4458 unsigned Size = Ty.getSizeInBits();
4459 MachineFunction &MF = B.getMF();
4460
4461 Observer.changingInstr(MI);
4462
4463 if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4464 Ty = getBitcastRegisterType(Ty);
4465 Helper.bitcastDst(MI, Ty, 0);
4466 Dst = MI.getOperand(0).getReg();
4467 B.setInsertPt(B.getMBB(), MI);
4468 }
4469
4470 // FIXME: We don't really need this intermediate instruction. The intrinsic
4471 // should be fixed to have a memory operand. Since it's readnone, we're not
4472 // allowed to add one.
4473 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4474 MI.RemoveOperand(1); // Remove intrinsic ID
4475
4476 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4477 // TODO: Should this use datalayout alignment?
4478 const unsigned MemSize = (Size + 7) / 8;
4479 const Align MemAlign(4);
4480 MachineMemOperand *MMO = MF.getMachineMemOperand(
4481 MachinePointerInfo(),
4482 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4483 MachineMemOperand::MOInvariant,
4484 MemSize, MemAlign);
4485 MI.addMemOperand(MF, MMO);
4486
4487 // There are no 96-bit result scalar loads, but widening to 128-bit should
4488 // always be legal. We may need to restore this to a 96-bit result if it turns
4489 // out this needs to be converted to a vector load during RegBankSelect.
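// For example, an s96 result becomes s128 and a <3 x s32> result becomes
// <4 x s32>.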
4490 if (!isPowerOf2_32(Size)) {
4491 if (Ty.isVector())
4492 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4493 else
4494 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4495 }
4496
4497 Observer.changedInstr(MI);
4498 return true;
4499 }
4500
4501 // TODO: Move to selection
4502 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4503 MachineRegisterInfo &MRI,
4504 MachineIRBuilder &B) const {
4505 // On a non-HSA path, or if the trap handler is disabled, insert an s_endpgm instruction.
4506 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4507 !ST.isTrapHandlerEnabled()) {
4508 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4509 } else {
4510 // Pass queue pointer to trap handler as input, and insert trap instruction
4511 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4512 MachineRegisterInfo &MRI = *B.getMRI();
4513
4514 Register LiveIn =
4515 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4516 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4517 return false;
4518
4519 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4520 B.buildCopy(SGPR01, LiveIn);
4521 B.buildInstr(AMDGPU::S_TRAP)
4522 .addImm(GCNSubtarget::TrapIDLLVMTrap)
4523 .addReg(SGPR01, RegState::Implicit);
4524 }
4525
4526 MI.eraseFromParent();
4527 return true;
4528 }
4529
4530 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4531 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4532 // On a non-HSA path, or if the trap handler is disabled, report a warning
4533 // accordingly.
4534 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4535 !ST.isTrapHandlerEnabled()) {
4536 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4537 "debugtrap handler not supported",
4538 MI.getDebugLoc(), DS_Warning);
4539 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4540 Ctx.diagnose(NoTrap);
4541 } else {
4542 // Insert debug-trap instruction
4543 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4544 }
4545
4546 MI.eraseFromParent();
4547 return true;
4548 }
4549
4550 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4551 MachineIRBuilder &B) const {
4552 MachineRegisterInfo &MRI = *B.getMRI();
4553 const LLT S16 = LLT::scalar(16);
4554 const LLT S32 = LLT::scalar(32);
4555
4556 Register DstReg = MI.getOperand(0).getReg();
4557 Register NodePtr = MI.getOperand(2).getReg();
4558 Register RayExtent = MI.getOperand(3).getReg();
4559 Register RayOrigin = MI.getOperand(4).getReg();
4560 Register RayDir = MI.getOperand(5).getReg();
4561 Register RayInvDir = MI.getOperand(6).getReg();
4562 Register TDescr = MI.getOperand(7).getReg();
4563
4564 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4565 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4566 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4567 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4568 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4569 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4570
4571 SmallVector<Register, 12> Ops;
4572 if (Is64) {
4573 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4574 Ops.push_back(Unmerge.getReg(0));
4575 Ops.push_back(Unmerge.getReg(1));
4576 } else {
4577 Ops.push_back(NodePtr);
4578 }
4579 Ops.push_back(RayExtent);
4580
4581 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4582 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4583 Ops.push_back(Unmerge.getReg(0));
4584 Ops.push_back(Unmerge.getReg(1));
4585 Ops.push_back(Unmerge.getReg(2));
4586 };
4587
4588 packLanes(RayOrigin);
4589 if (IsA16) {
4590 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4591 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4592 Register R1 = MRI.createGenericVirtualRegister(S32);
4593 Register R2 = MRI.createGenericVirtualRegister(S32);
4594 Register R3 = MRI.createGenericVirtualRegister(S32);
4595 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4596 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4597 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4598 Ops.push_back(R1);
4599 Ops.push_back(R2);
4600 Ops.push_back(R3);
4601 } else {
4602 packLanes(RayDir);
4603 packLanes(RayInvDir);
4604 }
4605
4606 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4607 .addDef(DstReg)
4608 .addImm(Opcode);
4609
4610 for (Register R : Ops) {
4611 MIB.addUse(R);
4612 }
4613
4614 MIB.addUse(TDescr)
4615 .addImm(IsA16 ? 1 : 0)
4616 .cloneMemRefs(MI);
4617
4618 MI.eraseFromParent();
4619 return true;
4620 }
4621
4622 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4623 MachineInstr &MI) const {
4624 MachineIRBuilder &B = Helper.MIRBuilder;
4625 MachineRegisterInfo &MRI = *B.getMRI();
4626
4627 // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
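    // verifyCFIntrinsic finds the G_BRCOND (and optional trailing G_BR) that
    // consumes the intrinsic's boolean result; if the expected pattern is not
    // found, legalization fails (return false below).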
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

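      // Rewrite the branch into SI_IF/SI_ELSE at the position of the original
      // G_BRCOND; the pseudo manipulates exec and branches to UncondBrTarget
      // when no lanes remain active.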
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

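      // amdgcn.loop is handled analogously: the G_BRCOND is replaced with
      // SI_LOOP, which consumes the loop mask and branches to the
      // unconditional target, while the original G_BR (or a newly built one)
      // is pointed at the conditional target.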
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
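  // The remaining argument intrinsics are lowered to copies from the
  // corresponding preloaded kernel or function argument registers.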
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
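  // Buffer load/store intrinsics share common legalizations; the boolean
  // flags distinguish the plain, format, and typed (tbuffer) variants.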
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
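  // All raw and struct buffer atomics funnel through a single legalization
  // keyed on the intrinsic ID.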
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
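  // Image (dim) intrinsics are handled by a table-driven path; any other
  // intrinsic that reaches here is already legal.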
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
