1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41 "amdgpu-global-isel-new-legality",
42 cl::desc("Use GlobalISel desired legality, rather than try to use"
43 "rules compatible with selection patterns"),
44 cl::init(false),
45 cl::ReallyHidden);
46
47 static constexpr unsigned MaxRegisterSize = 1024;
48
49 // Round the number of elements to the next power of two elements
getPow2VectorType(LLT Ty)50 static LLT getPow2VectorType(LLT Ty) {
51 unsigned NElts = Ty.getNumElements();
52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53 return Ty.changeNumElements(Pow2NElts);
54 }
55
56 // Round the number of bits to the next power of two bits
getPow2ScalarType(LLT Ty)57 static LLT getPow2ScalarType(LLT Ty) {
58 unsigned Bits = Ty.getSizeInBits();
59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60 return LLT::scalar(Pow2Bits);
61 }
62
isSmallOddVector(unsigned TypeIdx)63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64 return [=](const LegalityQuery &Query) {
65 const LLT Ty = Query.Types[TypeIdx];
66 return Ty.isVector() &&
67 Ty.getNumElements() % 2 != 0 &&
68 Ty.getElementType().getSizeInBits() < 32 &&
69 Ty.getSizeInBits() % 32 != 0;
70 };
71 }
72
isWideVec16(unsigned TypeIdx)73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74 return [=](const LegalityQuery &Query) {
75 const LLT Ty = Query.Types[TypeIdx];
76 const LLT EltTy = Ty.getScalarType();
77 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78 };
79 }
80
oneMoreElement(unsigned TypeIdx)81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82 return [=](const LegalityQuery &Query) {
83 const LLT Ty = Query.Types[TypeIdx];
84 const LLT EltTy = Ty.getElementType();
85 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86 };
87 }
88
fewerEltsToSize64Vector(unsigned TypeIdx)89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 const LLT EltTy = Ty.getElementType();
93 unsigned Size = Ty.getSizeInBits();
94 unsigned Pieces = (Size + 63) / 64;
95 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97 };
98 }
99
100 // Increase the number of vector elements to reach the next multiple of 32-bit
101 // type.
moreEltsToNext32Bit(unsigned TypeIdx)102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105
106 const LLT EltTy = Ty.getElementType();
107 const int Size = Ty.getSizeInBits();
108 const int EltSize = EltTy.getSizeInBits();
109 const int NextMul32 = (Size + 31) / 32;
110
111 assert(EltSize < 32);
112
113 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115 };
116 }
117
bitcastToRegisterType(unsigned TypeIdx)118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121 unsigned Size = Ty.getSizeInBits();
122
123 LLT CoercedTy;
124 if (Size <= 32) {
125 // <2 x s8> -> s16
126 // <4 x s8> -> s32
127 CoercedTy = LLT::scalar(Size);
128 } else
129 CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130
131 return std::make_pair(TypeIdx, CoercedTy);
132 };
133 }
134
vectorSmallerThan(unsigned TypeIdx,unsigned Size)135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136 return [=](const LegalityQuery &Query) {
137 const LLT QueryTy = Query.Types[TypeIdx];
138 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139 };
140 }
141
vectorWiderThan(unsigned TypeIdx,unsigned Size)142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143 return [=](const LegalityQuery &Query) {
144 const LLT QueryTy = Query.Types[TypeIdx];
145 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146 };
147 }
148
numElementsNotEven(unsigned TypeIdx)149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150 return [=](const LegalityQuery &Query) {
151 const LLT QueryTy = Query.Types[TypeIdx];
152 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153 };
154 }
155
isRegisterSize(unsigned Size)156 static bool isRegisterSize(unsigned Size) {
157 return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159
isRegisterVectorElementType(LLT EltTy)160 static bool isRegisterVectorElementType(LLT EltTy) {
161 const int EltSize = EltTy.getSizeInBits();
162 return EltSize == 16 || EltSize % 32 == 0;
163 }
164
isRegisterVectorType(LLT Ty)165 static bool isRegisterVectorType(LLT Ty) {
166 const int EltSize = Ty.getElementType().getSizeInBits();
167 return EltSize == 32 || EltSize == 64 ||
168 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169 EltSize == 128 || EltSize == 256;
170 }
171
isRegisterType(LLT Ty)172 static bool isRegisterType(LLT Ty) {
173 if (!isRegisterSize(Ty.getSizeInBits()))
174 return false;
175
176 if (Ty.isVector())
177 return isRegisterVectorType(Ty);
178
179 return true;
180 }
181
182 // Any combination of 32 or 64-bit elements up the maximum register size, and
183 // multiples of v2s16.
isRegisterType(unsigned TypeIdx)184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185 return [=](const LegalityQuery &Query) {
186 return isRegisterType(Query.Types[TypeIdx]);
187 };
188 }
189
elementTypeIsLegal(unsigned TypeIdx)190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191 return [=](const LegalityQuery &Query) {
192 const LLT QueryTy = Query.Types[TypeIdx];
193 if (!QueryTy.isVector())
194 return false;
195 const LLT EltTy = QueryTy.getElementType();
196 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197 };
198 }
199
isWideScalarTruncStore(unsigned TypeIdx)200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201 return [=](const LegalityQuery &Query) {
202 const LLT Ty = Query.Types[TypeIdx];
203 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205 };
206 }
207
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
maxSizeForAddrSpace(const GCNSubtarget & ST,unsigned AS,bool IsLoad)211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212 bool IsLoad) {
213 switch (AS) {
214 case AMDGPUAS::PRIVATE_ADDRESS:
215 // FIXME: Private element size.
216 return 32;
217 case AMDGPUAS::LOCAL_ADDRESS:
218 return ST.useDS128() ? 128 : 64;
219 case AMDGPUAS::GLOBAL_ADDRESS:
220 case AMDGPUAS::CONSTANT_ADDRESS:
221 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222 // Treat constant and global as identical. SMRD loads are sometimes usable for
223 // global loads (ideally constant address space should be eliminated)
224 // depending on the context. Legality cannot be context dependent, but
225 // RegBankSelect can split the load as necessary depending on the pointer
226 // register bank/uniformity and if the memory is invariant or not written in a
227 // kernel.
228 return IsLoad ? 512 : 128;
229 default:
230 // Flat addresses may contextually need to be split to 32-bit parts if they
231 // may alias scratch depending on the subtarget.
232 return 128;
233 }
234 }
235
isLoadStoreSizeLegal(const GCNSubtarget & ST,const LegalityQuery & Query,unsigned Opcode)236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237 const LegalityQuery &Query,
238 unsigned Opcode) {
239 const LLT Ty = Query.Types[0];
240
241 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242 const bool IsLoad = Opcode != AMDGPU::G_STORE;
243
244 unsigned RegSize = Ty.getSizeInBits();
245 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246 unsigned Align = Query.MMODescrs[0].AlignInBits;
247 unsigned AS = Query.Types[1].getAddressSpace();
248
249 // All of these need to be custom lowered to cast the pointer operand.
250 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251 return false;
252
253 // TODO: We should be able to widen loads if the alignment is high enough, but
254 // we also need to modify the memory access size.
255 #if 0
256 // Accept widening loads based on alignment.
257 if (IsLoad && MemSize < Size)
258 MemSize = std::max(MemSize, Align);
259 #endif
260
261 // Only 1-byte and 2-byte to 32-bit extloads are valid.
262 if (MemSize != RegSize && RegSize != 32)
263 return false;
264
265 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266 return false;
267
268 switch (MemSize) {
269 case 8:
270 case 16:
271 case 32:
272 case 64:
273 case 128:
274 break;
275 case 96:
276 if (!ST.hasDwordx3LoadStores())
277 return false;
278 break;
279 case 256:
280 case 512:
281 // These may contextually need to be broken down.
282 break;
283 default:
284 return false;
285 }
286
287 assert(RegSize >= MemSize);
288
289 if (Align < MemSize) {
290 const SITargetLowering *TLI = ST.getTargetLowering();
291 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292 return false;
293 }
294
295 return true;
296 }
297
298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
299 // workaround this. Eventually it should ignore the type for loads and only care
300 // about the size. Return true in cases where we will workaround this for now by
301 // bitcasting.
loadStoreBitcastWorkaround(const LLT Ty)302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303 if (EnableNewLegality)
304 return false;
305
306 const unsigned Size = Ty.getSizeInBits();
307 if (Size <= 64)
308 return false;
309 if (!Ty.isVector())
310 return true;
311 unsigned EltSize = Ty.getElementType().getSizeInBits();
312 return EltSize != 32 && EltSize != 64;
313 }
314
isLoadStoreLegal(const GCNSubtarget & ST,const LegalityQuery & Query,unsigned Opcode)315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316 unsigned Opcode) {
317 const LLT Ty = Query.Types[0];
318 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319 !loadStoreBitcastWorkaround(Ty);
320 }
321
AMDGPULegalizerInfo(const GCNSubtarget & ST_,const GCNTargetMachine & TM)322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323 const GCNTargetMachine &TM)
324 : ST(ST_) {
325 using namespace TargetOpcode;
326
327 auto GetAddrSpacePtr = [&TM](unsigned AS) {
328 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329 };
330
331 const LLT S1 = LLT::scalar(1);
332 const LLT S16 = LLT::scalar(16);
333 const LLT S32 = LLT::scalar(32);
334 const LLT S64 = LLT::scalar(64);
335 const LLT S128 = LLT::scalar(128);
336 const LLT S256 = LLT::scalar(256);
337 const LLT S512 = LLT::scalar(512);
338 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339
340 const LLT V2S16 = LLT::vector(2, 16);
341 const LLT V4S16 = LLT::vector(4, 16);
342
343 const LLT V2S32 = LLT::vector(2, 32);
344 const LLT V3S32 = LLT::vector(3, 32);
345 const LLT V4S32 = LLT::vector(4, 32);
346 const LLT V5S32 = LLT::vector(5, 32);
347 const LLT V6S32 = LLT::vector(6, 32);
348 const LLT V7S32 = LLT::vector(7, 32);
349 const LLT V8S32 = LLT::vector(8, 32);
350 const LLT V9S32 = LLT::vector(9, 32);
351 const LLT V10S32 = LLT::vector(10, 32);
352 const LLT V11S32 = LLT::vector(11, 32);
353 const LLT V12S32 = LLT::vector(12, 32);
354 const LLT V13S32 = LLT::vector(13, 32);
355 const LLT V14S32 = LLT::vector(14, 32);
356 const LLT V15S32 = LLT::vector(15, 32);
357 const LLT V16S32 = LLT::vector(16, 32);
358 const LLT V32S32 = LLT::vector(32, 32);
359
360 const LLT V2S64 = LLT::vector(2, 64);
361 const LLT V3S64 = LLT::vector(3, 64);
362 const LLT V4S64 = LLT::vector(4, 64);
363 const LLT V5S64 = LLT::vector(5, 64);
364 const LLT V6S64 = LLT::vector(6, 64);
365 const LLT V7S64 = LLT::vector(7, 64);
366 const LLT V8S64 = LLT::vector(8, 64);
367 const LLT V16S64 = LLT::vector(16, 64);
368
369 std::initializer_list<LLT> AllS32Vectors =
370 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372 std::initializer_list<LLT> AllS64Vectors =
373 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374
375 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382
383 const LLT CodePtr = FlatPtr;
384
385 const std::initializer_list<LLT> AddrSpaces64 = {
386 GlobalPtr, ConstantPtr, FlatPtr
387 };
388
389 const std::initializer_list<LLT> AddrSpaces32 = {
390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391 };
392
393 const std::initializer_list<LLT> FPTypesBase = {
394 S32, S64
395 };
396
397 const std::initializer_list<LLT> FPTypes16 = {
398 S32, S64, S16
399 };
400
401 const std::initializer_list<LLT> FPTypesPK16 = {
402 S32, S64, S16, V2S16
403 };
404
405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406
407 setAction({G_BRCOND, S1}, Legal); // VCC branches
408 setAction({G_BRCOND, S32}, Legal); // SCC branches
409
410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411 // elements for v3s16
412 getActionDefinitionsBuilder(G_PHI)
413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414 .legalFor(AllS32Vectors)
415 .legalFor(AllS64Vectors)
416 .legalFor(AddrSpaces64)
417 .legalFor(AddrSpaces32)
418 .clampScalar(0, S32, S256)
419 .widenScalarToNextPow2(0, 32)
420 .clampMaxNumElements(0, S32, 16)
421 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422 .legalIf(isPointer(0));
423
424 if (ST.hasVOP3PInsts()) {
425 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426 .legalFor({S32, S16, V2S16})
427 .clampScalar(0, S16, S32)
428 .clampMaxNumElements(0, S16, 2)
429 .scalarize(0)
430 .widenScalarToNextPow2(0, 32);
431 } else if (ST.has16BitInsts()) {
432 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433 .legalFor({S32, S16})
434 .clampScalar(0, S16, S32)
435 .scalarize(0)
436 .widenScalarToNextPow2(0, 32);
437 } else {
438 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439 .legalFor({S32})
440 .clampScalar(0, S32, S32)
441 .scalarize(0);
442 }
443
444 // FIXME: Not really legal. Placeholder for custom lowering.
445 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
446 .customFor({S32, S64})
447 .clampScalar(0, S32, S64)
448 .widenScalarToNextPow2(0, 32)
449 .scalarize(0);
450
451 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
452 .legalFor({S32})
453 .clampScalar(0, S32, S32)
454 .scalarize(0);
455
456 // Report legal for any types we can handle anywhere. For the cases only legal
457 // on the SALU, RegBankSelect will be able to re-legalize.
458 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
459 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
460 .clampScalar(0, S32, S64)
461 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
462 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
463 .widenScalarToNextPow2(0)
464 .scalarize(0);
465
466 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
467 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
468 .legalFor({{S32, S1}, {S32, S32}})
469 .minScalar(0, S32)
470 // TODO: .scalarize(0)
471 .lower();
472
473 getActionDefinitionsBuilder(G_BITCAST)
474 // Don't worry about the size constraint.
475 .legalIf(all(isRegisterType(0), isRegisterType(1)))
476 .lower();
477
478
479 getActionDefinitionsBuilder(G_CONSTANT)
480 .legalFor({S1, S32, S64, S16, GlobalPtr,
481 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
482 .clampScalar(0, S32, S64)
483 .widenScalarToNextPow2(0)
484 .legalIf(isPointer(0));
485
486 getActionDefinitionsBuilder(G_FCONSTANT)
487 .legalFor({S32, S64, S16})
488 .clampScalar(0, S16, S64);
489
490 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
491 .legalIf(isRegisterType(0))
492 // s1 and s16 are special cases because they have legal operations on
493 // them, but don't really occupy registers in the normal way.
494 .legalFor({S1, S16})
495 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496 .clampScalarOrElt(0, S32, MaxScalar)
497 .widenScalarToNextPow2(0, 32)
498 .clampMaxNumElements(0, S32, 16);
499
500 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
501
502 // If the amount is divergent, we have to do a wave reduction to get the
503 // maximum value, so this is expanded during RegBankSelect.
504 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
505 .legalFor({{PrivatePtr, S32}});
506
507 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
508 .unsupportedFor({PrivatePtr})
509 .custom();
510 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
511
512 auto &FPOpActions = getActionDefinitionsBuilder(
513 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
514 .legalFor({S32, S64});
515 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
516 .customFor({S32, S64});
517 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
518 .customFor({S32, S64});
519
520 if (ST.has16BitInsts()) {
521 if (ST.hasVOP3PInsts())
522 FPOpActions.legalFor({S16, V2S16});
523 else
524 FPOpActions.legalFor({S16});
525
526 TrigActions.customFor({S16});
527 FDIVActions.customFor({S16});
528 }
529
530 auto &MinNumMaxNum = getActionDefinitionsBuilder({
531 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
532
533 if (ST.hasVOP3PInsts()) {
534 MinNumMaxNum.customFor(FPTypesPK16)
535 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
536 .clampMaxNumElements(0, S16, 2)
537 .clampScalar(0, S16, S64)
538 .scalarize(0);
539 } else if (ST.has16BitInsts()) {
540 MinNumMaxNum.customFor(FPTypes16)
541 .clampScalar(0, S16, S64)
542 .scalarize(0);
543 } else {
544 MinNumMaxNum.customFor(FPTypesBase)
545 .clampScalar(0, S32, S64)
546 .scalarize(0);
547 }
548
549 if (ST.hasVOP3PInsts())
550 FPOpActions.clampMaxNumElements(0, S16, 2);
551
552 FPOpActions
553 .scalarize(0)
554 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
555
556 TrigActions
557 .scalarize(0)
558 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
559
560 FDIVActions
561 .scalarize(0)
562 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
563
564 getActionDefinitionsBuilder({G_FNEG, G_FABS})
565 .legalFor(FPTypesPK16)
566 .clampMaxNumElements(0, S16, 2)
567 .scalarize(0)
568 .clampScalar(0, S16, S64);
569
570 if (ST.has16BitInsts()) {
571 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
572 .legalFor({S32, S64, S16})
573 .scalarize(0)
574 .clampScalar(0, S16, S64);
575 } else {
576 getActionDefinitionsBuilder(G_FSQRT)
577 .legalFor({S32, S64})
578 .scalarize(0)
579 .clampScalar(0, S32, S64);
580
581 if (ST.hasFractBug()) {
582 getActionDefinitionsBuilder(G_FFLOOR)
583 .customFor({S64})
584 .legalFor({S32, S64})
585 .scalarize(0)
586 .clampScalar(0, S32, S64);
587 } else {
588 getActionDefinitionsBuilder(G_FFLOOR)
589 .legalFor({S32, S64})
590 .scalarize(0)
591 .clampScalar(0, S32, S64);
592 }
593 }
594
595 getActionDefinitionsBuilder(G_FPTRUNC)
596 .legalFor({{S32, S64}, {S16, S32}})
597 .scalarize(0)
598 .lower();
599
600 getActionDefinitionsBuilder(G_FPEXT)
601 .legalFor({{S64, S32}, {S32, S16}})
602 .lowerFor({{S64, S16}}) // FIXME: Implement
603 .scalarize(0);
604
605 getActionDefinitionsBuilder(G_FSUB)
606 // Use actual fsub instruction
607 .legalFor({S32})
608 // Must use fadd + fneg
609 .lowerFor({S64, S16, V2S16})
610 .scalarize(0)
611 .clampScalar(0, S32, S64);
612
613 // Whether this is legal depends on the floating point mode for the function.
614 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
615 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
616 FMad.customFor({S32, S16});
617 else if (ST.hasMadMacF32Insts())
618 FMad.customFor({S32});
619 else if (ST.hasMadF16())
620 FMad.customFor({S16});
621 FMad.scalarize(0)
622 .lower();
623
624 // TODO: Do we need to clamp maximum bitwidth?
625 getActionDefinitionsBuilder(G_TRUNC)
626 .legalIf(isScalar(0))
627 .legalFor({{V2S16, V2S32}})
628 .clampMaxNumElements(0, S16, 2)
629 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
630 // situations (like an invalid implicit use), we don't want to infinite loop
631 // in the legalizer.
632 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
633 .alwaysLegal();
634
635 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
636 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
637 {S32, S1}, {S64, S1}, {S16, S1}})
638 .scalarize(0)
639 .clampScalar(0, S32, S64)
640 .widenScalarToNextPow2(1, 32);
641
642 // TODO: Split s1->s64 during regbankselect for VALU.
643 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
644 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
645 .lowerFor({{S32, S64}})
646 .lowerIf(typeIs(1, S1))
647 .customFor({{S64, S64}});
648 if (ST.has16BitInsts())
649 IToFP.legalFor({{S16, S16}});
650 IToFP.clampScalar(1, S32, S64)
651 .scalarize(0)
652 .widenScalarToNextPow2(1);
653
654 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
655 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
656 .customFor({{S64, S64}});
657 if (ST.has16BitInsts())
658 FPToI.legalFor({{S16, S16}});
659 else
660 FPToI.minScalar(1, S32);
661
662 FPToI.minScalar(0, S32)
663 .scalarize(0)
664 .lower();
665
666 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
667 .scalarize(0)
668 .lower();
669
670 if (ST.has16BitInsts()) {
671 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
672 .legalFor({S16, S32, S64})
673 .clampScalar(0, S16, S64)
674 .scalarize(0);
675 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
676 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
677 .legalFor({S32, S64})
678 .clampScalar(0, S32, S64)
679 .scalarize(0);
680 } else {
681 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
682 .legalFor({S32})
683 .customFor({S64})
684 .clampScalar(0, S32, S64)
685 .scalarize(0);
686 }
687
688 // FIXME: Clamp offset operand.
689 getActionDefinitionsBuilder(G_PTR_ADD)
690 .legalIf(isPointer(0))
691 .scalarize(0);
692
693 getActionDefinitionsBuilder(G_PTRMASK)
694 .legalIf(typeInSet(1, {S64, S32}))
695 .minScalar(1, S32)
696 .maxScalarIf(sizeIs(0, 32), 1, S32)
697 .maxScalarIf(sizeIs(0, 64), 1, S64)
698 .scalarize(0);
699
700 auto &CmpBuilder =
701 getActionDefinitionsBuilder(G_ICMP)
702 // The compare output type differs based on the register bank of the output,
703 // so make both s1 and s32 legal.
704 //
705 // Scalar compares producing output in scc will be promoted to s32, as that
706 // is the allocatable register type that will be needed for the copy from
707 // scc. This will be promoted during RegBankSelect, and we assume something
708 // before that won't try to use s32 result types.
709 //
710 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
711 // bank.
712 .legalForCartesianProduct(
713 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
714 .legalForCartesianProduct(
715 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
716 if (ST.has16BitInsts()) {
717 CmpBuilder.legalFor({{S1, S16}});
718 }
719
720 CmpBuilder
721 .widenScalarToNextPow2(1)
722 .clampScalar(1, S32, S64)
723 .scalarize(0)
724 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
725
726 getActionDefinitionsBuilder(G_FCMP)
727 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
728 .widenScalarToNextPow2(1)
729 .clampScalar(1, S32, S64)
730 .scalarize(0);
731
732 // FIXME: fpow has a selection pattern that should move to custom lowering.
733 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
734 if (ST.has16BitInsts())
735 Exp2Ops.legalFor({S32, S16});
736 else
737 Exp2Ops.legalFor({S32});
738 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
739 Exp2Ops.scalarize(0);
740
741 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
742 if (ST.has16BitInsts())
743 ExpOps.customFor({{S32}, {S16}});
744 else
745 ExpOps.customFor({S32});
746 ExpOps.clampScalar(0, MinScalarFPTy, S32)
747 .scalarize(0);
748
749 // The 64-bit versions produce 32-bit results, but only on the SALU.
750 getActionDefinitionsBuilder(G_CTPOP)
751 .legalFor({{S32, S32}, {S32, S64}})
752 .clampScalar(0, S32, S32)
753 .clampScalar(1, S32, S64)
754 .scalarize(0)
755 .widenScalarToNextPow2(0, 32)
756 .widenScalarToNextPow2(1, 32);
757
758 // The hardware instructions return a different result on 0 than the generic
759 // instructions expect. The hardware produces -1, but these produce the
760 // bitwidth.
761 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
762 .scalarize(0)
763 .clampScalar(0, S32, S32)
764 .clampScalar(1, S32, S64)
765 .widenScalarToNextPow2(0, 32)
766 .widenScalarToNextPow2(1, 32)
767 .lower();
768
769 // The 64-bit versions produce 32-bit results, but only on the SALU.
770 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
771 .legalFor({{S32, S32}, {S32, S64}})
772 .clampScalar(0, S32, S32)
773 .clampScalar(1, S32, S64)
774 .scalarize(0)
775 .widenScalarToNextPow2(0, 32)
776 .widenScalarToNextPow2(1, 32);
777
778 getActionDefinitionsBuilder(G_BITREVERSE)
779 .legalFor({S32})
780 .clampScalar(0, S32, S32)
781 .scalarize(0);
782
783 if (ST.has16BitInsts()) {
784 getActionDefinitionsBuilder(G_BSWAP)
785 .legalFor({S16, S32, V2S16})
786 .clampMaxNumElements(0, S16, 2)
787 // FIXME: Fixing non-power-of-2 before clamp is workaround for
788 // narrowScalar limitation.
789 .widenScalarToNextPow2(0)
790 .clampScalar(0, S16, S32)
791 .scalarize(0);
792
793 if (ST.hasVOP3PInsts()) {
794 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
795 .legalFor({S32, S16, V2S16})
796 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
797 .clampMaxNumElements(0, S16, 2)
798 .minScalar(0, S16)
799 .widenScalarToNextPow2(0)
800 .scalarize(0)
801 .lower();
802 } else {
803 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
804 .legalFor({S32, S16})
805 .widenScalarToNextPow2(0)
806 .minScalar(0, S16)
807 .scalarize(0)
808 .lower();
809 }
810 } else {
811 // TODO: Should have same legality without v_perm_b32
812 getActionDefinitionsBuilder(G_BSWAP)
813 .legalFor({S32})
814 .lowerIf(scalarNarrowerThan(0, 32))
815 // FIXME: Fixing non-power-of-2 before clamp is workaround for
816 // narrowScalar limitation.
817 .widenScalarToNextPow2(0)
818 .maxScalar(0, S32)
819 .scalarize(0)
820 .lower();
821
822 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
823 .legalFor({S32})
824 .minScalar(0, S32)
825 .widenScalarToNextPow2(0)
826 .scalarize(0)
827 .lower();
828 }
829
830 getActionDefinitionsBuilder(G_INTTOPTR)
831 // List the common cases
832 .legalForCartesianProduct(AddrSpaces64, {S64})
833 .legalForCartesianProduct(AddrSpaces32, {S32})
834 .scalarize(0)
835 // Accept any address space as long as the size matches
836 .legalIf(sameSize(0, 1))
837 .widenScalarIf(smallerThan(1, 0),
838 [](const LegalityQuery &Query) {
839 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
840 })
841 .narrowScalarIf(largerThan(1, 0),
842 [](const LegalityQuery &Query) {
843 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
844 });
845
846 getActionDefinitionsBuilder(G_PTRTOINT)
847 // List the common cases
848 .legalForCartesianProduct(AddrSpaces64, {S64})
849 .legalForCartesianProduct(AddrSpaces32, {S32})
850 .scalarize(0)
851 // Accept any address space as long as the size matches
852 .legalIf(sameSize(0, 1))
853 .widenScalarIf(smallerThan(0, 1),
854 [](const LegalityQuery &Query) {
855 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
856 })
857 .narrowScalarIf(
858 largerThan(0, 1),
859 [](const LegalityQuery &Query) {
860 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
861 });
862
863 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
864 .scalarize(0)
865 .custom();
866
867 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
868 bool IsLoad) -> bool {
869 const LLT DstTy = Query.Types[0];
870
871 // Split vector extloads.
872 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
873 unsigned Align = Query.MMODescrs[0].AlignInBits;
874
875 if (MemSize < DstTy.getSizeInBits())
876 MemSize = std::max(MemSize, Align);
877
878 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
879 return true;
880
881 const LLT PtrTy = Query.Types[1];
882 unsigned AS = PtrTy.getAddressSpace();
883 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
884 return true;
885
886 // Catch weird sized loads that don't evenly divide into the access sizes
887 // TODO: May be able to widen depending on alignment etc.
888 unsigned NumRegs = (MemSize + 31) / 32;
889 if (NumRegs == 3) {
890 if (!ST.hasDwordx3LoadStores())
891 return true;
892 } else {
893 // If the alignment allows, these should have been widened.
894 if (!isPowerOf2_32(NumRegs))
895 return true;
896 }
897
898 if (Align < MemSize) {
899 const SITargetLowering *TLI = ST.getTargetLowering();
900 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
901 }
902
903 return false;
904 };
905
906 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
907 unsigned Opc) -> bool {
908 unsigned Size = Query.Types[0].getSizeInBits();
909 if (isPowerOf2_32(Size))
910 return false;
911
912 if (Size == 96 && ST.hasDwordx3LoadStores())
913 return false;
914
915 unsigned AddrSpace = Query.Types[1].getAddressSpace();
916 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
917 return false;
918
919 unsigned Align = Query.MMODescrs[0].AlignInBits;
920 unsigned RoundedSize = NextPowerOf2(Size);
921 return (Align >= RoundedSize);
922 };
923
924 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
925 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
926 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
927
928 // TODO: Refine based on subtargets which support unaligned access or 128-bit
929 // LDS
930 // TODO: Unsupported flat for SI.
931
932 for (unsigned Op : {G_LOAD, G_STORE}) {
933 const bool IsStore = Op == G_STORE;
934
935 auto &Actions = getActionDefinitionsBuilder(Op);
936 // Explicitly list some common cases.
937 // TODO: Does this help compile time at all?
938 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
939 {V2S32, GlobalPtr, 64, GlobalAlign32},
940 {V4S32, GlobalPtr, 128, GlobalAlign32},
941 {S64, GlobalPtr, 64, GlobalAlign32},
942 {V2S64, GlobalPtr, 128, GlobalAlign32},
943 {V2S16, GlobalPtr, 32, GlobalAlign32},
944 {S32, GlobalPtr, 8, GlobalAlign8},
945 {S32, GlobalPtr, 16, GlobalAlign16},
946
947 {S32, LocalPtr, 32, 32},
948 {S64, LocalPtr, 64, 32},
949 {V2S32, LocalPtr, 64, 32},
950 {S32, LocalPtr, 8, 8},
951 {S32, LocalPtr, 16, 16},
952 {V2S16, LocalPtr, 32, 32},
953
954 {S32, PrivatePtr, 32, 32},
955 {S32, PrivatePtr, 8, 8},
956 {S32, PrivatePtr, 16, 16},
957 {V2S16, PrivatePtr, 32, 32},
958
959 {S32, ConstantPtr, 32, GlobalAlign32},
960 {V2S32, ConstantPtr, 64, GlobalAlign32},
961 {V4S32, ConstantPtr, 128, GlobalAlign32},
962 {S64, ConstantPtr, 64, GlobalAlign32},
963 {V2S32, ConstantPtr, 32, GlobalAlign32}});
964 Actions.legalIf(
965 [=](const LegalityQuery &Query) -> bool {
966 return isLoadStoreLegal(ST, Query, Op);
967 });
968
969 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
970 // 64-bits.
971 //
972 // TODO: Should generalize bitcast action into coerce, which will also cover
973 // inserting addrspacecasts.
974 Actions.customIf(typeIs(1, Constant32Ptr));
975
976 // Turn any illegal element vectors into something easier to deal
977 // with. These will ultimately produce 32-bit scalar shifts to extract the
978 // parts anyway.
979 //
980 // For odd 16-bit element vectors, prefer to split those into pieces with
981 // 16-bit vector parts.
982 Actions.bitcastIf(
983 [=](const LegalityQuery &Query) -> bool {
984 const LLT Ty = Query.Types[0];
985 const unsigned Size = Ty.getSizeInBits();
986
987 if (Size != Query.MMODescrs[0].SizeInBits)
988 return Size <= 32 && Ty.isVector();
989
990 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
991 return true;
992 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
993 !isRegisterVectorElementType(Ty.getElementType());
994 }, bitcastToRegisterType(0));
995
996 Actions
997 .customIf(typeIs(1, Constant32Ptr))
998 // Widen suitably aligned loads by loading extra elements.
999 .moreElementsIf([=](const LegalityQuery &Query) {
1000 const LLT Ty = Query.Types[0];
1001 return Op == G_LOAD && Ty.isVector() &&
1002 shouldWidenLoadResult(Query, Op);
1003 }, moreElementsToNextPow2(0))
1004 .widenScalarIf([=](const LegalityQuery &Query) {
1005 const LLT Ty = Query.Types[0];
1006 return Op == G_LOAD && !Ty.isVector() &&
1007 shouldWidenLoadResult(Query, Op);
1008 }, widenScalarOrEltToNextPow2(0))
1009 .narrowScalarIf(
1010 [=](const LegalityQuery &Query) -> bool {
1011 return !Query.Types[0].isVector() &&
1012 needToSplitMemOp(Query, Op == G_LOAD);
1013 },
1014 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1015 const LLT DstTy = Query.Types[0];
1016 const LLT PtrTy = Query.Types[1];
1017
1018 const unsigned DstSize = DstTy.getSizeInBits();
1019 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1020
1021 // Split extloads.
1022 if (DstSize > MemSize)
1023 return std::make_pair(0, LLT::scalar(MemSize));
1024
1025 if (!isPowerOf2_32(DstSize)) {
1026 // We're probably decomposing an odd sized store. Try to split
1027 // to the widest type. TODO: Account for alignment. As-is it
1028 // should be OK, since the new parts will be further legalized.
1029 unsigned FloorSize = PowerOf2Floor(DstSize);
1030 return std::make_pair(0, LLT::scalar(FloorSize));
1031 }
1032
1033 if (DstSize > 32 && (DstSize % 32 != 0)) {
1034 // FIXME: Need a way to specify non-extload of larger size if
1035 // suitably aligned.
1036 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1037 }
1038
1039 unsigned MaxSize = maxSizeForAddrSpace(ST,
1040 PtrTy.getAddressSpace(),
1041 Op == G_LOAD);
1042 if (MemSize > MaxSize)
1043 return std::make_pair(0, LLT::scalar(MaxSize));
1044
1045 unsigned Align = Query.MMODescrs[0].AlignInBits;
1046 return std::make_pair(0, LLT::scalar(Align));
1047 })
1048 .fewerElementsIf(
1049 [=](const LegalityQuery &Query) -> bool {
1050 return Query.Types[0].isVector() &&
1051 needToSplitMemOp(Query, Op == G_LOAD);
1052 },
1053 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1054 const LLT DstTy = Query.Types[0];
1055 const LLT PtrTy = Query.Types[1];
1056
1057 LLT EltTy = DstTy.getElementType();
1058 unsigned MaxSize = maxSizeForAddrSpace(ST,
1059 PtrTy.getAddressSpace(),
1060 Op == G_LOAD);
1061
1062 // FIXME: Handle widened to power of 2 results better. This ends
1063 // up scalarizing.
1064 // FIXME: 3 element stores scalarized on SI
1065
1066 // Split if it's too large for the address space.
1067 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1068 unsigned NumElts = DstTy.getNumElements();
1069 unsigned EltSize = EltTy.getSizeInBits();
1070
1071 if (MaxSize % EltSize == 0) {
1072 return std::make_pair(
1073 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1074 }
1075
1076 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1077
1078 // FIXME: Refine when odd breakdowns handled
1079 // The scalars will need to be re-legalized.
1080 if (NumPieces == 1 || NumPieces >= NumElts ||
1081 NumElts % NumPieces != 0)
1082 return std::make_pair(0, EltTy);
1083
1084 return std::make_pair(0,
1085 LLT::vector(NumElts / NumPieces, EltTy));
1086 }
1087
1088 // FIXME: We could probably handle weird extending loads better.
1089 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1090 if (DstTy.getSizeInBits() > MemSize)
1091 return std::make_pair(0, EltTy);
1092
1093 unsigned EltSize = EltTy.getSizeInBits();
1094 unsigned DstSize = DstTy.getSizeInBits();
1095 if (!isPowerOf2_32(DstSize)) {
1096 // We're probably decomposing an odd sized store. Try to split
1097 // to the widest type. TODO: Account for alignment. As-is it
1098 // should be OK, since the new parts will be further legalized.
1099 unsigned FloorSize = PowerOf2Floor(DstSize);
1100 return std::make_pair(
1101 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1102 }
1103
1104 // Need to split because of alignment.
1105 unsigned Align = Query.MMODescrs[0].AlignInBits;
1106 if (EltSize > Align &&
1107 (EltSize / Align < DstTy.getNumElements())) {
1108 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1109 }
1110
1111 // May need relegalization for the scalars.
1112 return std::make_pair(0, EltTy);
1113 })
1114 .minScalar(0, S32);
1115
1116 if (IsStore)
1117 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1118
1119 // TODO: Need a bitcast lower option?
1120 Actions
1121 .widenScalarToNextPow2(0)
1122 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1123 }
1124
1125 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1126 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1127 {S32, GlobalPtr, 16, 2 * 8},
1128 {S32, LocalPtr, 8, 8},
1129 {S32, LocalPtr, 16, 16},
1130 {S32, PrivatePtr, 8, 8},
1131 {S32, PrivatePtr, 16, 16},
1132 {S32, ConstantPtr, 8, 8},
1133 {S32, ConstantPtr, 16, 2 * 8}});
1134 if (ST.hasFlatAddressSpace()) {
1135 ExtLoads.legalForTypesWithMemDesc(
1136 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1137 }
1138
1139 ExtLoads.clampScalar(0, S32, S32)
1140 .widenScalarToNextPow2(0)
1141 .unsupportedIfMemSizeNotPow2()
1142 .lower();
1143
1144 auto &Atomics = getActionDefinitionsBuilder(
1145 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1146 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1147 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1148 G_ATOMICRMW_UMIN})
1149 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1150 {S64, GlobalPtr}, {S64, LocalPtr}});
1151 if (ST.hasFlatAddressSpace()) {
1152 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1153 }
1154
1155 if (ST.hasLDSFPAtomics()) {
1156 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1157 .legalFor({{S32, LocalPtr}});
1158 }
1159
1160 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1161 // demarshalling
1162 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1163 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1164 {S32, FlatPtr}, {S64, FlatPtr}})
1165 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1166 {S32, RegionPtr}, {S64, RegionPtr}});
1167 // TODO: Pointer types, any 32-bit or 64-bit vector
1168
1169 // Condition should be s32 for scalar, s1 for vector.
1170 getActionDefinitionsBuilder(G_SELECT)
1171 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1172 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1173 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1174 .clampScalar(0, S16, S64)
1175 .scalarize(1)
1176 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1177 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1178 .clampMaxNumElements(0, S32, 2)
1179 .clampMaxNumElements(0, LocalPtr, 2)
1180 .clampMaxNumElements(0, PrivatePtr, 2)
1181 .scalarize(0)
1182 .widenScalarToNextPow2(0)
1183 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1184
1185 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1186 // be more flexible with the shift amount type.
1187 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1188 .legalFor({{S32, S32}, {S64, S32}});
1189 if (ST.has16BitInsts()) {
1190 if (ST.hasVOP3PInsts()) {
1191 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1192 .clampMaxNumElements(0, S16, 2);
1193 } else
1194 Shifts.legalFor({{S16, S16}});
1195
1196 // TODO: Support 16-bit shift amounts for all types
1197 Shifts.widenScalarIf(
1198 [=](const LegalityQuery &Query) {
1199 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1200 // 32-bit amount.
1201 const LLT ValTy = Query.Types[0];
1202 const LLT AmountTy = Query.Types[1];
1203 return ValTy.getSizeInBits() <= 16 &&
1204 AmountTy.getSizeInBits() < 16;
1205 }, changeTo(1, S16));
1206 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1207 Shifts.clampScalar(1, S32, S32);
1208 Shifts.clampScalar(0, S16, S64);
1209 Shifts.widenScalarToNextPow2(0, 16);
1210 } else {
1211 // Make sure we legalize the shift amount type first, as the general
1212 // expansion for the shifted type will produce much worse code if it hasn't
1213 // been truncated already.
1214 Shifts.clampScalar(1, S32, S32);
1215 Shifts.clampScalar(0, S32, S64);
1216 Shifts.widenScalarToNextPow2(0, 32);
1217 }
1218 Shifts.scalarize(0);
1219
1220 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1221 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1222 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1223 unsigned IdxTypeIdx = 2;
1224
1225 getActionDefinitionsBuilder(Op)
1226 .customIf([=](const LegalityQuery &Query) {
1227 const LLT EltTy = Query.Types[EltTypeIdx];
1228 const LLT VecTy = Query.Types[VecTypeIdx];
1229 const LLT IdxTy = Query.Types[IdxTypeIdx];
1230 return (EltTy.getSizeInBits() == 16 ||
1231 EltTy.getSizeInBits() % 32 == 0) &&
1232 VecTy.getSizeInBits() % 32 == 0 &&
1233 VecTy.getSizeInBits() <= MaxRegisterSize &&
1234 IdxTy.getSizeInBits() == 32;
1235 })
1236 .clampScalar(EltTypeIdx, S32, S64)
1237 .clampScalar(VecTypeIdx, S32, S64)
1238 .clampScalar(IdxTypeIdx, S32, S32);
1239 }
1240
1241 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1242 .unsupportedIf([=](const LegalityQuery &Query) {
1243 const LLT &EltTy = Query.Types[1].getElementType();
1244 return Query.Types[0] != EltTy;
1245 });
1246
1247 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1248 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1249 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1250
1251 // FIXME: Doesn't handle extract of illegal sizes.
1252 getActionDefinitionsBuilder(Op)
1253 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1254 // FIXME: Multiples of 16 should not be legal.
1255 .legalIf([=](const LegalityQuery &Query) {
1256 const LLT BigTy = Query.Types[BigTyIdx];
1257 const LLT LitTy = Query.Types[LitTyIdx];
1258 return (BigTy.getSizeInBits() % 32 == 0) &&
1259 (LitTy.getSizeInBits() % 16 == 0);
1260 })
1261 .widenScalarIf(
1262 [=](const LegalityQuery &Query) {
1263 const LLT BigTy = Query.Types[BigTyIdx];
1264 return (BigTy.getScalarSizeInBits() < 16);
1265 },
1266 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1267 .widenScalarIf(
1268 [=](const LegalityQuery &Query) {
1269 const LLT LitTy = Query.Types[LitTyIdx];
1270 return (LitTy.getScalarSizeInBits() < 16);
1271 },
1272 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1273 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1274 .widenScalarToNextPow2(BigTyIdx, 32);
1275
1276 }
1277
1278 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1279 .legalForCartesianProduct(AllS32Vectors, {S32})
1280 .legalForCartesianProduct(AllS64Vectors, {S64})
1281 .clampNumElements(0, V16S32, V32S32)
1282 .clampNumElements(0, V2S64, V16S64)
1283 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1284
1285 if (ST.hasScalarPackInsts()) {
1286 BuildVector
1287 // FIXME: Should probably widen s1 vectors straight to s32
1288 .minScalarOrElt(0, S16)
1289 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1290 .minScalar(1, S32);
1291
1292 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1293 .legalFor({V2S16, S32})
1294 .lower();
1295 BuildVector.minScalarOrElt(0, S32);
1296 } else {
1297 BuildVector.customFor({V2S16, S16});
1298 BuildVector.minScalarOrElt(0, S32);
1299
1300 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1301 .customFor({V2S16, S32})
1302 .lower();
1303 }
1304
1305 BuildVector.legalIf(isRegisterType(0));
1306
1307 // FIXME: Clamp maximum size
1308 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1309 .legalIf(isRegisterType(0));
1310
1311 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1312 // pre-legalize.
1313 if (ST.hasVOP3PInsts()) {
1314 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1315 .customFor({V2S16, V2S16})
1316 .lower();
1317 } else
1318 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1319
1320 // Merge/Unmerge
1321 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1322 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1323 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1324
1325 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1326 const LLT Ty = Query.Types[TypeIdx];
1327 if (Ty.isVector()) {
1328 const LLT &EltTy = Ty.getElementType();
1329 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1330 return true;
1331 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1332 return true;
1333 }
1334 return false;
1335 };
1336
1337 auto &Builder = getActionDefinitionsBuilder(Op)
1338 .lowerFor({{S16, V2S16}})
1339 .lowerIf([=](const LegalityQuery &Query) {
1340 const LLT BigTy = Query.Types[BigTyIdx];
1341 return BigTy.getSizeInBits() == 32;
1342 })
1343 // Try to widen to s16 first for small types.
1344 // TODO: Only do this on targets with legal s16 shifts
1345 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1346 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1347 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1348 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1349 elementTypeIs(1, S16)),
1350 changeTo(1, V2S16))
1351 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1352 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1353 // valid.
1354 .clampScalar(LitTyIdx, S32, S512)
1355 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1356 // Break up vectors with weird elements into scalars
1357 .fewerElementsIf(
1358 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1359 scalarize(0))
1360 .fewerElementsIf(
1361 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1362 scalarize(1))
1363 .clampScalar(BigTyIdx, S32, MaxScalar);
1364
1365 if (Op == G_MERGE_VALUES) {
1366 Builder.widenScalarIf(
1367 // TODO: Use 16-bit shifts if legal for 8-bit values?
1368 [=](const LegalityQuery &Query) {
1369 const LLT Ty = Query.Types[LitTyIdx];
1370 return Ty.getSizeInBits() < 32;
1371 },
1372 changeTo(LitTyIdx, S32));
1373 }
1374
1375 Builder.widenScalarIf(
1376 [=](const LegalityQuery &Query) {
1377 const LLT Ty = Query.Types[BigTyIdx];
1378 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1379 Ty.getSizeInBits() % 16 != 0;
1380 },
1381 [=](const LegalityQuery &Query) {
1382 // Pick the next power of 2, or a multiple of 64 over 128.
1383 // Whichever is smaller.
1384 const LLT &Ty = Query.Types[BigTyIdx];
1385 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1386 if (NewSizeInBits >= 256) {
1387 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1388 if (RoundedTo < NewSizeInBits)
1389 NewSizeInBits = RoundedTo;
1390 }
1391 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1392 })
1393 .legalIf([=](const LegalityQuery &Query) {
1394 const LLT &BigTy = Query.Types[BigTyIdx];
1395 const LLT &LitTy = Query.Types[LitTyIdx];
1396
1397 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1398 return false;
1399 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1400 return false;
1401
1402 return BigTy.getSizeInBits() % 16 == 0 &&
1403 LitTy.getSizeInBits() % 16 == 0 &&
1404 BigTy.getSizeInBits() <= MaxRegisterSize;
1405 })
1406 // Any vectors left are the wrong size. Scalarize them.
1407 .scalarize(0)
1408 .scalarize(1);
1409 }
1410
1411 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1412 // RegBankSelect.
1413 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1414 .legalFor({{S32}, {S64}});
1415
1416 if (ST.hasVOP3PInsts()) {
1417 SextInReg.lowerFor({{V2S16}})
1418 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1419 // get more vector shift opportunities, since we'll get those when
1420 // expanded.
1421 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1422 } else if (ST.has16BitInsts()) {
1423 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1424 } else {
1425 // Prefer to promote to s32 before lowering if we don't have 16-bit
1426 // shifts. This avoid a lot of intermediate truncate and extend operations.
1427 SextInReg.lowerFor({{S32}, {S64}});
1428 }
1429
1430 // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
1431 // available, and is selectively legal for s16, s32, v2s16.
1432 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
1433 .scalarize(0)
1434 .clampScalar(0, S16, S32);
1435
1436 SextInReg
1437 .scalarize(0)
1438 .clampScalar(0, S32, S64)
1439 .lower();
1440
1441 getActionDefinitionsBuilder(G_FSHR)
1442 .legalFor({{S32, S32}})
1443 .scalarize(0)
1444 .lower();
1445
1446 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1447 .legalFor({S64});
1448
1449 getActionDefinitionsBuilder({
1450 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1451 G_FCOPYSIGN,
1452
1453 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1454 G_READ_REGISTER,
1455 G_WRITE_REGISTER,
1456
1457 G_SADDO, G_SSUBO,
1458
1459 // TODO: Implement
1460 G_FMINIMUM, G_FMAXIMUM,
1461 G_FSHL
1462 }).lower();
1463
1464 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1465 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1466 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1467 .unsupported();
1468
1469 computeTables();
1470 verify(*ST.getInstrInfo());
1471 }
1472
legalizeCustom(LegalizerHelper & Helper,MachineInstr & MI) const1473 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1474 MachineInstr &MI) const {
1475 MachineIRBuilder &B = Helper.MIRBuilder;
1476 MachineRegisterInfo &MRI = *B.getMRI();
1477 GISelChangeObserver &Observer = Helper.Observer;
1478
1479 switch (MI.getOpcode()) {
1480 case TargetOpcode::G_ADDRSPACE_CAST:
1481 return legalizeAddrSpaceCast(MI, MRI, B);
1482 case TargetOpcode::G_FRINT:
1483 return legalizeFrint(MI, MRI, B);
1484 case TargetOpcode::G_FCEIL:
1485 return legalizeFceil(MI, MRI, B);
1486 case TargetOpcode::G_INTRINSIC_TRUNC:
1487 return legalizeIntrinsicTrunc(MI, MRI, B);
1488 case TargetOpcode::G_SITOFP:
1489 return legalizeITOFP(MI, MRI, B, true);
1490 case TargetOpcode::G_UITOFP:
1491 return legalizeITOFP(MI, MRI, B, false);
1492 case TargetOpcode::G_FPTOSI:
1493 return legalizeFPTOI(MI, MRI, B, true);
1494 case TargetOpcode::G_FPTOUI:
1495 return legalizeFPTOI(MI, MRI, B, false);
1496 case TargetOpcode::G_FMINNUM:
1497 case TargetOpcode::G_FMAXNUM:
1498 case TargetOpcode::G_FMINNUM_IEEE:
1499 case TargetOpcode::G_FMAXNUM_IEEE:
1500 return legalizeMinNumMaxNum(Helper, MI);
1501 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1502 return legalizeExtractVectorElt(MI, MRI, B);
1503 case TargetOpcode::G_INSERT_VECTOR_ELT:
1504 return legalizeInsertVectorElt(MI, MRI, B);
1505 case TargetOpcode::G_SHUFFLE_VECTOR:
1506 return legalizeShuffleVector(MI, MRI, B);
1507 case TargetOpcode::G_FSIN:
1508 case TargetOpcode::G_FCOS:
1509 return legalizeSinCos(MI, MRI, B);
1510 case TargetOpcode::G_GLOBAL_VALUE:
1511 return legalizeGlobalValue(MI, MRI, B);
1512 case TargetOpcode::G_LOAD:
1513 return legalizeLoad(MI, MRI, B, Observer);
1514 case TargetOpcode::G_FMAD:
1515 return legalizeFMad(MI, MRI, B);
1516 case TargetOpcode::G_FDIV:
1517 return legalizeFDIV(MI, MRI, B);
1518 case TargetOpcode::G_UDIV:
1519 case TargetOpcode::G_UREM:
1520 return legalizeUDIV_UREM(MI, MRI, B);
1521 case TargetOpcode::G_SDIV:
1522 case TargetOpcode::G_SREM:
1523 return legalizeSDIV_SREM(MI, MRI, B);
1524 case TargetOpcode::G_ATOMIC_CMPXCHG:
1525 return legalizeAtomicCmpXChg(MI, MRI, B);
1526 case TargetOpcode::G_FLOG:
1527 return legalizeFlog(MI, B, numbers::ln2f);
1528 case TargetOpcode::G_FLOG10:
1529 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1530 case TargetOpcode::G_FEXP:
1531 return legalizeFExp(MI, B);
1532 case TargetOpcode::G_FPOW:
1533 return legalizeFPow(MI, B);
1534 case TargetOpcode::G_FFLOOR:
1535 return legalizeFFloor(MI, MRI, B);
1536 case TargetOpcode::G_BUILD_VECTOR:
1537 return legalizeBuildVector(MI, MRI, B);
1538 default:
1539 return false;
1540 }
1541
1542 llvm_unreachable("expected switch to return");
1543 }
1544
// Produce the 32-bit aperture (high half of a flat address) for the LDS or
// scratch segment, used when casting a local/private pointer to flat.
//
// \param AS  source address space; must be LOCAL_ADDRESS or PRIVATE_ADDRESS.
// \returns an s32 virtual register holding the aperture value, or an invalid
//          Register() if the queue pointer argument could not be loaded.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Subtarget exposes the aperture directly: read it out of the MEM_BASES
    // hardware register with S_GETREG.
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack hwreg id, field bit offset and field width (minus one) into the
    // S_GETREG immediate encoding.
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // S_GETREG returns the field in the low bits; shift it left by the field
    // width to place the aperture bits in their final position.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // No aperture registers: load the aperture from the queue descriptor
  // reachable via the kernel's queue pointer argument.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  // 4-byte invariant, dereferenceable load from the queue structure.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
1604
// Expand G_ADDRSPACE_CAST. Handles no-op casts (bitcast), casts to/from the
// 32-bit constant address space, and flat <-> local/private casts, which
// require null-pointer checks and (for segment -> flat) the aperture value.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    // Same representation in both address spaces: mutate in place to a
    // bitcast (no new instructions, MI stays in the function).
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Widen a 32-bit constant pointer: the high 32 bits come from the
    // function's known high address bits.
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // Map the flat null pointer to the segment's null value; any other
    // pointer keeps its low 32 bits.
    auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  // Remaining handled case is local/private -> flat; anything else is not
  // custom-legalized here.
  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
    B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
    B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  // High half of the flat address comes from the segment aperture.
  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
    B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  // Segment null maps to flat null.
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
1701
legalizeFrint(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1702 bool AMDGPULegalizerInfo::legalizeFrint(
1703 MachineInstr &MI, MachineRegisterInfo &MRI,
1704 MachineIRBuilder &B) const {
1705 Register Src = MI.getOperand(1).getReg();
1706 LLT Ty = MRI.getType(Src);
1707 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1708
1709 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1710 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1711
1712 auto C1 = B.buildFConstant(Ty, C1Val);
1713 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1714
1715 // TODO: Should this propagate fast-math-flags?
1716 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1717 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1718
1719 auto C2 = B.buildFConstant(Ty, C2Val);
1720 auto Fabs = B.buildFAbs(Ty, Src);
1721
1722 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1723 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1724 return true;
1725 }
1726
legalizeFceil(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1727 bool AMDGPULegalizerInfo::legalizeFceil(
1728 MachineInstr &MI, MachineRegisterInfo &MRI,
1729 MachineIRBuilder &B) const {
1730
1731 const LLT S1 = LLT::scalar(1);
1732 const LLT S64 = LLT::scalar(64);
1733
1734 Register Src = MI.getOperand(1).getReg();
1735 assert(MRI.getType(Src) == S64);
1736
1737 // result = trunc(src)
1738 // if (src > 0.0 && src != result)
1739 // result += 1.0
1740
1741 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1742
1743 const auto Zero = B.buildFConstant(S64, 0.0);
1744 const auto One = B.buildFConstant(S64, 1.0);
1745 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1746 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1747 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1748 auto Add = B.buildSelect(S64, And, One, Zero);
1749
1750 // TODO: Should this propagate fast-math-flags?
1751 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1752 return true;
1753 }
1754
extractF64Exponent(unsigned Hi,MachineIRBuilder & B)1755 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1756 MachineIRBuilder &B) {
1757 const unsigned FractBits = 52;
1758 const unsigned ExpBits = 11;
1759 LLT S32 = LLT::scalar(32);
1760
1761 auto Const0 = B.buildConstant(S32, FractBits - 32);
1762 auto Const1 = B.buildConstant(S32, ExpBits);
1763
1764 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1765 .addUse(Const0.getReg(0))
1766 .addUse(Const1.getReg(0));
1767
1768 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1769 }
1770
legalizeIntrinsicTrunc(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1771 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1772 MachineInstr &MI, MachineRegisterInfo &MRI,
1773 MachineIRBuilder &B) const {
1774 const LLT S1 = LLT::scalar(1);
1775 const LLT S32 = LLT::scalar(32);
1776 const LLT S64 = LLT::scalar(64);
1777
1778 Register Src = MI.getOperand(1).getReg();
1779 assert(MRI.getType(Src) == S64);
1780
1781 // TODO: Should this use extract since the low half is unused?
1782 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1783 Register Hi = Unmerge.getReg(1);
1784
1785 // Extract the upper half, since this is where we will find the sign and
1786 // exponent.
1787 auto Exp = extractF64Exponent(Hi, B);
1788
1789 const unsigned FractBits = 52;
1790
1791 // Extract the sign bit.
1792 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1793 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1794
1795 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1796
1797 const auto Zero32 = B.buildConstant(S32, 0);
1798
1799 // Extend back to 64-bits.
1800 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1801
1802 auto Shr = B.buildAShr(S64, FractMask, Exp);
1803 auto Not = B.buildNot(S64, Shr);
1804 auto Tmp0 = B.buildAnd(S64, Src, Not);
1805 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1806
1807 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1808 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1809
1810 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1811 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1812 return true;
1813 }
1814
legalizeITOFP(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,bool Signed) const1815 bool AMDGPULegalizerInfo::legalizeITOFP(
1816 MachineInstr &MI, MachineRegisterInfo &MRI,
1817 MachineIRBuilder &B, bool Signed) const {
1818
1819 Register Dst = MI.getOperand(0).getReg();
1820 Register Src = MI.getOperand(1).getReg();
1821
1822 const LLT S64 = LLT::scalar(64);
1823 const LLT S32 = LLT::scalar(32);
1824
1825 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1826
1827 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1828
1829 auto CvtHi = Signed ?
1830 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1831 B.buildUITOFP(S64, Unmerge.getReg(1));
1832
1833 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1834
1835 auto ThirtyTwo = B.buildConstant(S32, 32);
1836 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1837 .addUse(CvtHi.getReg(0))
1838 .addUse(ThirtyTwo.getReg(0));
1839
1840 // TODO: Should this propagate fast-math-flags?
1841 B.buildFAdd(Dst, LdExp, CvtLo);
1842 MI.eraseFromParent();
1843 return true;
1844 }
1845
1846 // TODO: Copied from DAG implementation. Verify logic and document how this
1847 // actually works.
legalizeFPTOI(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,bool Signed) const1848 bool AMDGPULegalizerInfo::legalizeFPTOI(
1849 MachineInstr &MI, MachineRegisterInfo &MRI,
1850 MachineIRBuilder &B, bool Signed) const {
1851
1852 Register Dst = MI.getOperand(0).getReg();
1853 Register Src = MI.getOperand(1).getReg();
1854
1855 const LLT S64 = LLT::scalar(64);
1856 const LLT S32 = LLT::scalar(32);
1857
1858 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1859
1860 unsigned Flags = MI.getFlags();
1861
1862 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1863 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1864 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1865
1866 auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1867 auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1868 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1869
1870 auto Hi = Signed ?
1871 B.buildFPTOSI(S32, FloorMul) :
1872 B.buildFPTOUI(S32, FloorMul);
1873 auto Lo = B.buildFPTOUI(S32, Fma);
1874
1875 B.buildMerge(Dst, { Lo, Hi });
1876 MI.eraseFromParent();
1877
1878 return true;
1879 }
1880
legalizeMinNumMaxNum(LegalizerHelper & Helper,MachineInstr & MI) const1881 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1882 MachineInstr &MI) const {
1883 MachineFunction &MF = Helper.MIRBuilder.getMF();
1884 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1885
1886 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1887 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1888
1889 // With ieee_mode disabled, the instructions have the correct behavior
1890 // already for G_FMINNUM/G_FMAXNUM
1891 if (!MFI->getMode().IEEE)
1892 return !IsIEEEOp;
1893
1894 if (IsIEEEOp)
1895 return true;
1896
1897 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1898 }
1899
legalizeExtractVectorElt(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1900 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1901 MachineInstr &MI, MachineRegisterInfo &MRI,
1902 MachineIRBuilder &B) const {
1903 // TODO: Should move some of this into LegalizerHelper.
1904
1905 // TODO: Promote dynamic indexing of s16 to s32
1906
1907 // FIXME: Artifact combiner probably should have replaced the truncated
1908 // constant before this, so we shouldn't need
1909 // getConstantVRegValWithLookThrough.
1910 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1911 MI.getOperand(2).getReg(), MRI);
1912 if (!IdxVal) // Dynamic case will be selected to register indexing.
1913 return true;
1914
1915 Register Dst = MI.getOperand(0).getReg();
1916 Register Vec = MI.getOperand(1).getReg();
1917
1918 LLT VecTy = MRI.getType(Vec);
1919 LLT EltTy = VecTy.getElementType();
1920 assert(EltTy == MRI.getType(Dst));
1921
1922 if (IdxVal->Value < VecTy.getNumElements())
1923 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1924 else
1925 B.buildUndef(Dst);
1926
1927 MI.eraseFromParent();
1928 return true;
1929 }
1930
legalizeInsertVectorElt(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1931 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1932 MachineInstr &MI, MachineRegisterInfo &MRI,
1933 MachineIRBuilder &B) const {
1934 // TODO: Should move some of this into LegalizerHelper.
1935
1936 // TODO: Promote dynamic indexing of s16 to s32
1937
1938 // FIXME: Artifact combiner probably should have replaced the truncated
1939 // constant before this, so we shouldn't need
1940 // getConstantVRegValWithLookThrough.
1941 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1942 MI.getOperand(3).getReg(), MRI);
1943 if (!IdxVal) // Dynamic case will be selected to register indexing.
1944 return true;
1945
1946 Register Dst = MI.getOperand(0).getReg();
1947 Register Vec = MI.getOperand(1).getReg();
1948 Register Ins = MI.getOperand(2).getReg();
1949
1950 LLT VecTy = MRI.getType(Vec);
1951 LLT EltTy = VecTy.getElementType();
1952 assert(EltTy == MRI.getType(Ins));
1953
1954 if (IdxVal->Value < VecTy.getNumElements())
1955 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1956 else
1957 B.buildUndef(Dst);
1958
1959 MI.eraseFromParent();
1960 return true;
1961 }
1962
legalizeShuffleVector(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1963 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1964 MachineInstr &MI, MachineRegisterInfo &MRI,
1965 MachineIRBuilder &B) const {
1966 const LLT V2S16 = LLT::vector(2, 16);
1967
1968 Register Dst = MI.getOperand(0).getReg();
1969 Register Src0 = MI.getOperand(1).getReg();
1970 LLT DstTy = MRI.getType(Dst);
1971 LLT SrcTy = MRI.getType(Src0);
1972
1973 if (SrcTy == V2S16 && DstTy == V2S16 &&
1974 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1975 return true;
1976
1977 MachineIRBuilder HelperBuilder(MI);
1978 GISelObserverWrapper DummyObserver;
1979 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1980 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1981 }
1982
legalizeSinCos(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1983 bool AMDGPULegalizerInfo::legalizeSinCos(
1984 MachineInstr &MI, MachineRegisterInfo &MRI,
1985 MachineIRBuilder &B) const {
1986
1987 Register DstReg = MI.getOperand(0).getReg();
1988 Register SrcReg = MI.getOperand(1).getReg();
1989 LLT Ty = MRI.getType(DstReg);
1990 unsigned Flags = MI.getFlags();
1991
1992 Register TrigVal;
1993 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1994 if (ST.hasTrigReducedRange()) {
1995 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1996 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1997 .addUse(MulVal.getReg(0))
1998 .setMIFlags(Flags).getReg(0);
1999 } else
2000 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2001
2002 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2003 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2004 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2005 .addUse(TrigVal)
2006 .setMIFlags(Flags);
2007 MI.eraseFromParent();
2008 return true;
2009 }
2010
// Emit the SI_PC_ADD_REL_OFFSET sequence that materializes the address of
// \p GV plus \p Offset into \p DstReg.
//
// \p PtrTy is the type of \p DstReg; when it is 32 bits wide, the 64-bit
// pc-relative result is computed into a temporary and the low half is
// extracted into \p DstReg.
// \p GAFlags selects the relocation variant (MO_NONE for a constant-address
// fixup; otherwise the {gotpc}rel32 lo/hi pair described below).
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // SI_PC_ADD_REL_OFFSET always produces a 64-bit value; for a 32-bit
  // destination, compute into a temporary and extract the low half below.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    // Second operand carries the paired relocation flag (GAFlags + 1 — the
    // @hi half per the sequence above).
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
2068
// Custom legalization for G_GLOBAL_VALUE. LDS/region globals are resolved to
// their statically assigned offsets (or diagnosed); other address spaces are
// lowered to a pc-relative sequence or a GOT load, as chosen by the target
// lowering.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        // Tag the operand and keep the instruction for later selection.
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      // The "address" of an LDS global is simply its assigned offset within
      // the function's LDS allocation.
      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    // LDS globals with initializers are not supported; diagnose and leave
    // the instruction in place.
    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise, load the 64-bit address of the global out of the GOT, whose
  // entry is itself addressed pc-relatively.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2156
legalizeLoad(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,GISelChangeObserver & Observer) const2157 bool AMDGPULegalizerInfo::legalizeLoad(
2158 MachineInstr &MI, MachineRegisterInfo &MRI,
2159 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2160 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2161 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2162 Observer.changingInstr(MI);
2163 MI.getOperand(1).setReg(Cast.getReg(0));
2164 Observer.changedInstr(MI);
2165 return true;
2166 }
2167
legalizeFMad(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2168 bool AMDGPULegalizerInfo::legalizeFMad(
2169 MachineInstr &MI, MachineRegisterInfo &MRI,
2170 MachineIRBuilder &B) const {
2171 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2172 assert(Ty.isScalar());
2173
2174 MachineFunction &MF = B.getMF();
2175 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2176
2177 // TODO: Always legal with future ftz flag.
2178 // FIXME: Do we need just output?
2179 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2180 return true;
2181 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2182 return true;
2183
2184 MachineIRBuilder HelperBuilder(MI);
2185 GISelObserverWrapper DummyObserver;
2186 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2187 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2188 }
2189
legalizeAtomicCmpXChg(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2190 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2191 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2192 Register DstReg = MI.getOperand(0).getReg();
2193 Register PtrReg = MI.getOperand(1).getReg();
2194 Register CmpVal = MI.getOperand(2).getReg();
2195 Register NewVal = MI.getOperand(3).getReg();
2196
2197 assert(SITargetLowering::isFlatGlobalAddrSpace(
2198 MRI.getType(PtrReg).getAddressSpace()) &&
2199 "this should not have been custom lowered");
2200
2201 LLT ValTy = MRI.getType(CmpVal);
2202 LLT VecTy = LLT::vector(2, ValTy);
2203
2204 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2205
2206 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2207 .addDef(DstReg)
2208 .addUse(PtrReg)
2209 .addUse(PackedVal)
2210 .setMemRefs(MI.memoperands());
2211
2212 MI.eraseFromParent();
2213 return true;
2214 }
2215
legalizeFlog(MachineInstr & MI,MachineIRBuilder & B,double Log2BaseInverted) const2216 bool AMDGPULegalizerInfo::legalizeFlog(
2217 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2218 Register Dst = MI.getOperand(0).getReg();
2219 Register Src = MI.getOperand(1).getReg();
2220 LLT Ty = B.getMRI()->getType(Dst);
2221 unsigned Flags = MI.getFlags();
2222
2223 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2224 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2225
2226 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2227 MI.eraseFromParent();
2228 return true;
2229 }
2230
legalizeFExp(MachineInstr & MI,MachineIRBuilder & B) const2231 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2232 MachineIRBuilder &B) const {
2233 Register Dst = MI.getOperand(0).getReg();
2234 Register Src = MI.getOperand(1).getReg();
2235 unsigned Flags = MI.getFlags();
2236 LLT Ty = B.getMRI()->getType(Dst);
2237
2238 auto K = B.buildFConstant(Ty, numbers::log2e);
2239 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2240 B.buildFExp2(Dst, Mul, Flags);
2241 MI.eraseFromParent();
2242 return true;
2243 }
2244
legalizeFPow(MachineInstr & MI,MachineIRBuilder & B) const2245 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2246 MachineIRBuilder &B) const {
2247 Register Dst = MI.getOperand(0).getReg();
2248 Register Src0 = MI.getOperand(1).getReg();
2249 Register Src1 = MI.getOperand(2).getReg();
2250 unsigned Flags = MI.getFlags();
2251 LLT Ty = B.getMRI()->getType(Dst);
2252 const LLT S16 = LLT::scalar(16);
2253 const LLT S32 = LLT::scalar(32);
2254
2255 if (Ty == S32) {
2256 auto Log = B.buildFLog2(S32, Src0, Flags);
2257 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2258 .addUse(Log.getReg(0))
2259 .addUse(Src1)
2260 .setMIFlags(Flags);
2261 B.buildFExp2(Dst, Mul, Flags);
2262 } else if (Ty == S16) {
2263 // There's no f16 fmul_legacy, so we need to convert for it.
2264 auto Log = B.buildFLog2(S16, Src0, Flags);
2265 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2266 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2267 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2268 .addUse(Ext0.getReg(0))
2269 .addUse(Ext1.getReg(0))
2270 .setMIFlags(Flags);
2271
2272 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2273 } else
2274 return false;
2275
2276 MI.eraseFromParent();
2277 return true;
2278 }
2279
2280 // Find a source register, ignoring any possible source modifiers.
stripAnySourceMods(Register OrigSrc,MachineRegisterInfo & MRI)2281 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2282 Register ModSrc = OrigSrc;
2283 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2284 ModSrc = SrcFNeg->getOperand(1).getReg();
2285 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2286 ModSrc = SrcFAbs->getOperand(1).getReg();
2287 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2288 ModSrc = SrcFAbs->getOperand(1).getReg();
2289 return ModSrc;
2290 }
2291
legalizeFFloor(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2292 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2293 MachineRegisterInfo &MRI,
2294 MachineIRBuilder &B) const {
2295
2296 const LLT S1 = LLT::scalar(1);
2297 const LLT S64 = LLT::scalar(64);
2298 Register Dst = MI.getOperand(0).getReg();
2299 Register OrigSrc = MI.getOperand(1).getReg();
2300 unsigned Flags = MI.getFlags();
2301 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2302 "this should not have been custom lowered");
2303
2304 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2305 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2306 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2307 // V_FRACT bug is:
2308 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2309 //
2310 // Convert floor(x) to (x - fract(x))
2311
2312 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2313 .addUse(OrigSrc)
2314 .setMIFlags(Flags);
2315
2316 // Give source modifier matching some assistance before obscuring a foldable
2317 // pattern.
2318
2319 // TODO: We can avoid the neg on the fract? The input sign to fract
2320 // shouldn't matter?
2321 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2322
2323 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2324
2325 Register Min = MRI.createGenericVirtualRegister(S64);
2326
2327 // We don't need to concern ourselves with the snan handling difference, so
2328 // use the one which will directly select.
2329 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2330 if (MFI->getMode().IEEE)
2331 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2332 else
2333 B.buildFMinNum(Min, Fract, Const, Flags);
2334
2335 Register CorrectedFract = Min;
2336 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2337 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2338 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2339 }
2340
2341 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2342 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2343
2344 MI.eraseFromParent();
2345 return true;
2346 }
2347
2348 // Turn an illegal packed v2s16 build vector into bit operations.
2349 // TODO: This should probably be a bitcast action in LegalizerHelper.
legalizeBuildVector(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2350 bool AMDGPULegalizerInfo::legalizeBuildVector(
2351 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2352 Register Dst = MI.getOperand(0).getReg();
2353 const LLT S32 = LLT::scalar(32);
2354 assert(MRI.getType(Dst) == LLT::vector(2, 16));
2355
2356 Register Src0 = MI.getOperand(1).getReg();
2357 Register Src1 = MI.getOperand(2).getReg();
2358 assert(MRI.getType(Src0) == LLT::scalar(16));
2359
2360 auto Merge = B.buildMerge(S32, {Src0, Src1});
2361 B.buildBitcast(Dst, Merge);
2362
2363 MI.eraseFromParent();
2364 return true;
2365 }
2366
2367 // Return the use branch instruction, otherwise null if the usage is invalid.
verifyCFIntrinsic(MachineInstr & MI,MachineRegisterInfo & MRI,MachineInstr * & Br,MachineBasicBlock * & UncondBrTarget)2368 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2369 MachineRegisterInfo &MRI,
2370 MachineInstr *&Br,
2371 MachineBasicBlock *&UncondBrTarget) {
2372 Register CondDef = MI.getOperand(0).getReg();
2373 if (!MRI.hasOneNonDBGUse(CondDef))
2374 return nullptr;
2375
2376 MachineBasicBlock *Parent = MI.getParent();
2377 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2378 if (UseMI.getParent() != Parent ||
2379 UseMI.getOpcode() != AMDGPU::G_BRCOND)
2380 return nullptr;
2381
2382 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2383 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2384 if (Next == Parent->end()) {
2385 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2386 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2387 return nullptr;
2388 UncondBrTarget = &*NextMBB;
2389 } else {
2390 if (Next->getOpcode() != AMDGPU::G_BR)
2391 return nullptr;
2392 Br = &*Next;
2393 UncondBrTarget = Br->getOperand(0).getMBB();
2394 }
2395
2396 return &UseMI;
2397 }
2398
insertLiveInCopy(MachineIRBuilder & B,MachineRegisterInfo & MRI,Register LiveIn,Register PhyReg) const2399 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2400 MachineRegisterInfo &MRI,
2401 Register LiveIn,
2402 Register PhyReg) const {
2403 assert(PhyReg.isPhysical() && "Physical register expected");
2404
2405 // Insert the live-in copy, if required, by defining destination virtual
2406 // register.
2407 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2408 if (!MRI.getVRegDef(LiveIn)) {
2409 // FIXME: Should have scoped insert pt
2410 MachineBasicBlock &OrigInsBB = B.getMBB();
2411 auto OrigInsPt = B.getInsertPt();
2412
2413 MachineBasicBlock &EntryMBB = B.getMF().front();
2414 EntryMBB.addLiveIn(PhyReg);
2415 B.setInsertPt(EntryMBB, EntryMBB.begin());
2416 B.buildCopy(LiveIn, PhyReg);
2417
2418 B.setInsertPt(OrigInsBB, OrigInsPt);
2419 }
2420
2421 return LiveIn;
2422 }
2423
getLiveInRegister(MachineIRBuilder & B,MachineRegisterInfo & MRI,Register PhyReg,LLT Ty,bool InsertLiveInCopy) const2424 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2425 MachineRegisterInfo &MRI,
2426 Register PhyReg, LLT Ty,
2427 bool InsertLiveInCopy) const {
2428 assert(PhyReg.isPhysical() && "Physical register expected");
2429
2430 // Get or create virtual live-in regester
2431 Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2432 if (!LiveIn) {
2433 LiveIn = MRI.createGenericVirtualRegister(Ty);
2434 MRI.addLiveIn(PhyReg, LiveIn);
2435 }
2436
2437 // When the actual true copy required is from virtual register to physical
2438 // register (to be inserted later), live-in copy insertion from physical
2439 // to register virtual register is not required
2440 if (!InsertLiveInCopy)
2441 return LiveIn;
2442
2443 return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2444 }
2445
getArgDescriptor(MachineIRBuilder & B,AMDGPUFunctionArgInfo::PreloadedValue ArgType) const2446 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2447 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2448 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2449 const ArgDescriptor *Arg;
2450 const TargetRegisterClass *RC;
2451 LLT ArgTy;
2452 std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2453 if (!Arg) {
2454 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2455 return nullptr;
2456 }
2457 return Arg;
2458 }
2459
loadInputValue(Register DstReg,MachineIRBuilder & B,const ArgDescriptor * Arg) const2460 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2461 const ArgDescriptor *Arg) const {
2462 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2463 return false; // TODO: Handle these
2464
2465 Register SrcReg = Arg->getRegister();
2466 assert(SrcReg.isPhysical() && "Physical register expected");
2467 assert(DstReg.isVirtual() && "Virtual register expected");
2468
2469 MachineRegisterInfo &MRI = *B.getMRI();
2470
2471 LLT Ty = MRI.getType(DstReg);
2472 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2473
2474 if (Arg->isMasked()) {
2475 // TODO: Should we try to emit this once in the entry block?
2476 const LLT S32 = LLT::scalar(32);
2477 const unsigned Mask = Arg->getMask();
2478 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2479
2480 Register AndMaskSrc = LiveIn;
2481
2482 if (Shift != 0) {
2483 auto ShiftAmt = B.buildConstant(S32, Shift);
2484 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2485 }
2486
2487 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2488 } else {
2489 B.buildCopy(DstReg, LiveIn);
2490 }
2491
2492 return true;
2493 }
2494
legalizePreloadedArgIntrin(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,AMDGPUFunctionArgInfo::PreloadedValue ArgType) const2495 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2496 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2497 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2498
2499 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2500 if (!Arg)
2501 return false;
2502
2503 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2504 return false;
2505
2506 MI.eraseFromParent();
2507 return true;
2508 }
2509
legalizeFDIV(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2510 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2511 MachineRegisterInfo &MRI,
2512 MachineIRBuilder &B) const {
2513 Register Dst = MI.getOperand(0).getReg();
2514 LLT DstTy = MRI.getType(Dst);
2515 LLT S16 = LLT::scalar(16);
2516 LLT S32 = LLT::scalar(32);
2517 LLT S64 = LLT::scalar(64);
2518
2519 if (legalizeFastUnsafeFDIV(MI, MRI, B))
2520 return true;
2521
2522 if (DstTy == S16)
2523 return legalizeFDIV16(MI, MRI, B);
2524 if (DstTy == S32)
2525 return legalizeFDIV32(MI, MRI, B);
2526 if (DstTy == S64)
2527 return legalizeFDIV64(MI, MRI, B);
2528
2529 return false;
2530 }
2531
// Expand unsigned 32-bit X / Y (when IsDiv) or X % Y (when !IsDiv) into
// DstReg, using a floating-point reciprocal estimate of Y, one
// Newton-Raphson refinement, and two quotient/remainder correction steps.
void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register X,
                                                  Register Y,
                                                  bool IsDiv) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y), scaled to a 32-bit fixed-point value.
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR: z += z * mulhi(z, -y * z).
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate: q = mulhi(x, z), r = x - q * y.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement: if r >= y then ++q, r -= y.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement (the final select writes the
  // requested result directly into DstReg). Two corrections are what
  // expandDivRem32 performs for this estimate.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}
2573
legalizeUDIV_UREM32(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2574 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2575 MachineRegisterInfo &MRI,
2576 MachineIRBuilder &B) const {
2577 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2578 Register DstReg = MI.getOperand(0).getReg();
2579 Register Num = MI.getOperand(1).getReg();
2580 Register Den = MI.getOperand(2).getReg();
2581 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2582 MI.eraseFromParent();
2583 return true;
2584 }
2585
2586 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2587 //
2588 // Return lo, hi of result
2589 //
2590 // %cvt.lo = G_UITOFP Val.lo
2591 // %cvt.hi = G_UITOFP Val.hi
2592 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2593 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2594 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2595 // %mul2 = G_FMUL %mul1, 2**(-32)
2596 // %trunc = G_INTRINSIC_TRUNC %mul2
2597 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2598 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  // Split the 64-bit input into its 32-bit halves.
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  // Combine into a single float value: hi * 2**32 + lo.
  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  // Scale the reciprocal (constant from the expansion documented above).
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32): bring the scaled value down to recover the high word.
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32): subtract the high word's contribution to isolate the low word.
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  // Convert both words back to integers: {lo, hi} of the reciprocal.
  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
2628
// Expand 64-bit unsigned division/remainder using a refined reciprocal
// estimate of the denominator, followed by up to two conditional correction
// steps on the candidate quotient/remainder.
void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  // Initial floating-point-based reciprocal estimate, as two 32-bit halves.
  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement: Rcp += umulh(Rcp, -Denom * Rcp).
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  // High half computed without the carry-in; consumed by the second step.
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement step on the improved estimate.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Candidate quotient: MulHi3 = umulh(Numer, refined reciprocal).
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  // Candidate remainder: Sub1 = Numer - Denom * quotient.
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // Middle term without the borrow-in; consumed by the correction below.
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 is a 64-bit-compare (Sub1 >= Denom) built from the 32-bit halves:
  // compare lo halves when the hi halves are equal, otherwise the hi compare.
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  // First correction: remainder -= Denom, quotient += 1.
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 != 0 iff the once-corrected remainder is still >= Denom.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  // Second correction: remainder -= Denom, quotient += 1 again.
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Select the final quotient (or remainder) based on how many correction
  // steps applied.
  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }
}
2741
legalizeUDIV_UREM(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2742 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2743 MachineRegisterInfo &MRI,
2744 MachineIRBuilder &B) const {
2745 const LLT S64 = LLT::scalar(64);
2746 const LLT S32 = LLT::scalar(32);
2747 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2748 Register DstReg = MI.getOperand(0).getReg();
2749 Register Num = MI.getOperand(1).getReg();
2750 Register Den = MI.getOperand(2).getReg();
2751 LLT Ty = MRI.getType(DstReg);
2752
2753 if (Ty == S32)
2754 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2755 else if (Ty == S64)
2756 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2757 else
2758 return false;
2759
2760 MI.eraseFromParent();
2761 return true;
2762
2763 }
2764
legalizeSDIV_SREM(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2765 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2766 MachineRegisterInfo &MRI,
2767 MachineIRBuilder &B) const {
2768 const LLT S64 = LLT::scalar(64);
2769 const LLT S32 = LLT::scalar(32);
2770
2771 Register DstReg = MI.getOperand(0).getReg();
2772 const LLT Ty = MRI.getType(DstReg);
2773 if (Ty != S32 && Ty != S64)
2774 return false;
2775
2776 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2777
2778 Register LHS = MI.getOperand(1).getReg();
2779 Register RHS = MI.getOperand(2).getReg();
2780
2781 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2782 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2783 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2784
2785 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2786 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2787
2788 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2789 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2790
2791 Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2792 if (Ty == S32)
2793 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2794 else
2795 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2796
2797 Register Sign;
2798 if (IsDiv)
2799 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2800 else
2801 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2802
2803 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2804 B.buildSub(DstReg, UDivRem, Sign);
2805
2806 MI.eraseFromParent();
2807 return true;
2808 }
2809
// Lower G_FDIV to a reciprocal-based sequence when fast/unsafe math permits.
// Returns false when the full-precision expansion must be used instead.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  // Either the global unsafe-math option or a per-instruction arcp flag
  // allows the approximate-reciprocal form.
  bool Unsafe =
      MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  // f64 only takes this path under the global unsafe-math option.
  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  // Refuse the fast f32 path when FP32 denormals are enabled and precision
  // must be preserved.
  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
    return false;

  // Special-case a constant +/-1.0 numerator: a single rcp suffices even
  // without the general unsafe permission.
  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2870
legalizeFDIV16(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2871 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2872 MachineRegisterInfo &MRI,
2873 MachineIRBuilder &B) const {
2874 Register Res = MI.getOperand(0).getReg();
2875 Register LHS = MI.getOperand(1).getReg();
2876 Register RHS = MI.getOperand(2).getReg();
2877
2878 uint16_t Flags = MI.getFlags();
2879
2880 LLT S16 = LLT::scalar(16);
2881 LLT S32 = LLT::scalar(32);
2882
2883 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2884 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2885
2886 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2887 .addUse(RHSExt.getReg(0))
2888 .setMIFlags(Flags);
2889
2890 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2891 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2892
2893 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2894 .addUse(RDst.getReg(0))
2895 .addUse(RHS)
2896 .addUse(LHS)
2897 .setMIFlags(Flags);
2898
2899 MI.eraseFromParent();
2900 return true;
2901 }
2902
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, restore the function's
// default FP32 denorm setting taken from 'Mode'.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // S_DENORM_MODE immediate: FP32 bits in [1:0], FP64/FP16 bits in [3:2].
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register: MODE hwreg, 2-bit field at
    // offset 4 (width - 1 == 1).
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
2932
// Full-precision f32 division using the div_scale / rcp-refinement /
// div_fmas / div_fixup sequence, temporarily enabling FP32 denormals when
// the function's mode has them flushed.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale pre-scales the operands; the trailing immediate selects which
  // operand (0 = denominator, 1 = numerator) this invocation scales. The S1
  // result is the scale flag later consumed by div_fmas.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                     .addUse(DenominatorScaled.getReg(0))
                     .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Refine the reciprocal and the quotient estimate with FMA steps.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // Restore the function's denorm mode before the final steps.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // div_fixup handles the special cases (infinities, zero denominator, etc.)
  // using the original operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
2998
// Full-precision f64 division via div_scale / rcp refinement / div_fmas /
// div_fixup, mirroring the f32 expansion above.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Scaled denominator (immediate 0 selects the denominator).
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  // Refine the reciprocal estimate with FMA steps.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  // Scaled numerator (immediate 1 selects the numerator).
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    // Recompute the scale flag by comparing the high 32-bit words of the
    // operands against those of the scaled values.
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // div_fixup handles the special cases using the original operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3076
legalizeFDIVFastIntrin(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const3077 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3078 MachineRegisterInfo &MRI,
3079 MachineIRBuilder &B) const {
3080 Register Res = MI.getOperand(0).getReg();
3081 Register LHS = MI.getOperand(2).getReg();
3082 Register RHS = MI.getOperand(3).getReg();
3083 uint16_t Flags = MI.getFlags();
3084
3085 LLT S32 = LLT::scalar(32);
3086 LLT S1 = LLT::scalar(1);
3087
3088 auto Abs = B.buildFAbs(S32, RHS, Flags);
3089 const APFloat C0Val(1.0f);
3090
3091 auto C0 = B.buildConstant(S32, 0x6f800000);
3092 auto C1 = B.buildConstant(S32, 0x2f800000);
3093 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3094
3095 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3096 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3097
3098 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3099
3100 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3101 .addUse(Mul0.getReg(0))
3102 .setMIFlags(Flags);
3103
3104 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3105
3106 B.buildFMul(Res, Sel, Mul1, Flags);
3107
3108 MI.eraseFromParent();
3109 return true;
3110 }
3111
// Lower the implicit-arg-pointer intrinsic. Entry functions compute it as
// the kernarg segment pointer plus the offset of the first implicit
// argument; other functions receive it as a preloaded input register.
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    // Non-entry functions: forward to the generic preloaded-argument path.
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  // Byte offset of the first implicit parameter past the explicit kernargs.
  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  // Index type for the pointer add must match the pointer width.
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  LLT ArgTy;
  std::tie(Arg, RC, ArgTy) =
    MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}
3144
legalizeIsAddrSpace(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,unsigned AddrSpace) const3145 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3146 MachineRegisterInfo &MRI,
3147 MachineIRBuilder &B,
3148 unsigned AddrSpace) const {
3149 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3150 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3151 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3152 MI.eraseFromParent();
3153 return true;
3154 }
3155
3156 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3157 // offset (the offset that is included in bounds checking and swizzling, to be
3158 // split between the instruction's voffset and immoffset fields) and soffset
3159 // (the offset that is excluded from bounds checking and swizzling, to go in
3160 // the instruction's soffset field). This function takes the first kind of
3161 // offset and figures out how to split it between voffset and immoffset.
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value that fits in the instruction's 12-bit immediate field.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;
  const LLT S32 = LLT::scalar(32);

  // Peel any constant component off the offset computation.
  std::tie(BaseReg, TotalConstOffset, OffsetDef)
    = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold any overflow back into the variable (voffset) component.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The voffset register is always needed; materialize a zero if unused.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}
3204
3205 /// Handle register layout difference for f16 images for some subtargets.
handleD16VData(MachineIRBuilder & B,MachineRegisterInfo & MRI,Register Reg) const3206 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3207 MachineRegisterInfo &MRI,
3208 Register Reg) const {
3209 if (!ST.hasUnpackedD16VMem())
3210 return Reg;
3211
3212 const LLT S16 = LLT::scalar(16);
3213 const LLT S32 = LLT::scalar(32);
3214 LLT StoreVT = MRI.getType(Reg);
3215 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3216
3217 auto Unmerge = B.buildUnmerge(S16, Reg);
3218
3219 SmallVector<Register, 4> WideRegs;
3220 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3221 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3222
3223 int NumElts = StoreVT.getNumElements();
3224
3225 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3226 }
3227
fixStoreSourceType(MachineIRBuilder & B,Register VData,bool IsFormat) const3228 Register AMDGPULegalizerInfo::fixStoreSourceType(
3229 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3230 MachineRegisterInfo *MRI = B.getMRI();
3231 LLT Ty = MRI->getType(VData);
3232
3233 const LLT S16 = LLT::scalar(16);
3234
3235 // Fixup illegal register types for i8 stores.
3236 if (Ty == LLT::scalar(8) || Ty == S16) {
3237 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3238 return AnyExt;
3239 }
3240
3241 if (Ty.isVector()) {
3242 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3243 if (IsFormat)
3244 return handleD16VData(B, *MRI, VData);
3245 }
3246 }
3247
3248 return VData;
3249 }
3250
// Lower raw/struct buffer store intrinsics (plain, format, and typed
// variants) to the generic buffer-store pseudos, splitting the offset into
// voffset + immediate parts.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  // Widen/repack the source if its type is not directly storable.
  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;
  unsigned TotalOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Split the offset and fold the constant part into the memory operand.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  // Choose the pseudo for this variant and (for plain stores) the access
  // size; sub-dword stores get dedicated byte/short opcodes.
  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  // The pseudo always takes a vindex operand; raw forms use a zero.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3340
// Lower raw/struct buffer load intrinsics (plain, format, and typed
// variants) to the generic buffer-load pseudos, splitting the offset and
// widening/repacking the result type where needed.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the offset and fold the constant part into the memory operand.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  // Choose the pseudo for this variant; sub-dword plain loads get dedicated
  // zero-extending byte/short opcodes.
  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // The pseudo produces a wider register than the requested result for
  // sub-dword scalars and, on unpacked-d16 targets, for 16-bit vectors.
  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  // The pseudo always takes a vindex operand; raw forms use a zero.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  // If the load was emitted into a temporary wide register, convert it back
  // to the requested type after the load.
  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result for extending loads was widened; truncate back down.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
3460
legalizeAtomicIncDec(MachineInstr & MI,MachineIRBuilder & B,bool IsInc) const3461 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3462 MachineIRBuilder &B,
3463 bool IsInc) const {
3464 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3465 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3466 B.buildInstr(Opc)
3467 .addDef(MI.getOperand(0).getReg())
3468 .addUse(MI.getOperand(2).getReg())
3469 .addUse(MI.getOperand(3).getReg())
3470 .cloneMemRefs(MI);
3471 MI.eraseFromParent();
3472 return true;
3473 }
3474
// Map a raw/struct buffer atomic intrinsic ID to the corresponding generic
// target pseudo opcode (raw and struct variants share a pseudo; the operand
// layout distinguishes them).
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
3520
/// Legalize a raw/struct buffer atomic intrinsic into its
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo with the normalized operand layout:
///   dst, vdata, [cmp (cmpswap only)], rsrc, vindex, voffset, soffset,
///   offset(imm), cachepolicy(imm), idxen(imm).
/// The raw variants (no vindex operand) get a constant-0 vindex so both
/// forms share one pseudo, distinguished by the idxen immediate.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  // Operand 1 is the intrinsic ID; the value operands start at index 2.
  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  // OpOffset tracks the extra operands consumed so far (cmp value, vindex),
  // shifting the indices of everything that follows.
  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the voffset into an immediate part that fits the instruction's
  // offset field and a remaining register part; fold any constant portion
  // into the memory operand's offset so alias info stays accurate.
  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  // Raw form: synthesize a zero vindex.
  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3583
/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
/// vector with s16 typed elements.
///
/// Operands in [\p AddrIdx, \p DimIdx) are pre-packed bias/offset-style
/// arguments and are simply bitcast to v2s16; operands from \p DimIdx to
/// \p EndIdx are coordinate/gradient components that get paired two at a
/// time into v2s16 build_vectors, padding with undef where a component
/// must not share a register with its neighbor.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int EndIdx,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < EndIdx; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      // Extra arguments before the dim coordinates are already dword sized;
      // just reinterpret them as v2s16.
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this component with the next one and skip the latter.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
3624
3625 /// Convert from separate vaddr components to a single vector address register,
3626 /// and replace the remaining operands with $noreg.
convertImageAddrToPacked(MachineIRBuilder & B,MachineInstr & MI,int DimIdx,int NumVAddrs)3627 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3628 int DimIdx, int NumVAddrs) {
3629 const LLT S32 = LLT::scalar(32);
3630
3631 SmallVector<Register, 8> AddrRegs;
3632 for (int I = 0; I != NumVAddrs; ++I) {
3633 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3634 if (SrcOp.isReg()) {
3635 AddrRegs.push_back(SrcOp.getReg());
3636 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3637 }
3638 }
3639
3640 int NumAddrRegs = AddrRegs.size();
3641 if (NumAddrRegs != 1) {
3642 // Round up to 8 elements for v5-v7
3643 // FIXME: Missing intermediate sized register classes and instructions.
3644 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3645 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3646 auto Undef = B.buildUndef(S32);
3647 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3648 NumAddrRegs = RoundedNumRegs;
3649 }
3650
3651 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3652 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3653 }
3654
3655 for (int I = 1; I != NumVAddrs; ++I) {
3656 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3657 if (SrcOp.isReg())
3658 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3659 }
3660 }
3661
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  // A second def means the texture-fail-enable (TFE) status result is present.
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  // Atomics carry no dmask operand.
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      // Gather4 always returns 4 components regardless of dmask.
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on - and dmask is 0 Force
  // dmask to be at least 1 otherwise the instruction will fail
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      // Cmpswap-style atomics take two data values; merge them into one
      // packed vector operand and blank out the second.
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  // Number of address components after the _L/_LZ and _mip optimizations
  // below; used to decide between NSA and packed encodings.
  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead or replacing
        // it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16 || IsG16) {
    if (IsA16) {
      // Target must support the feature and gradients need to be 16 bit too
      if (!ST.hasA16() || !IsG16)
        return false;
    } else if (!ST.hasG16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      // Don't compress addresses for G16
      const int PackEndIdx =
          IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
                                  PackEndIdx, NumGradients);

      if (!IsA16) {
        // Add uncompressed address
        for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
          int AddrReg = MI.getOperand(I).getReg();
          assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
          PackedRegs.push_back(AddrReg);
        }
      }

      // See also below in the non-a16 branch
      const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();

      if (!UseNSA && PackedRegs.size() > 1) {
        // Without NSA, all address registers must be contiguous, so merge
        // them into a single wide vector operand.
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      // Write the packed registers back over the original address operands,
      // blanking any leftovers with $noreg.
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

    if (!UseNSA && NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  // Record the a16/g16 mode as an extra immediate for the selector.
  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    // Repack d16 store data into the subtarget's expected layout.
    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  // From here on we are fixing up the result of a load.
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  // Append NumElts copies of an undef register of type Ty to ResultRegs.
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    // Build a v6s16 concat and unmerge it into the v3s16 result plus a dead
    // v3s16 remainder.
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
4054
legalizeSBufferLoad(MachineInstr & MI,MachineIRBuilder & B,GISelChangeObserver & Observer) const4055 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4056 MachineInstr &MI, MachineIRBuilder &B,
4057 GISelChangeObserver &Observer) const {
4058 Register Dst = MI.getOperand(0).getReg();
4059 LLT Ty = B.getMRI()->getType(Dst);
4060 unsigned Size = Ty.getSizeInBits();
4061 MachineFunction &MF = B.getMF();
4062
4063 Observer.changingInstr(MI);
4064
4065 // FIXME: We don't really need this intermediate instruction. The intrinsic
4066 // should be fixed to have a memory operand. Since it's readnone, we're not
4067 // allowed to add one.
4068 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4069 MI.RemoveOperand(1); // Remove intrinsic ID
4070
4071 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4072 // TODO: Should this use datalayout alignment?
4073 const unsigned MemSize = (Size + 7) / 8;
4074 const Align MemAlign(4);
4075 MachineMemOperand *MMO = MF.getMachineMemOperand(
4076 MachinePointerInfo(),
4077 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4078 MachineMemOperand::MOInvariant,
4079 MemSize, MemAlign);
4080 MI.addMemOperand(MF, MMO);
4081
4082 // There are no 96-bit result scalar loads, but widening to 128-bit should
4083 // always be legal. We may need to restore this to a 96-bit result if it turns
4084 // out this needs to be converted to a vector load during RegBankSelect.
4085 if (!isPowerOf2_32(Size)) {
4086 LegalizerHelper Helper(MF, *this, Observer, B);
4087
4088 if (Ty.isVector())
4089 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4090 else
4091 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4092 }
4093
4094 Observer.changedInstr(MI);
4095 return true;
4096 }
4097
legalizeTrapIntrinsic(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4098 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4099 MachineRegisterInfo &MRI,
4100 MachineIRBuilder &B) const {
4101 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
4102 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4103 !ST.isTrapHandlerEnabled()) {
4104 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4105 } else {
4106 // Pass queue pointer to trap handler as input, and insert trap instruction
4107 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4108 const ArgDescriptor *Arg =
4109 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4110 if (!Arg)
4111 return false;
4112 MachineRegisterInfo &MRI = *B.getMRI();
4113 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4114 Register LiveIn = getLiveInRegister(
4115 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4116 /*InsertLiveInCopy=*/false);
4117 if (!loadInputValue(LiveIn, B, Arg))
4118 return false;
4119 B.buildCopy(SGPR01, LiveIn);
4120 B.buildInstr(AMDGPU::S_TRAP)
4121 .addImm(GCNSubtarget::TrapIDLLVMTrap)
4122 .addReg(SGPR01, RegState::Implicit);
4123 }
4124
4125 MI.eraseFromParent();
4126 return true;
4127 }
4128
legalizeDebugTrapIntrinsic(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4129 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4130 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4131 // Is non-HSA path or trap-handler disabled? then, report a warning
4132 // accordingly
4133 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4134 !ST.isTrapHandlerEnabled()) {
4135 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4136 "debugtrap handler not supported",
4137 MI.getDebugLoc(), DS_Warning);
4138 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4139 Ctx.diagnose(NoTrap);
4140 } else {
4141 // Insert debug-trap instruction
4142 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4143 }
4144
4145 MI.eraseFromParent();
4146 return true;
4147 }
4148
/// Target hook for legalizing AMDGPU intrinsics: dispatch on the intrinsic
/// ID to the matching legalize* helper, handle control-flow intrinsics
/// inline, and route image intrinsics through the dim-intrinsic table.
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    // verifyCFIntrinsic locates the G_BRCOND consuming this intrinsic's
    // result (and an optional trailing G_BR / fallthrough target).
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      // The pseudo's operands are exec-mask values, not plain VGPRs.
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // The wavefront size is a subtarget constant; fold it immediately.
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    // Image intrinsics are looked up via the generated dim-intrinsic table;
    // anything else is already legal.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  // Unreachable: every switch case above returns.
  return true;
}
4347