//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

struct AMDGPUValueHandler : public CallLowering::ValueHandler {
  AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
                     MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
      : ValueHandler(IsIncoming, B, MRI, AssignFn) {}

  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
  /// register.
  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    }

    return extendRegister(ValVReg, VA);
  }
};

struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the
    // value ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
                        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           CCAssignFn *AssignFn)
      : AMDGPUValueHandler(true, B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // The reported memory location may be wider than the value.
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
        MemSize, inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
  MachineInstrBuilder MIB;
  CCAssignFn *AssignFnVarArg;

  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
        AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
  }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, Size,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
                            uint64_t Size, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[0], VA)
                           : Arg.Regs[0];

    // If we extended we might need to adjust the MMO's Size.
    const LLT RegTy = MRI.getType(ValVReg);
    if (RegTy.getSizeInBytes() > Size)
      Size = RegTy.getSizeInBytes();

    assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
  }
};

} // anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
    : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

void AMDGPUCallLowering::splitToValueTypes(
    MachineIRBuilder &B,
    const ArgInfo &OrigArg,
    SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, CallingConv::ID CallConv,
    bool IsOutgoing,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (IsOutgoing && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT.getSizeInBits() != VT.getSizeInBits()) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g.
      // [1 x double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy,
                             OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}
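
// Illustrative note (an assumption about typical AMDGPU lowering, not taken
// from the original source): with a 32-bit register calling convention, an
// s64 argument would report NumParts == 2 with RegVT == i32, so two s32 part
// registers are created above and PerformArgSplit converts between the
// original s64 value and those parts (unpacking for outgoing values, packing
// for incoming ones).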

// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  LLT GCDTy = getGCDType(SrcTy, PartTy);
  if (GCDTy == PartTy) {
    // If this is already evenly divisible, we can create a simple unmerge.
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  MachineRegisterInfo &MRI = *B.getMRI();
  LLT DstTy = MRI.getType(DstRegs[0]);
  LLT LCMTy = getLCMType(SrcTy, PartTy);

  const unsigned LCMSize = LCMTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  Register UnmergeSrc = SrcReg;
  if (LCMSize != SrcSize) {
    // Widen to the common type.
    Register Undef = B.buildUndef(SrcTy).getReg(0);
    SmallVector<Register, 8> MergeParts(1, SrcReg);
    for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
      MergeParts.push_back(Undef);

    UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
  }

  // Unmerge to the original registers and pad with dead defs.
  SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
  for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
       Size += DstSize) {
    UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
  }

  B.buildUnmerge(UnmergeResults, UnmergeSrc);
}
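
// Worked example (illustrative, not from the original source): unpacking a
// v2s16 source into two s32 parts takes the first branch above; the vector is
// unmerged into two s16 elements and each is any-extended to s32. When SrcTy
// and PartTy only share a smaller GCD type, the source is instead padded with
// undefs up to the LCM type before the unmerge, and any excess unmerge
// results become dead defs.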

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B, const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
      B, OrigRetInfo, SplitRetInfos, DL, CC, true,
      [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
          int VTSplitIdx) {
        unpackRegsToOrigType(B, Regs, SrcReg, SplitRetInfos[VTSplitIdx], LLTy,
                             PartLLT);
      });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                     ArrayRef<Register> VRegs) const {
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd =
      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
        .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           Type *ParamTy,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}
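
// Illustrative sketch (hypothetical offset, not from the original source):
// for a kernel argument at byte offset 8 this emits roughly
//   %off:_(s64) = G_CONSTANT i64 8
//   %ptr:_(p4) = G_PTR_ADD %kernarg_segment_ptr, %off
// i.e. kernel arguments are addressed relative to the preloaded kernarg
// segment pointer in the 64-bit constant address space.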

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  lowerParameterPtr(PtrReg, B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we
  // read these from the dispatch pointer.
}
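
// A note on ordering (our reading of the code above, not an authoritative ABI
// statement): the user SGPRs are claimed in a fixed sequence, with the
// private segment buffer taking an SGPR_128 tuple and the remaining inputs
// taking SGPR_64 pairs, so the set of enabled inputs determines which
// physical SGPRs each one occupies.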

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
    if (!ABIAlign)
      ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
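
    // Worked example (illustrative): for kernel arguments (i32, i64) with
    // BaseOffset == 0, the i32 lands at offset 0 and advances
    // ExplicitArgOffset to 4; the i64 is then aligned up to 8, so it lands at
    // offset 8 and ExplicitArgOffset becomes 16.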

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArrayRef<Register> OrigArgRegs = VRegs[i];
      Register ArgReg =
          OrigArgRegs.size() == 1
              ? OrigArgRegs[0]
              : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
      if (OrigArgRegs.size() > 1)
        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    }

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder mergeVectorRegsToResultRegs(
    MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
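  // (An illustrative walk-through, inferred from the code below: rebuilding a
  // v3s16 from two v2s16 parts gives LCMTy == v6s16, so the two real parts
  // plus one undef v2s16 are concatenated, and the v6s16 is unmerged into two
  // v3s16 results, the second of which is a dead def.)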
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register
  // types to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsGraphics = AMDGPU::isGraphics(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions are not implemented.
    if (!IsGraphics && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
        B, OrigArg, SplitArgs, DL, CC, false,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register DstReg,
            LLT LLTy, LLT PartLLT, int VTSplitIdx) {
          assert(DstReg == VRegs[Idx][VTSplitIdx]);
          packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                  LLTy, PartLLT);
        });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
  } else {
    if (!Subtarget.enableFlatScratch())
      CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(
    MachineIRBuilder &MIRBuilder, CCState &CCInfo,
    SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
    CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register, or pass them as-is if they are
  // already packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  // If the incoming IDs are not packed, we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
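  // Packed layout, as implied by the shifts below: X occupies bits [9:0],
  // Y bits [19:10], and Z bits [29:20] of a single 32-bit register.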
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
                       std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
                       std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
                       std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem ids are already packed; any of the present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
        IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
                       &AMDGPU::VGPR_32RegClass, S32);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
/// for CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

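// Tail calls are not implemented yet, so the returned opcode does not
// currently depend on IsIndirect or IsTailCall.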
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target
    // in the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();
  CallingConv::ID CallConv = F.getCallingConv();

  if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
      CallConv != CallingConv::AMDGPU_Gfx) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (AMDGPU::isShader(CallConv)) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;
  SmallVector<ArgInfo, 4> SplitRetInfos;

  for (auto &OrigArg : Info.OrigArgs) {
    splitToValueTypes(
        MIRBuilder, OrigArg, OutArgs, DL, Info.CallConv, true,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
            int VTSplitIdx) {
          unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy,
                               PartLLT);
        });
  }

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
      .addImm(0)
      .addImm(0);
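
  // This ADJCALLSTACKUP and the ADJCALLSTACKDOWN built below bracket the call
  // sequence; the ADJCALLSTACKDOWN later receives the callee-popped byte
  // count once it is known.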

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
                                   AssignFnVarArg, false);
  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (!ST.enableFlatScratch()) {
    // Insert copies for the SRD. In the HSA case, this should be an identity
    // copy.
    auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                               MFI->getScratchRSrcReg());
    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
  }

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
        MIB->getDesc(), MIB->getOperand(1), 1));
  }

  auto OrigInsertPt = MIRBuilder.getInsertPt();

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // Insert this now to give us an anchor point for managing the insert point.
  MachineInstrBuilder CallSeqEnd =
      MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);

  SmallVector<ArgInfo, 8> InArgs;
  if (!Info.OrigRet.Ty->isVoidTy()) {
    splitToValueTypes(
        MIRBuilder, Info.OrigRet, InArgs, DL, Info.CallConv, false,
        [&](ArrayRef<Register> Regs, Register DstReg,
            LLT LLTy, LLT PartLLT, int VTSplitIdx) {
          assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]);
          packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx],
                                  Regs, LLTy, PartLLT);
        });
  }

  // Make sure the raw argument copies are inserted before the marshalling to
  // the original types.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (!Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
    if (!handleAssignments(MIRBuilder, InArgs, Handler))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;
  CallSeqEnd.addImm(0)
      .addImm(CalleePopBytes);

  // Restore the insert point to after the call sequence.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
  return true;
}