1 //===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements the lowering of LLVM calls to machine code calls for
11 /// GlobalISel.
12 ///
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUISelLowering.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIISelLowering.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "SIRegisterInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/CodeGen/Analysis.h"
24 #include "llvm/CodeGen/CallingConvLower.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/CodeGen/MachineInstrBuilder.h"
27 #include "llvm/Support/LowLevelTypeImpl.h"
28
29 using namespace llvm;
30
namespace {

/// Value handler for outgoing return values. Each value assigned to a
/// physical register is appended as a use operand of \p MIB (the return
/// instruction being built), and a copy from the virtual register into the
/// physical register is emitted.
struct OutgoingArgHandler : public CallLowering::ValueHandler {
  OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                     MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  /// The return instruction (built but possibly not yet inserted) that
  /// receives the physical-register use operands.
  MachineInstrBuilder MIB;

  // Stack-passed return values are not supported yet.
  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  // Memory-assigned return values are not supported yet.
  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    // Record the physical register on the return instruction and copy the
    // value into it.
    MIB.addUse(PhysReg);
    MIRBuilder.buildCopy(PhysReg, ValVReg);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 CCState &State) override {
    // Delegate to the calling-convention assignment function directly.
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
  }
};

} // end anonymous namespace
65
AMDGPUCallLowering(const AMDGPUTargetLowering & TLI)66 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
67 : CallLowering(&TLI) {
68 }
69
/// Lower an IR return. Void returns end the program with S_ENDPGM. Non-void
/// returns are only handled for shader calling conventions, where the split
/// return values are assigned to physical registers and attached to a
/// SI_RETURN_TO_EPILOG. Returns false to request fallback lowering.
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = MIRBuilder.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  if (!Val) {
    // Nothing to return: end the program.
    MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
    return true;
  }

  Register VReg = VRegs[0];

  const Function &F = MF.getFunction();
  auto &DL = F.getParent()->getDataLayout();
  // Only shader calling conventions return values this way; everything else
  // falls back.
  if (!AMDGPU::isShader(F.getCallingConv()))
    return false;


  const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
  SmallVector<EVT, 4> SplitVTs;
  SmallVector<uint64_t, 4> Offsets;
  ArgInfo OrigArg{VReg, Val->getType()};
  setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
  // Split the return type into legal pieces; one SplitVTs/Offsets entry per
  // piece, expected to line up with the entries of VRegs.
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);

  SmallVector<ArgInfo, 8> SplitArgs;
  CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false);
  for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
    Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext());
    SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed});
  }
  // Build the return without inserting it so the handler can append physreg
  // uses while assigning values; insert it only after that succeeds.
  auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG);
  OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn);
  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
    return false;
  MIRBuilder.insertInstr(RetInstr);

  return true;
}
113
/// Compute a pointer to the kernel argument at byte \p Offset within the
/// kernarg segment. Returns a new vreg of constant-address-space pointer
/// type holding KernArgSegmentPtr + Offset.
Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
                                               Type *ParamTy,
                                               uint64_t Offset) const {

  MachineFunction &MF = MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  // Kernel arguments are addressed in the constant address space.
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register DstReg = MRI.createGenericVirtualRegister(PtrType);
  // Reuse the live-in vreg created for the preloaded kernarg segment base
  // pointer (set up in allocateHSAUserSGPRs).
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
  MIRBuilder.buildConstant(OffsetReg, Offset);

  // DstReg = base + byte offset.
  MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);

  return DstReg;
}
137
/// Load one kernel argument of type \p ParamTy from the kernarg segment at
/// byte \p Offset into \p DstReg, using \p Align as the load alignment.
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
                                        Type *ParamTy, uint64_t Offset,
                                        unsigned Align,
                                        Register DstReg) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  // No IR value exists for the pointer; use an undef base of the right type
  // for the pointer info.
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);

  // Kernel arguments are never written during a dispatch, hence MOInvariant.
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
                                       MachineMemOperand::MONonTemporal |
                                       MachineMemOperand::MOInvariant,
                                       TypeSize, Align);

  MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
}
158
findFirstFreeSGPR(CCState & CCInfo)159 static Register findFirstFreeSGPR(CCState &CCInfo) {
160 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
161 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
162 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
163 return AMDGPU::SGPR0 + Reg;
164 }
165 }
166 llvm_unreachable("Cannot allocate sgpr");
167 }
168
allocateSpecialEntryInputVGPRs(CCState & CCInfo,MachineFunction & MF,const SIRegisterInfo & TRI,SIMachineFunctionInfo & Info)169 static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
170 MachineFunction &MF,
171 const SIRegisterInfo &TRI,
172 SIMachineFunctionInfo &Info) {
173 const LLT S32 = LLT::scalar(32);
174 MachineRegisterInfo &MRI = MF.getRegInfo();
175
176 if (Info.hasWorkItemIDX()) {
177 Register Reg = AMDGPU::VGPR0;
178 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
179
180 CCInfo.AllocateReg(Reg);
181 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
182 }
183
184 if (Info.hasWorkItemIDY()) {
185 Register Reg = AMDGPU::VGPR1;
186 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
187
188 CCInfo.AllocateReg(Reg);
189 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
190 }
191
192 if (Info.hasWorkItemIDZ()) {
193 Register Reg = AMDGPU::VGPR2;
194 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
195
196 CCInfo.AllocateReg(Reg);
197 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
198 }
199 }
200
201 // Allocate special inputs passed in user SGPRs.
// Allocate special inputs passed in user SGPRs. The order of the AllocateReg
// calls below determines the SGPR layout, so do not reorder the branches.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &MIRBuilder,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    // 128-bit private segment buffer resource descriptor.
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    // Unlike the other user SGPRs, the kernarg segment pointer gets a typed
    // vreg and a copy right away, because argument lowering loads through it
    // (lowerParameterPtr finds it via MRI.getLiveInVirtReg).
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
    MIRBuilder.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
252
/// Allocate the system SGPRs (workgroup IDs, workgroup info, and the private
/// segment wave byte offset) requested via \p Info. As with the user SGPRs,
/// the allocation order below is significant.
static void allocateSystemSGPRs(CCState &CCInfo,
                                MachineFunction &MF,
                                SIMachineFunctionInfo &Info,
                                CallingConv::ID CallConv,
                                bool IsShader) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkGroupIDX()) {
    Register Reg = Info.addWorkGroupIDX();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDY()) {
    Register Reg = Info.addWorkGroupIDY();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDZ()) {
    unsigned Reg = Info.addWorkGroupIDZ();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupInfo()) {
    unsigned Reg = Info.addWorkGroupInfo();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      // Shaders may already have a fixed SGPR reserved for this.
      PrivateSegmentWaveByteOffsetReg =
        Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }
}
306
/// Lower formal arguments for AMDGPU_KERNEL functions. Kernel arguments are
/// not passed in registers: each one is loaded from the kernarg segment at a
/// computed offset, so the normal calling-convention machinery is bypassed.
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &MIRBuilder, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  // Reserve the user SGPRs first; the kernarg segment pointer live-in set up
  // here is what lowerParameterPtr loads through.
  allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);

  unsigned i = 0;
  const unsigned KernArgBaseAlign = 16;
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      // NOTE(review): `i` is deliberately not advanced here — this assumes
      // the caller creates no VRegs entry for zero-sized arguments; confirm
      // against IRTranslator before changing.
      continue;

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    // Byte offset of this argument within the kernarg segment, and the
    // running packed offset for the next argument.
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    // Arguments split across several vregs are loaded as one wide value and
    // unpacked below.
    Register ArgReg =
        OrigArgRegs.size() == 1
            ? OrigArgRegs[0]
            : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    // Load alignment is bounded by the offset within the 16-byte-aligned
    // kernarg base.
    unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
    // NOTE(review): this realignment appears redundant — ArgOffset was
    // already aligned to ABIAlign above whenever BaseOffset itself is
    // aligned; verify before removing.
    ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
    lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder);
    ++i;
  }

  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}
356
lowerFormalArguments(MachineIRBuilder & MIRBuilder,const Function & F,ArrayRef<ArrayRef<Register>> VRegs) const357 bool AMDGPUCallLowering::lowerFormalArguments(
358 MachineIRBuilder &MIRBuilder, const Function &F,
359 ArrayRef<ArrayRef<Register>> VRegs) const {
360 // The infrastructure for normal calling convention lowering is essentially
361 // useless for kernels. We want to avoid any kind of legalization or argument
362 // splitting.
363 if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
364 return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs);
365
366 // AMDGPU_GS and AMDGP_HS are not supported yet.
367 if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
368 F.getCallingConv() == CallingConv::AMDGPU_HS)
369 return false;
370
371 MachineFunction &MF = MIRBuilder.getMF();
372 MachineRegisterInfo &MRI = MF.getRegInfo();
373 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
374 const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
375 const DataLayout &DL = F.getParent()->getDataLayout();
376
377 bool IsShader = AMDGPU::isShader(F.getCallingConv());
378
379 SmallVector<CCValAssign, 16> ArgLocs;
380 CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
381
382 if (Info->hasImplicitBufferPtr()) {
383 unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
384 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
385 CCInfo.AllocateReg(ImplicitBufferPtrReg);
386 }
387
388 unsigned NumArgs = F.arg_size();
389 Function::const_arg_iterator CurOrigArg = F.arg_begin();
390 const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
391 unsigned PSInputNum = 0;
392 BitVector Skipped(NumArgs);
393 for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
394 EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());
395
396 // We can only hanlde simple value types at the moment.
397 ISD::ArgFlagsTy Flags;
398 assert(VRegs[i].size() == 1 && "Can't lower into more than one register");
399 ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()};
400 setArgFlags(OrigArg, i + 1, DL, F);
401 Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
402
403 if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
404 !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() &&
405 PSInputNum <= 15) {
406 if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) {
407 Skipped.set(i);
408 ++PSInputNum;
409 continue;
410 }
411
412 Info->markPSInputAllocated(PSInputNum);
413 if (!CurOrigArg->use_empty())
414 Info->markPSInputEnabled(PSInputNum);
415
416 ++PSInputNum;
417 }
418
419 CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
420 /*IsVarArg=*/false);
421
422 if (ValEVT.isVector()) {
423 EVT ElemVT = ValEVT.getVectorElementType();
424 if (!ValEVT.isSimple())
425 return false;
426 MVT ValVT = ElemVT.getSimpleVT();
427 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full,
428 OrigArg.Flags, CCInfo);
429 if (!Res)
430 return false;
431 } else {
432 MVT ValVT = ValEVT.getSimpleVT();
433 if (!ValEVT.isSimple())
434 return false;
435 bool Res =
436 AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
437
438 // Fail if we don't know how to handle this type.
439 if (Res)
440 return false;
441 }
442 }
443
444 Function::const_arg_iterator Arg = F.arg_begin();
445
446 if (F.getCallingConv() == CallingConv::AMDGPU_VS ||
447 F.getCallingConv() == CallingConv::AMDGPU_PS) {
448 for (unsigned i = 0, OrigArgIdx = 0;
449 OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
450 if (Skipped.test(OrigArgIdx))
451 continue;
452 assert(VRegs[OrigArgIdx].size() == 1 &&
453 "Can't lower into more than 1 reg");
454 CCValAssign &VA = ArgLocs[i++];
455 MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]);
456 MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
457 MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg());
458 }
459
460 allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader);
461 return true;
462 }
463
464 return false;
465 }
466