1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPURegisterBankInfo.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/SmallSet.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
24 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
25 #include "llvm/CodeGen/TargetRegisterInfo.h"
26 #include "llvm/CodeGen/TargetSubtargetInfo.h"
27 #include "llvm/IR/Constants.h"
28
29 #define GET_TARGET_REGBANK_IMPL
30 #include "AMDGPUGenRegisterBank.inc"
31
32 // This file will be TableGen'ed at some point.
33 #include "AMDGPUGenRegisterBankInfo.def"
34
35 using namespace llvm;
36
37 namespace {
38
39 // Observer to apply a register bank to new registers created by LegalizerHelper.
40 class ApplyRegBankMapping final : public GISelChangeObserver {
41 private:
42 MachineRegisterInfo &MRI;
43 const RegisterBank *NewBank;
44 SmallVector<MachineInstr *, 4> NewInsts;
45
46 public:
ApplyRegBankMapping(MachineRegisterInfo & MRI_,const RegisterBank * RB)47 ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
48 : MRI(MRI_), NewBank(RB) {}
49
~ApplyRegBankMapping()50 ~ApplyRegBankMapping() {
51 for (MachineInstr *MI : NewInsts)
52 applyBank(*MI);
53 }
54
55 /// Set any registers that don't have a set register class or bank to SALU.
applyBank(MachineInstr & MI)56 void applyBank(MachineInstr &MI) {
57 for (MachineOperand &Op : MI.operands()) {
58 if (!Op.isReg())
59 continue;
60
61 Register Reg = Op.getReg();
62 if (MRI.getRegClassOrRegBank(Reg))
63 continue;
64
65 const RegisterBank *RB = NewBank;
66 // FIXME: This might not be enough to detect when SCC should be used.
67 if (MRI.getType(Reg) == LLT::scalar(1))
68 RB = (NewBank == &AMDGPU::SGPRRegBank ?
69 &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
70
71 MRI.setRegBank(Reg, *RB);
72 }
73 }
74
erasingInstr(MachineInstr & MI)75 void erasingInstr(MachineInstr &MI) override {}
76
createdInstr(MachineInstr & MI)77 void createdInstr(MachineInstr &MI) override {
78 // At this point, the instruction was just inserted and has no operands.
79 NewInsts.push_back(&MI);
80 }
81
changingInstr(MachineInstr & MI)82 void changingInstr(MachineInstr &MI) override {}
changedInstr(MachineInstr & MI)83 void changedInstr(MachineInstr &MI) override {}
84 };
85
86 }
/// Construct the target register bank info, verifying (once per process) that
/// the TableGen-generated bank IDs line up with the generated bank objects.
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
    : AMDGPUGenRegisterBankInfo(),
      TRI(static_cast<const SIRegisterInfo*>(&TRI)) {

  // HACK: Until this is fully tablegen'd.
  // Run the checks below only the first time any instance is constructed.
  // NOTE(review): the static flag is not thread-safe — assumes target info is
  // constructed on a single thread; confirm before relying on parallel init.
  static bool AlreadyInit = false;
  if (AlreadyInit)
    return;

  AlreadyInit = true;

  // The locals exist only for the asserts; (void) silences unused-variable
  // warnings in release builds.
  const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
  (void)RBSGPR;
  assert(&RBSGPR == &AMDGPU::SGPRRegBank);

  const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
  (void)RBVGPR;
  assert(&RBVGPR == &AMDGPU::VGPRRegBank);

}
107
copyCost(const RegisterBank & Dst,const RegisterBank & Src,unsigned Size) const108 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
109 const RegisterBank &Src,
110 unsigned Size) const {
111 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
112 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
113 Src.getID() == AMDGPU::VGPRRegBankID) {
114 return std::numeric_limits<unsigned>::max();
115 }
116
117 // Bool values are tricky, because the meaning is based on context. The SCC
118 // and VCC banks are for the natural scalar and vector conditions produced by
119 // a compare.
120 //
121 // Legalization doesn't know about the necessary context, so an s1 use may
122 // have been a truncate from an arbitrary value, in which case a copy (lowered
123 // as a compare with 0) needs to be inserted.
124 if (Size == 1 &&
125 (Dst.getID() == AMDGPU::SCCRegBankID ||
126 Dst.getID() == AMDGPU::SGPRRegBankID) &&
127 (Src.getID() == AMDGPU::SGPRRegBankID ||
128 Src.getID() == AMDGPU::VGPRRegBankID ||
129 Src.getID() == AMDGPU::VCCRegBankID))
130 return std::numeric_limits<unsigned>::max();
131
132 if (Dst.getID() == AMDGPU::SCCRegBankID &&
133 Src.getID() == AMDGPU::VCCRegBankID)
134 return std::numeric_limits<unsigned>::max();
135
136 return RegisterBankInfo::copyCost(Dst, Src, Size);
137 }
138
/// Return the cost of applying a value mapping that requires splitting a
/// value into multiple pieces.
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  // NOTE(review): this assert contradicts the early return above. Any mapping
  // with NumBreakDowns >= 2 already returned 10, so if control reaches here
  // NumBreakDowns <= 1 and the assert can never hold — it either fires or the
  // code below is unreachable. Confirm whether the guard was meant to test a
  // different condition (e.g. '> 2') before relying on the cost of 1 below.
  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}
163
getRegBankFromRegClass(const TargetRegisterClass & RC) const164 const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
165 const TargetRegisterClass &RC) const {
166
167 if (TRI->isSGPRClass(&RC))
168 return getRegBank(AMDGPU::SGPRRegBankID);
169
170 return getRegBank(AMDGPU::VGPRRegBankID);
171 }
172
/// Build one InstructionMapping per row of \p Table, where each row gives the
/// bank for the register operands listed in \p RegSrcOpIdx along with a cost.
///
/// Operands not listed in \p RegSrcOpIdx keep a null mapping, except explicit
/// defs which default to VGPR; a table column whose index names a def (e.g.
/// index 0) overwrites that default.
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  // One slot per MI operand; unset entries remain null value mappings.
  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  // Cache the operand sizes; they are the same for every table row.
  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  // Default all explicit defs to VGPR.
  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  unsigned MappingID = 0;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}
209
/// Alternative bank mappings for side-effect-free intrinsics (G_INTRINSIC).
/// The intrinsic ID is the first operand after the explicit defs.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    // dst, source value, lane index.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // Presumably dst, written value, lane index, old value — the comment here
    // previously read "rsrc, voffset, offset", a copy/paste from the buffer
    // intrinsics. TODO(review): confirm against the writelane definition.
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
249
/// Alternative bank mappings for intrinsics with side effects
/// (G_INTRINSIC_W_SIDE_EFFECTS). Costs reflect the lowering required when an
/// operand that must be scalar ends up in a VGPR (readfirstlane or a
/// waterfall loop).
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load: {
    static const OpRegBankEntry<3> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    // dst, m0 source, data value.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    // Message payload operand.
    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
321
isInstrUniform(const MachineInstr & MI)322 static bool isInstrUniform(const MachineInstr &MI) {
323 if (!MI.hasOneMemOperand())
324 return false;
325
326 const MachineMemOperand *MMO = *MI.memoperands_begin();
327 return AMDGPUInstrInfo::isUniformMMO(MMO);
328 }
329
/// Enumerate the alternative bank mappings for \p MI so the mapping cost
/// model can pick the cheapest legal assignment. Each mapping is built from
/// per-operand (bank, size) value mappings with an ID and a relative cost.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // Boolean logic ops: offer SCC-result, plain-SGPR, and VCC variants.
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &SGPRMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SGPRMapping);

      // The all-VCC form is much more expensive (cost 10).
      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 10, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    // 64-bit case: SS, VV, SV, and VS source-bank combinations.
    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(
      3, 3, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make it VS more expensive than
    // SV.
    const InstructionMapping &VSMapping = getInstructionMapping(
      3, 4, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
    // FIXME: Should we be hard coding the size for these mappings?
    if (isInstrUniform(MI)) {
      // Uniform loads may stay fully scalar: result in SGPRs, 64-bit pointer
      // in SGPRs.
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1, getOperandsMapping(
        {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;

  }
  case TargetOpcode::G_ICMP: {
    // Size of the compared values; the result is always a 1-bit condition.
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    // Scalar select: SCC condition, SGPR values.
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    // Vector select: VCC condition, VGPR values.
    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    // dst, lhs, rhs.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    // Add/sub with carry: 1-bit carry-out/carry-in use the condition banks.
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // Branch condition may be in SCC or VCC; the MBB operand has no mapping.
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
563
split64BitValueForMapping(MachineIRBuilder & B,SmallVector<Register,2> & Regs,LLT HalfTy,Register Reg) const564 void AMDGPURegisterBankInfo::split64BitValueForMapping(
565 MachineIRBuilder &B,
566 SmallVector<Register, 2> &Regs,
567 LLT HalfTy,
568 Register Reg) const {
569 assert(HalfTy.getSizeInBits() == 32);
570 MachineRegisterInfo *MRI = B.getMRI();
571 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
572 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
573 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
574 MRI->setRegBank(LoLHS, *Bank);
575 MRI->setRegBank(HiLHS, *Bank);
576
577 Regs.push_back(LoLHS);
578 Regs.push_back(HiLHS);
579
580 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
581 .addDef(LoLHS)
582 .addDef(HiLHS)
583 .addUse(Reg);
584 }
585
586 /// Replace the current type each register in \p Regs has with \p NewTy
setRegsToType(MachineRegisterInfo & MRI,ArrayRef<Register> Regs,LLT NewTy)587 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
588 LLT NewTy) {
589 for (Register Reg : Regs) {
590 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
591 MRI.setType(Reg, NewTy);
592 }
593 }
594
getHalfSizedType(LLT Ty)595 static LLT getHalfSizedType(LLT Ty) {
596 if (Ty.isVector()) {
597 assert(Ty.getNumElements() % 2 == 0);
598 return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
599 }
600
601 assert(Ty.getSizeInBits() % 2 == 0);
602 return LLT::scalar(Ty.getSizeInBits() / 2);
603 }
604
605 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
606 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
607 /// execute the instruction for each unique combination of values in all lanes
608 /// in the wave. The block will be split such that rest of the instructions are
609 /// moved to a new block.
610 ///
611 /// Essentially performs this loop:
///
613 /// Save Execution Mask
614 /// For (Lane : Wavefront) {
615 /// Enable Lane, Disable all other lanes
616 /// SGPR = read SGPR value for current lane from VGPR
617 /// VGPRResult[Lane] = use_op SGPR
618 /// }
619 /// Restore Execution Mask
620 ///
/// There is additional complexity from comparing the value read for the
/// current lane against the values in all active lanes, so that every lane
/// holding the same value is handled in a single iteration.
executeInWaterfallLoop(MachineInstr & MI,MachineRegisterInfo & MRI,ArrayRef<unsigned> OpIndices) const623 void AMDGPURegisterBankInfo::executeInWaterfallLoop(
624 MachineInstr &MI, MachineRegisterInfo &MRI,
625 ArrayRef<unsigned> OpIndices) const {
626 MachineFunction *MF = MI.getParent()->getParent();
627 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
628 const SIInstrInfo *TII = ST.getInstrInfo();
629 MachineBasicBlock::iterator I(MI);
630
631 MachineBasicBlock &MBB = *MI.getParent();
632 const DebugLoc &DL = MI.getDebugLoc();
633
634 // Use a set to avoid extra readfirstlanes in the case where multiple operands
635 // are the same register.
636 SmallSet<Register, 4> SGPROperandRegs;
637 for (unsigned Op : OpIndices) {
638 assert(MI.getOperand(Op).isUse());
639 Register Reg = MI.getOperand(Op).getReg();
640 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
641 if (OpBank->getID() == AMDGPU::VGPRRegBankID)
642 SGPROperandRegs.insert(Reg);
643 }
644
645 // No operands need to be replaced, so no need to loop.
646 if (SGPROperandRegs.empty())
647 return;
648
649 MachineIRBuilder B(MI);
650 SmallVector<Register, 4> ResultRegs;
651 SmallVector<Register, 4> InitResultRegs;
652 SmallVector<Register, 4> PhiRegs;
653 for (MachineOperand &Def : MI.defs()) {
654 LLT ResTy = MRI.getType(Def.getReg());
655 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
656 ResultRegs.push_back(Def.getReg());
657 Register InitReg = B.buildUndef(ResTy).getReg(0);
658 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
659 InitResultRegs.push_back(InitReg);
660 PhiRegs.push_back(PhiReg);
661 MRI.setRegBank(PhiReg, *DefBank);
662 MRI.setRegBank(InitReg, *DefBank);
663 }
664
665 Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
666 Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
667
668 // Don't bother using generic instructions/registers for the exec mask.
669 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
670 .addDef(InitSaveExecReg);
671
672 Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
673 Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
674
675 // To insert the loop we need to split the block. Move everything before this
676 // point to a new block, and insert a new empty block before this instruction.
677 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
678 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
679 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
680 MachineFunction::iterator MBBI(MBB);
681 ++MBBI;
682 MF->insert(MBBI, LoopBB);
683 MF->insert(MBBI, RestoreExecBB);
684 MF->insert(MBBI, RemainderBB);
685
686 LoopBB->addSuccessor(RestoreExecBB);
687 LoopBB->addSuccessor(LoopBB);
688
689 // Move the rest of the block into a new block.
690 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
691 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
692
693 MBB.addSuccessor(LoopBB);
694 RestoreExecBB->addSuccessor(RemainderBB);
695
696 B.setInsertPt(*LoopBB, LoopBB->end());
697
698 B.buildInstr(TargetOpcode::PHI)
699 .addDef(PhiExec)
700 .addReg(InitSaveExecReg)
701 .addMBB(&MBB)
702 .addReg(NewExec)
703 .addMBB(LoopBB);
704
705 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
706 B.buildInstr(TargetOpcode::G_PHI)
707 .addDef(std::get<2>(Result))
708 .addReg(std::get<0>(Result)) // Initial value / implicit_def
709 .addMBB(&MBB)
710 .addReg(std::get<1>(Result)) // Mid-loop value.
711 .addMBB(LoopBB);
712 }
713
714 // Move the instruction into the loop.
715 LoopBB->splice(LoopBB->end(), &MBB, I);
716 I = std::prev(LoopBB->end());
717
718 B.setInstr(*I);
719
720 Register CondReg;
721
722 for (MachineOperand &Op : MI.uses()) {
723 if (!Op.isReg())
724 continue;
725
726 assert(!Op.isDef());
727 if (SGPROperandRegs.count(Op.getReg())) {
728 LLT OpTy = MRI.getType(Op.getReg());
729 unsigned OpSize = OpTy.getSizeInBits();
730
731 // Can only do a readlane of 32-bit pieces.
732 if (OpSize == 32) {
733 // Avoid extra copies in the simple case of one 32-bit register.
734 Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
735 MRI.setType(CurrentLaneOpReg, OpTy);
736
737 constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
738 // Read the next variant <- also loop target.
739 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
740 .addReg(Op.getReg());
741
742 Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
743 bool First = CondReg == AMDGPU::NoRegister;
744 if (First)
745 CondReg = NewCondReg;
746
747 // Compare the just read M0 value to all possible Idx values.
748 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
749 .addDef(NewCondReg)
750 .addReg(CurrentLaneOpReg)
751 .addReg(Op.getReg());
752 Op.setReg(CurrentLaneOpReg);
753
754 if (!First) {
755 Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
756
757 // If there are multiple operands to consider, and the conditions.
758 B.buildInstr(AMDGPU::S_AND_B64)
759 .addDef(AndReg)
760 .addReg(NewCondReg)
761 .addReg(CondReg);
762 CondReg = AndReg;
763 }
764 } else {
765 LLT S32 = LLT::scalar(32);
766 SmallVector<Register, 8> ReadlanePieces;
767
768 // The compares can be done as 64-bit, but the extract needs to be done
769 // in 32-bit pieces.
770
771 bool Is64 = OpSize % 64 == 0;
772
773 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
774 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
775 : AMDGPU::V_CMP_EQ_U32_e64;
776
777 // The compares can be done as 64-bit, but the extract needs to be done
778 // in 32-bit pieces.
779
780 // Insert the unmerge before the loop.
781
782 B.setMBB(MBB);
783 auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
784 B.setInstr(*I);
785
786 unsigned NumPieces = Unmerge->getNumOperands() - 1;
787 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
788 unsigned UnmergePiece = Unmerge.getReg(PieceIdx);
789
790 Register CurrentLaneOpReg;
791 if (Is64) {
792 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
793 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
794
795 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
796 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
797 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
798
799 // Read the next variant <- also loop target.
800 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
801 CurrentLaneOpRegLo)
802 .addReg(UnmergePiece, 0, AMDGPU::sub0);
803
804 // Read the next variant <- also loop target.
805 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
806 CurrentLaneOpRegHi)
807 .addReg(UnmergePiece, 0, AMDGPU::sub1);
808
809 CurrentLaneOpReg =
810 B.buildMerge(LLT::scalar(64),
811 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
812 .getReg(0);
813
814 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
815
816 if (OpTy.getScalarSizeInBits() == 64) {
817 // If we need to produce a 64-bit element vector, so use the
818 // merged pieces
819 ReadlanePieces.push_back(CurrentLaneOpReg);
820 } else {
821 // 32-bit element type.
822 ReadlanePieces.push_back(CurrentLaneOpRegLo);
823 ReadlanePieces.push_back(CurrentLaneOpRegHi);
824 }
825 } else {
826 CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
827 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
828 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
829
830 // Read the next variant <- also loop target.
831 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
832 CurrentLaneOpReg)
833 .addReg(UnmergePiece);
834 ReadlanePieces.push_back(CurrentLaneOpReg);
835 }
836
837 Register NewCondReg
838 = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
839 bool First = CondReg == AMDGPU::NoRegister;
840 if (First)
841 CondReg = NewCondReg;
842
843 B.buildInstr(CmpOp)
844 .addDef(NewCondReg)
845 .addReg(CurrentLaneOpReg)
846 .addReg(UnmergePiece);
847
848 if (!First) {
849 Register AndReg
850 = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
851
852 // If there are multiple operands to consider, and the conditions.
853 B.buildInstr(AMDGPU::S_AND_B64)
854 .addDef(AndReg)
855 .addReg(NewCondReg)
856 .addReg(CondReg);
857 CondReg = AndReg;
858 }
859 }
860
861 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
862 // BUILD_VECTOR
863 if (OpTy.isVector()) {
864 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
865 Op.setReg(Merge.getReg(0));
866 } else {
867 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
868 Op.setReg(Merge.getReg(0));
869 }
870
871 MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
872 }
873 }
874 }
875
876 B.setInsertPt(*LoopBB, LoopBB->end());
877
878 // Update EXEC, save the original EXEC value to VCC.
879 B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
880 .addDef(NewExec)
881 .addReg(CondReg, RegState::Kill);
882
883 MRI.setSimpleHint(NewExec, CondReg);
884
885 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
886 B.buildInstr(AMDGPU::S_XOR_B64_term)
887 .addDef(AMDGPU::EXEC)
888 .addReg(AMDGPU::EXEC)
889 .addReg(NewExec);
890
891 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
892 // s_cbranch_scc0?
893
894 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
895 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
896 .addMBB(LoopBB);
897
898 // Save the EXEC mask before the loop.
899 BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
900 .addReg(AMDGPU::EXEC);
901
902 // Restore the EXEC mask after the loop.
903 B.setMBB(*RestoreExecBB);
904 B.buildInstr(AMDGPU::S_MOV_B64_term)
905 .addDef(AMDGPU::EXEC)
906 .addReg(SaveExecReg);
907 }
908
909 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
constrainOpWithReadfirstlane(MachineInstr & MI,MachineRegisterInfo & MRI,unsigned OpIdx) const910 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
911 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
912 Register Reg = MI.getOperand(OpIdx).getReg();
913 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
914 if (Bank != &AMDGPU::VGPRRegBank)
915 return;
916
917 MachineIRBuilder B(MI);
918 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
919 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
920 .addDef(SGPR)
921 .addReg(Reg);
922
923 const TargetRegisterClass *Constrained =
924 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
925 (void)Constrained;
926 assert(Constrained && "Failed to constrain readfirstlane src reg");
927
928 MI.getOperand(OpIdx).setReg(SGPR);
929 }
930
// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the targets will either delete or rewrite the instructions that
// originally wrote to the repaired registers. Because of this, we end up in a
// situation where we have 2 instructions defining the same registers.
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  // Walk every instruction defining Reg; the first one that is not MI itself
  // is the "other" writer. Returns nullptr when MI is the sole definition.
  // Is there some way we can assert that there are exactly 2 def instructions?
  for (MachineInstr &OtherDef : MRI.def_instructions(Reg)) {
    if (&OtherDef == &MI)
      continue;
    return &OtherDef;
  }

  return nullptr;
}
947
// Split a VGPR load wider than the 128-bit non-SMRD maximum into legal pieces
// using the legalizer, then wire the pieces into the registers the bank
// mapping expects. Returns true if the instruction was rewritten, false if
// there was nothing to do (small enough load, or an SGPR pointer).
bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                                  MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty())
    return false;

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  // Split into 128-bit vector pieces of the load's element type.
  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  // The observer stamps the VGPR bank onto every register the legalizer
  // creates while splitting.
  ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg as well.

  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  // Extract each mapped def register out of the combined temporary value.
  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
  return true;
}
1015
1016 // For cases where only a single copy is inserted for matching register banks.
1017 // Replace the register in the instruction operand
substituteSimpleCopyRegs(const AMDGPURegisterBankInfo::OperandsMapper & OpdMapper,unsigned OpIdx)1018 static void substituteSimpleCopyRegs(
1019 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1020 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1021 if (!SrcReg.empty()) {
1022 assert(SrcReg.size() == 1);
1023 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1024 }
1025 }
1026
// Apply a chosen register bank mapping to an instruction, rewriting it when
// the generic (copy-inserting) handling is insufficient: splitting 64-bit ops
// into 32-bit halves, widening 16-bit scalar ops, inserting waterfall loops
// for divergent operands that must be uniform, and readfirstlane'ing operands
// that must be SGPRs. Falls through to applyDefaultMapping otherwise.
void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_SELECT: {
    // 64-bit select on the VALU is split into two 32-bit selects sharing the
    // same condition.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    MachineIRBuilder B(MI);
    // The condition either already has a repair register or is used directly.
    if (Src0Regs.empty())
      Src0Regs.push_back(MI.getOperand(1).getReg());
    else {
      assert(Src0Regs.size() == 1);
    }

    // Split (or re-type already-split) the two 64-bit value operands.
    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    // Emit the same bitwise op once per 32-bit half.
    B.buildInstr(Opc)
      .addDef(DefRegs[0])
      .addUse(Src0Regs[0])
      .addUse(Src1Regs[0]);

    B.buildInstr(Opc)
      .addDef(DefRegs[1])
      .addUse(Src0Regs[1])
      .addUse(Src1Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(16))
      break;

    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
        LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");
    return;
  }
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    // Turn scalar min/max into a compare and select.
    LLT Ty = MRI.getType(DstReg);
    LLT S32 = LLT::scalar(32);
    LLT S16 = LLT::scalar(16);

    if (Ty == S16) {
      // Need to widen to s32, and expand as cmp + select.
      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widenScalar should have succeeded");

      // FIXME: This is relying on widenScalar leaving MI in place.
      if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    } else {
      if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    }

    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    bool Signed = Opc == AMDGPU::G_SEXT;

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    // Non-condition source extended to 64 bits: extend to 32 bits, then
    // synthesize the high half.
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::SCCRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that round to s64 when irregular
        // breakdowns supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      const LLT S32 = LLT::scalar(32);
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);

        // Replicate sign bit from 32-bit extended part.
        auto ShiftAmt = B.buildConstant(S32, 31);
        MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
        B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
      } else {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
        B.buildConstant(DefRegs[1], 0);
      }

      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    // Condition-register (scc/vcc) source: materialize the result with a
    // select between constants.
    if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
        &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SCCRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
        // 64-bit VALU result: select the low half, then copy it to the high
        // half (-1/0 or 1/0 patterns make the halves equal... for sext;
        // NOTE(review): for zext the high half should be 0 — confirm the
        // DefRegs mapping guarantees this path is sext-only or handles it).
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        B.buildCopy(DefRegs[1], DefRegs[0]);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    // Fixup the case with an s1 src that isn't a condition register. Use shifts
    // instead of introducing a compare to avoid an unnecessary condition
    // register (and since there's no scalar 16-bit compares).
    auto Ext = B.buildAnyExt(DstTy, SrcReg);
    auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
    auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);

    if (MI.getOpcode() == AMDGPU::G_SEXT)
      B.buildAShr(DstReg, Shl, ShiftAmt);
    else
      B.buildLShr(DstReg, Shl, ShiftAmt);

    MRI.setRegBank(DstReg, *SrcBank);
    MRI.setRegBank(Ext.getReg(0), *SrcBank);
    MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
    MRI.setRegBank(Shl.getReg(0), *SrcBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT:
    // The index operand must be uniform; run the op in a waterfall loop if
    // it is divergent.
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { 2 });
    return;
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
      executeInWaterfallLoop(MI, MRI, { 2, 3 });
      return;
    }
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(3)));

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(2)));
      assert(empty(OpdMapper.getVRegs(3)));

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    default:
      break;
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    case Intrinsic::amdgcn_buffer_load: {
      executeInWaterfallLoop(MI, MRI, { 2 });
      return;
    }
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(empty(OpdMapper.getVRegs(0)));
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    default:
      break;
    }
    break;
  }
  case AMDGPU::G_LOAD: {
    if (applyMappingWideLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  default:
    break;
  }

  // Anything not rewritten above is handled by inserting plain bank copies.
  return applyDefaultMapping(OpdMapper);
}
1365
isSALUMapping(const MachineInstr & MI) const1366 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
1367 const MachineFunction &MF = *MI.getParent()->getParent();
1368 const MachineRegisterInfo &MRI = MF.getRegInfo();
1369 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
1370 if (!MI.getOperand(i).isReg())
1371 continue;
1372 Register Reg = MI.getOperand(i).getReg();
1373 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
1374 if (Bank->getID() == AMDGPU::VGPRRegBankID)
1375 return false;
1376
1377 assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
1378 Bank->getID() == AMDGPU::SCCRegBankID);
1379 }
1380 }
1381 return true;
1382 }
1383
1384 const RegisterBankInfo::InstructionMapping &
getDefaultMappingSOP(const MachineInstr & MI) const1385 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
1386 const MachineFunction &MF = *MI.getParent()->getParent();
1387 const MachineRegisterInfo &MRI = MF.getRegInfo();
1388 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1389
1390 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1391 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
1392 unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
1393 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
1394 }
1395 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1396 MI.getNumOperands());
1397 }
1398
// Default mapping for VALU instructions: result and value operands in VGPRs,
// 1-bit (condition) operands in VCC. The first source operand keeps any bank
// it was already assigned.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned OpdIdx = 0;

  // The result is always a VGPR.
  unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);

  // Skip an intrinsic ID operand (present for G_INTRINSIC-style opcodes).
  if (MI.getOperand(OpdIdx).isIntrinsicID())
    OpdsMapping[OpdIdx++] = nullptr;

  Register Reg1 = MI.getOperand(OpdIdx).getReg();
  unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);

  // For the first source, respect a previously assigned bank; otherwise
  // default 1-bit values to VCC and everything else to VGPR.
  unsigned DefaultBankID = Size1 == 1 ?
    AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);

  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);

  // Remaining register operands: VCC for 1-bit values, VGPR otherwise.
  for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
    const MachineOperand &MO = MI.getOperand(OpdIdx);
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
1434
1435 const RegisterBankInfo::InstructionMapping &
getDefaultMappingAllVGPR(const MachineInstr & MI) const1436 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
1437 const MachineFunction &MF = *MI.getParent()->getParent();
1438 const MachineRegisterInfo &MRI = MF.getRegInfo();
1439 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1440
1441 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1442 const MachineOperand &Op = MI.getOperand(I);
1443 if (!Op.isReg())
1444 continue;
1445
1446 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
1447 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1448 }
1449
1450 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1451 MI.getNumOperands());
1452 }
1453
1454 const RegisterBankInfo::InstructionMapping &
getInstrMappingForLoad(const MachineInstr & MI) const1455 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
1456
1457 const MachineFunction &MF = *MI.getParent()->getParent();
1458 const MachineRegisterInfo &MRI = MF.getRegInfo();
1459 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1460 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1461 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
1462 unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
1463
1464 const ValueMapping *ValMapping;
1465 const ValueMapping *PtrMapping;
1466
1467 if (isInstrUniform(MI)) {
1468 // We have a uniform instruction so we want to use an SMRD load
1469 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1470 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
1471 } else {
1472 ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
1473 // FIXME: What would happen if we used SGPRRegBankID here?
1474 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
1475 }
1476
1477 OpdsMapping[0] = ValMapping;
1478 OpdsMapping[1] = PtrMapping;
1479 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
1480 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
1481 return Mapping;
1482
1483 // FIXME: Do we want to add a mapping for FLAT load, or should we just
1484 // handle that during instruction selection?
1485 }
1486
1487 unsigned
getRegBankID(Register Reg,const MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI,unsigned Default) const1488 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
1489 const MachineRegisterInfo &MRI,
1490 const TargetRegisterInfo &TRI,
1491 unsigned Default) const {
1492
1493 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
1494 return Bank ? Bank->getID() : Default;
1495 }
1496
///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
///
1503 const RegisterBankInfo::InstructionMapping &
getInstrMapping(const MachineInstr & MI) const1504 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
1505 const MachineFunction &MF = *MI.getParent()->getParent();
1506 const MachineRegisterInfo &MRI = MF.getRegInfo();
1507
1508 if (MI.isRegSequence()) {
1509 // If any input is a VGPR, the result must be a VGPR. The default handling
1510 // assumes any copy between banks is legal.
1511 unsigned BankID = AMDGPU::SGPRRegBankID;
1512
1513 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1514 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
1515 // It doesn't make sense to use vcc or scc banks here, so just ignore
1516 // them.
1517 if (OpBank != AMDGPU::SGPRRegBankID) {
1518 BankID = AMDGPU::VGPRRegBankID;
1519 break;
1520 }
1521 }
1522 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1523
1524 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
1525 return getInstructionMapping(
1526 1, /*Cost*/ 1,
1527 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
1528 }
1529
1530 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
1531 // properly.
1532 //
1533 // TODO: There are additional exec masking dependencies to analyze.
1534 if (MI.getOpcode() == TargetOpcode::G_PHI) {
1535 // TODO: Generate proper invalid bank enum.
1536 int ResultBank = -1;
1537
1538 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1539 unsigned Reg = MI.getOperand(I).getReg();
1540 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1541
1542 // FIXME: Assuming VGPR for any undetermined inputs.
1543 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
1544 ResultBank = AMDGPU::VGPRRegBankID;
1545 break;
1546 }
1547
1548 unsigned OpBank = Bank->getID();
1549 // scc, scc -> sgpr
1550 if (OpBank == AMDGPU::SCCRegBankID) {
1551 // There's only one SCC register, so a phi requires copying to SGPR.
1552 OpBank = AMDGPU::SGPRRegBankID;
1553 } else if (OpBank == AMDGPU::VCCRegBankID) {
1554 // vcc, vcc -> vcc
1555 // vcc, sgpr -> vgpr
1556 if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
1557 ResultBank = AMDGPU::VGPRRegBankID;
1558 break;
1559 }
1560 }
1561
1562 ResultBank = OpBank;
1563 }
1564
1565 assert(ResultBank != -1);
1566
1567 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1568
1569 const ValueMapping &ValMap =
1570 getValueMapping(0, Size, getRegBank(ResultBank));
1571 return getInstructionMapping(
1572 1, /*Cost*/ 1,
1573 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
1574 }
1575
1576 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
1577 if (Mapping.isValid())
1578 return Mapping;
1579
1580 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1581
1582 switch (MI.getOpcode()) {
1583 default:
1584 return getInvalidInstructionMapping();
1585
1586 case AMDGPU::G_AND:
1587 case AMDGPU::G_OR:
1588 case AMDGPU::G_XOR: {
1589 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1590 if (Size == 1) {
1591 const RegisterBank *DstBank
1592 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
1593
1594 unsigned TargetBankID = -1;
1595 unsigned BankLHS = -1;
1596 unsigned BankRHS = -1;
1597 if (DstBank) {
1598 TargetBankID = DstBank->getID();
1599 if (DstBank == &AMDGPU::VCCRegBank) {
1600 TargetBankID = AMDGPU::VCCRegBankID;
1601 BankLHS = AMDGPU::VCCRegBankID;
1602 BankRHS = AMDGPU::VCCRegBankID;
1603 } else if (DstBank == &AMDGPU::SCCRegBank) {
1604 TargetBankID = AMDGPU::SCCRegBankID;
1605 BankLHS = AMDGPU::SGPRRegBankID;
1606 BankRHS = AMDGPU::SGPRRegBankID;
1607 } else {
1608 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
1609 AMDGPU::SGPRRegBankID);
1610 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
1611 AMDGPU::SGPRRegBankID);
1612 }
1613 } else {
1614 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
1615 AMDGPU::VCCRegBankID);
1616 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
1617 AMDGPU::VCCRegBankID);
1618
1619 // Both inputs should be true booleans to produce a boolean result.
1620 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
1621 TargetBankID = AMDGPU::VGPRRegBankID;
1622 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
1623 TargetBankID = AMDGPU::VCCRegBankID;
1624 BankLHS = AMDGPU::VCCRegBankID;
1625 BankRHS = AMDGPU::VCCRegBankID;
1626 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
1627 TargetBankID = AMDGPU::SGPRRegBankID;
1628 } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
        // The operation must be done on a 32-bit register, but it will set
        // scc. The result type could interchangeably be SCC or SGPR, since
        // both values will be produced.
1632 TargetBankID = AMDGPU::SCCRegBankID;
1633 BankLHS = AMDGPU::SGPRRegBankID;
1634 BankRHS = AMDGPU::SGPRRegBankID;
1635 }
1636 }
1637
1638 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
1639 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
1640 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
1641 break;
1642 }
1643
1644 if (Size == 64) {
1645
1646 if (isSALUMapping(MI)) {
1647 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
1648 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
1649 } else {
1650 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
1651 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
1652 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
1653
1654 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
1655 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
1656 }
1657
1658 break;
1659 }
1660
1661 LLVM_FALLTHROUGH;
1662 }
1663
1664 case AMDGPU::G_GEP:
1665 case AMDGPU::G_ADD:
1666 case AMDGPU::G_SUB:
1667 case AMDGPU::G_MUL:
1668 case AMDGPU::G_SHL:
1669 case AMDGPU::G_LSHR:
1670 case AMDGPU::G_ASHR:
1671 case AMDGPU::G_UADDO:
1672 case AMDGPU::G_SADDO:
1673 case AMDGPU::G_USUBO:
1674 case AMDGPU::G_SSUBO:
1675 case AMDGPU::G_UADDE:
1676 case AMDGPU::G_SADDE:
1677 case AMDGPU::G_USUBE:
1678 case AMDGPU::G_SSUBE:
1679 case AMDGPU::G_UMULH:
1680 case AMDGPU::G_SMULH:
1681 case AMDGPU::G_SMIN:
1682 case AMDGPU::G_SMAX:
1683 case AMDGPU::G_UMIN:
1684 case AMDGPU::G_UMAX:
1685 if (isSALUMapping(MI))
1686 return getDefaultMappingSOP(MI);
1687 LLVM_FALLTHROUGH;
1688
1689 case AMDGPU::G_FADD:
1690 case AMDGPU::G_FSUB:
1691 case AMDGPU::G_FPTOSI:
1692 case AMDGPU::G_FPTOUI:
1693 case AMDGPU::G_FMUL:
1694 case AMDGPU::G_FMA:
1695 case AMDGPU::G_FSQRT:
1696 case AMDGPU::G_SITOFP:
1697 case AMDGPU::G_UITOFP:
1698 case AMDGPU::G_FPTRUNC:
1699 case AMDGPU::G_FPEXT:
1700 case AMDGPU::G_FEXP2:
1701 case AMDGPU::G_FLOG2:
1702 case AMDGPU::G_FCANONICALIZE:
1703 case AMDGPU::G_INTRINSIC_TRUNC:
1704 case AMDGPU::G_INTRINSIC_ROUND:
1705 return getDefaultMappingVOP(MI);
1706 case AMDGPU::G_IMPLICIT_DEF: {
1707 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1708 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1709 break;
1710 }
1711 case AMDGPU::G_FCONSTANT:
1712 case AMDGPU::G_CONSTANT:
1713 case AMDGPU::G_FRAME_INDEX:
1714 case AMDGPU::G_BLOCK_ADDR: {
1715 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1716 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1717 break;
1718 }
1719 case AMDGPU::G_INSERT: {
1720 unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
1721 AMDGPU::VGPRRegBankID;
1722 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1723 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
1724 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
1725 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
1726 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
1727 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
1728 OpdsMapping[3] = nullptr;
1729 break;
1730 }
1731 case AMDGPU::G_EXTRACT: {
1732 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
1733 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1734 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
1735 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
1736 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
1737 OpdsMapping[2] = nullptr;
1738 break;
1739 }
1740 case AMDGPU::G_MERGE_VALUES:
1741 case AMDGPU::G_BUILD_VECTOR:
1742 case AMDGPU::G_CONCAT_VECTORS: {
1743 unsigned Bank = isSALUMapping(MI) ?
1744 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
1745 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1746 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1747
1748 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
1749 // Op1 and Dst should use the same register bank.
1750 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
1751 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
1752 break;
1753 }
1754 case AMDGPU::G_BITCAST:
1755 case AMDGPU::G_INTTOPTR:
1756 case AMDGPU::G_PTRTOINT:
1757 case AMDGPU::G_CTLZ:
1758 case AMDGPU::G_CTLZ_ZERO_UNDEF:
1759 case AMDGPU::G_CTTZ:
1760 case AMDGPU::G_CTTZ_ZERO_UNDEF:
1761 case AMDGPU::G_CTPOP:
1762 case AMDGPU::G_BSWAP:
1763 case AMDGPU::G_FABS:
1764 case AMDGPU::G_FNEG: {
1765 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1766 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
1767 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
1768 break;
1769 }
1770 case AMDGPU::G_TRUNC: {
1771 Register Dst = MI.getOperand(0).getReg();
1772 Register Src = MI.getOperand(1).getReg();
1773 unsigned Bank = getRegBankID(Src, MRI, *TRI);
1774 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
1775 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
1776 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
1777 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
1778 break;
1779 }
1780 case AMDGPU::G_ZEXT:
1781 case AMDGPU::G_SEXT:
1782 case AMDGPU::G_ANYEXT: {
1783 Register Dst = MI.getOperand(0).getReg();
1784 Register Src = MI.getOperand(1).getReg();
1785 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
1786 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
1787
1788 unsigned DstBank;
1789 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
1790 assert(SrcBank);
1791 switch (SrcBank->getID()) {
1792 case AMDGPU::SCCRegBankID:
1793 case AMDGPU::SGPRRegBankID:
1794 DstBank = AMDGPU::SGPRRegBankID;
1795 break;
1796 default:
1797 DstBank = AMDGPU::VGPRRegBankID;
1798 break;
1799 }
1800
1801 // TODO: Should anyext be split into 32-bit part as well?
1802 if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
1803 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
1804 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
1805 } else {
1806 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
1807 // 32-bits, and then to 64.
1808 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
1809 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
1810 SrcSize);
1811 }
1812 break;
1813 }
1814 case AMDGPU::G_FCMP: {
1815 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
1816 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
1817 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
1818 OpdsMapping[1] = nullptr; // Predicate Operand.
1819 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
1820 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1821 break;
1822 }
1823 case AMDGPU::G_STORE: {
1824 assert(MI.getOperand(0).isReg());
1825 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1826 // FIXME: We need to specify a different reg bank once scalar stores
1827 // are supported.
1828 const ValueMapping *ValMapping =
1829 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1830 // FIXME: Depending on the type of store, the pointer could be in
1831 // the SGPR Reg bank.
1832 // FIXME: Pointer size should be based on the address space.
1833 const ValueMapping *PtrMapping =
1834 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
1835
1836 OpdsMapping[0] = ValMapping;
1837 OpdsMapping[1] = PtrMapping;
1838 break;
1839 }
1840
1841 case AMDGPU::G_ICMP: {
1842 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1843 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
1844 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
1845 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
1846
1847 bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
1848 Op3Bank == AMDGPU::SGPRRegBankID &&
1849 (Size == 32 || (Size == 64 &&
1850 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
1851 MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
1852
1853 unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
1854
1855 OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
1856 OpdsMapping[1] = nullptr; // Predicate Operand.
1857 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
1858 OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
1859 break;
1860 }
1861 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
1862 unsigned OutputBankID = isSALUMapping(MI) ?
1863 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
1864 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1865 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
1866 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
1867
1868 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
1869 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
1870
1871 // The index can be in either bank if the source vector is VGPR.
1872 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
1873 break;
1874 }
1875 case AMDGPU::G_INSERT_VECTOR_ELT: {
1876 unsigned OutputBankID = isSALUMapping(MI) ?
1877 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
1878
1879 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1880 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
1881 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
1882 unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
1883 unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
1884
1885 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
1886 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
1887 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);
1888
1889 // The index can be in either bank if the source vector is VGPR.
1890 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
1891 break;
1892 }
1893 case AMDGPU::G_UNMERGE_VALUES: {
1894 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
1895 AMDGPU::VGPRRegBankID;
1896
1897 // Op1 and Dst should use the same register bank.
1898 // FIXME: Shouldn't this be the default? Why do we need to handle this?
1899 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1900 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
1901 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
1902 }
1903 break;
1904 }
1905 case AMDGPU::G_INTRINSIC: {
1906 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1907 default:
1908 return getInvalidInstructionMapping();
1909 case Intrinsic::maxnum:
1910 case Intrinsic::minnum:
1911 case Intrinsic::amdgcn_div_fmas:
1912 case Intrinsic::amdgcn_trig_preop:
1913 case Intrinsic::amdgcn_sin:
1914 case Intrinsic::amdgcn_cos:
1915 case Intrinsic::amdgcn_log_clamp:
1916 case Intrinsic::amdgcn_rcp:
1917 case Intrinsic::amdgcn_rcp_legacy:
1918 case Intrinsic::amdgcn_rsq:
1919 case Intrinsic::amdgcn_rsq_legacy:
1920 case Intrinsic::amdgcn_rsq_clamp:
1921 case Intrinsic::amdgcn_ldexp:
1922 case Intrinsic::amdgcn_frexp_mant:
1923 case Intrinsic::amdgcn_frexp_exp:
1924 case Intrinsic::amdgcn_fract:
1925 case Intrinsic::amdgcn_cvt_pkrtz:
1926 case Intrinsic::amdgcn_cvt_pknorm_i16:
1927 case Intrinsic::amdgcn_cvt_pknorm_u16:
1928 case Intrinsic::amdgcn_cvt_pk_i16:
1929 case Intrinsic::amdgcn_cvt_pk_u16:
1930 case Intrinsic::amdgcn_fmed3:
1931 case Intrinsic::amdgcn_cubeid:
1932 case Intrinsic::amdgcn_cubema:
1933 case Intrinsic::amdgcn_cubesc:
1934 case Intrinsic::amdgcn_cubetc:
1935 case Intrinsic::amdgcn_sffbh:
1936 case Intrinsic::amdgcn_fmad_ftz:
1937 case Intrinsic::amdgcn_mbcnt_lo:
1938 case Intrinsic::amdgcn_mbcnt_hi:
1939 case Intrinsic::amdgcn_ubfe:
1940 case Intrinsic::amdgcn_sbfe:
1941 case Intrinsic::amdgcn_lerp:
1942 case Intrinsic::amdgcn_sad_u8:
1943 case Intrinsic::amdgcn_msad_u8:
1944 case Intrinsic::amdgcn_sad_hi_u8:
1945 case Intrinsic::amdgcn_sad_u16:
1946 case Intrinsic::amdgcn_qsad_pk_u16_u8:
1947 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
1948 case Intrinsic::amdgcn_mqsad_u32_u8:
1949 case Intrinsic::amdgcn_cvt_pk_u8_f32:
1950 case Intrinsic::amdgcn_alignbit:
1951 case Intrinsic::amdgcn_alignbyte:
1952 case Intrinsic::amdgcn_fdot2:
1953 case Intrinsic::amdgcn_sdot2:
1954 case Intrinsic::amdgcn_udot2:
1955 case Intrinsic::amdgcn_sdot4:
1956 case Intrinsic::amdgcn_udot4:
1957 case Intrinsic::amdgcn_sdot8:
1958 case Intrinsic::amdgcn_udot8:
1959 case Intrinsic::amdgcn_fdiv_fast:
1960 case Intrinsic::amdgcn_wwm:
1961 case Intrinsic::amdgcn_wqm:
1962 return getDefaultMappingVOP(MI);
1963 case Intrinsic::amdgcn_ds_permute:
1964 case Intrinsic::amdgcn_ds_bpermute:
1965 case Intrinsic::amdgcn_update_dpp:
1966 return getDefaultMappingAllVGPR(MI);
1967 case Intrinsic::amdgcn_kernarg_segment_ptr:
1968 case Intrinsic::amdgcn_s_getpc:
1969 case Intrinsic::amdgcn_groupstaticsize: {
1970 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1971 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1972 break;
1973 }
1974 case Intrinsic::amdgcn_wqm_vote: {
1975 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1976 OpdsMapping[0] = OpdsMapping[2]
1977 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
1978 break;
1979 }
1980 case Intrinsic::amdgcn_s_buffer_load: {
1981 // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
1982 Register RSrc = MI.getOperand(2).getReg(); // SGPR
1983 Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
1984
1985 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1986 unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
1987 unsigned Size3 = MRI.getType(Offset).getSizeInBits();
1988
1989 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
1990 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
1991
1992 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
1993 OpdsMapping[1] = nullptr; // intrinsic id
1994
1995 // Lie and claim everything is legal, even though some need to be
1996 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
1997 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
1998 OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
1999 OpdsMapping[4] = nullptr;
2000 break;
2001 }
2002 case Intrinsic::amdgcn_div_scale: {
2003 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2004 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2005 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
2006 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
2007
2008 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2009 OpdsMapping[3] = AMDGPU::getValueMapping(
2010 getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
2011 OpdsMapping[4] = AMDGPU::getValueMapping(
2012 getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
2013
2014 break;
2015 }
2016 case Intrinsic::amdgcn_class: {
2017 Register Src0Reg = MI.getOperand(2).getReg();
2018 Register Src1Reg = MI.getOperand(3).getReg();
2019 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
2020 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
2021 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2022 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
2023 OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
2024 Src0Size);
2025 OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
2026 Src1Size);
2027 break;
2028 }
2029 case Intrinsic::amdgcn_icmp:
2030 case Intrinsic::amdgcn_fcmp: {
2031 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2032 // This is not VCCRegBank because this is not used in boolean contexts.
2033 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2034 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2035 unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2036 unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2037 OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
2038 OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
2039 break;
2040 }
2041 case Intrinsic::amdgcn_readlane: {
2042 // This must be an SGPR, but accept a VGPR.
2043 unsigned IdxReg = MI.getOperand(3).getReg();
2044 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2045 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2046 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2047 LLVM_FALLTHROUGH;
2048 }
2049 case Intrinsic::amdgcn_readfirstlane: {
2050 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2051 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2052 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2053 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2054 break;
2055 }
2056 case Intrinsic::amdgcn_writelane: {
2057 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2058 unsigned SrcReg = MI.getOperand(2).getReg();
2059 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
2060 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2061 unsigned IdxReg = MI.getOperand(3).getReg();
2062 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2063 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2064 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2065
2066 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
2067 // to legalize.
2068 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
2069 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2070 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2071 break;
2072 }
2073 case Intrinsic::amdgcn_if_break: {
2074 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2075 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2076 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2077 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2078 break;
2079 }
2080 }
2081 break;
2082 }
2083 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2084 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
2085 default:
2086 return getInvalidInstructionMapping();
2087 case Intrinsic::amdgcn_s_getreg:
2088 case Intrinsic::amdgcn_s_memtime:
2089 case Intrinsic::amdgcn_s_memrealtime:
2090 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
2091 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2092 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2093 break;
2094 }
2095 case Intrinsic::amdgcn_ds_append:
2096 case Intrinsic::amdgcn_ds_consume:
2097 case Intrinsic::amdgcn_ds_fadd:
2098 case Intrinsic::amdgcn_ds_fmin:
2099 case Intrinsic::amdgcn_ds_fmax:
2100 case Intrinsic::amdgcn_atomic_inc:
2101 case Intrinsic::amdgcn_atomic_dec:
2102 return getDefaultMappingAllVGPR(MI);
2103 case Intrinsic::amdgcn_ds_ordered_add:
2104 case Intrinsic::amdgcn_ds_ordered_swap: {
2105 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2106 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2107 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2108 AMDGPU::SGPRRegBankID);
2109 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
2110 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2111 break;
2112 }
2113 case Intrinsic::amdgcn_exp_compr:
2114 OpdsMapping[0] = nullptr; // IntrinsicID
2115 // FIXME: These are immediate values which can't be read from registers.
2116 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2117 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2118 // FIXME: Could we support packed types here?
2119 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2120 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2121 // FIXME: These are immediate values which can't be read from registers.
2122 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2123 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2124 break;
2125 case Intrinsic::amdgcn_exp:
2126 OpdsMapping[0] = nullptr; // IntrinsicID
2127 // FIXME: These are immediate values which can't be read from registers.
2128 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2129 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2130 // FIXME: Could we support packed types here?
2131 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2132 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2133 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2134 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2135 // FIXME: These are immediate values which can't be read from registers.
2136 OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2137 OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2138 break;
2139 case Intrinsic::amdgcn_buffer_load: {
2140 Register RSrc = MI.getOperand(2).getReg(); // SGPR
2141 Register VIndex = MI.getOperand(3).getReg(); // VGPR
2142 Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
2143
2144 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2145 unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2146 unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
2147 unsigned Size4 = MRI.getType(Offset).getSizeInBits();
2148
2149 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2150 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2151
2152 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
2153 OpdsMapping[1] = nullptr; // intrinsic id
2154
2155 // Lie and claim everything is legal, even though some need to be
2156 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2157 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2158 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
2159 OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
2160 OpdsMapping[5] = nullptr;
2161 OpdsMapping[6] = nullptr;
2162 break;
2163 }
2164 case Intrinsic::amdgcn_s_sendmsg:
2165 case Intrinsic::amdgcn_s_sendmsghalt: {
2166 // This must be an SGPR, but accept a VGPR.
2167 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2168 AMDGPU::SGPRRegBankID);
2169 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
2170 break;
2171 }
2172 case Intrinsic::amdgcn_end_cf: {
2173 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2174 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2175 break;
2176 }
2177 }
2178 break;
2179 }
2180 case AMDGPU::G_SELECT: {
2181 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2182 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2183 AMDGPU::SGPRRegBankID);
2184 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
2185 AMDGPU::SGPRRegBankID);
2186 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
2187 Op3Bank == AMDGPU::SGPRRegBankID;
2188
2189 unsigned CondBankDefault = SGPRSrcs ?
2190 AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2191 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2192 CondBankDefault);
2193 if (CondBank == AMDGPU::SGPRRegBankID)
2194 CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2195 else if (CondBank == AMDGPU::VGPRRegBankID)
2196 CondBank = AMDGPU::VCCRegBankID;
2197
2198 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
2199 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2200
2201 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
2202
2203 if (Size == 64) {
2204 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2205 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2206 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2207 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2208 } else {
2209 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
2210 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2211 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
2212 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
2213 }
2214
2215 break;
2216 }
2217
2218 case AMDGPU::G_LOAD:
2219 return getInstrMappingForLoad(MI);
2220
2221 case AMDGPU::G_ATOMICRMW_XCHG:
2222 case AMDGPU::G_ATOMICRMW_ADD:
2223 case AMDGPU::G_ATOMICRMW_SUB:
2224 case AMDGPU::G_ATOMICRMW_AND:
2225 case AMDGPU::G_ATOMICRMW_OR:
2226 case AMDGPU::G_ATOMICRMW_XOR:
2227 case AMDGPU::G_ATOMICRMW_MAX:
2228 case AMDGPU::G_ATOMICRMW_MIN:
2229 case AMDGPU::G_ATOMICRMW_UMAX:
2230 case AMDGPU::G_ATOMICRMW_UMIN:
2231 case AMDGPU::G_ATOMIC_CMPXCHG: {
2232 return getDefaultMappingAllVGPR(MI);
2233 }
2234 case AMDGPU::G_BRCOND: {
2235 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
2236 AMDGPU::SGPRRegBankID);
2237 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
2238 if (Bank != AMDGPU::SCCRegBankID)
2239 Bank = AMDGPU::VCCRegBankID;
2240
2241 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
2242 break;
2243 }
2244 }
2245
2246 return getInstructionMapping(/*ID*/1, /*Cost*/1,
2247 getOperandsMapping(OpdsMapping),
2248 MI.getNumOperands());
2249 }
2250
2251