1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13-bit constant offset, which is then promoted to the
25 // immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 // the constant into the data register is placed between the stores, although
47 // this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 // one pair, and recomputes live intervals and moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offsets, but whose differences are small enough to fit in 8 bits, we can
56 //   add to the base pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66
67 using namespace llvm;
68
69 #define DEBUG_TYPE "si-load-store-opt"
70
71 namespace {
72 enum InstClassEnum {
73 UNKNOWN,
74 DS_READ,
75 DS_WRITE,
76 S_BUFFER_LOAD_IMM,
77 BUFFER_LOAD,
78 BUFFER_STORE,
79 MIMG,
80 TBUFFER_LOAD,
81 TBUFFER_STORE,
82 };
83
84 struct AddressRegs {
85 unsigned char NumVAddrs = 0;
86 bool SBase = false;
87 bool SRsrc = false;
88 bool SOffset = false;
89 bool VAddr = false;
90 bool Addr = false;
91 bool SSamp = false;
92 };
93
94 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
95 const unsigned MaxAddressRegs = 12 + 1 + 1;
96
97 class SILoadStoreOptimizer : public MachineFunctionPass {
98 struct CombineInfo {
99 MachineBasicBlock::iterator I;
100 unsigned EltSize;
101 unsigned Offset;
102 unsigned Width;
103 unsigned Format;
104 unsigned BaseOff;
105 unsigned DMask;
106 InstClassEnum InstClass;
107 unsigned CPol = 0;
108 bool UseST64;
109 int AddrIdx[MaxAddressRegs];
110 const MachineOperand *AddrReg[MaxAddressRegs];
111 unsigned NumAddresses;
112 unsigned Order;
113
114     bool hasSameBaseAddress(const MachineInstr &MI) {
115 for (unsigned i = 0; i < NumAddresses; i++) {
116 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
117
118 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
119 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
120 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
121 return false;
122 }
123 continue;
124 }
125
126 // Check same base pointer. Be careful of subregisters, which can occur
127 // with vectors of pointers.
128 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
129 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
130 return false;
131 }
132 }
133 return true;
134 }
135
136     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
137 for (unsigned i = 0; i < NumAddresses; ++i) {
138 const MachineOperand *AddrOp = AddrReg[i];
139 // Immediates are always OK.
140 if (AddrOp->isImm())
141 continue;
142
143 // Don't try to merge addresses that aren't either immediates or registers.
144 // TODO: Should be possible to merge FrameIndexes and maybe some other
145         // non-register operands.
146 if (!AddrOp->isReg())
147 return false;
148
149         // TODO: We should be able to merge physical reg addresses.
150 if (AddrOp->getReg().isPhysical())
151 return false;
152
153         // If an address has only one use then there will be no other
154         // instructions with the same address, so we can't merge this one.
155 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
156 return false;
157 }
158 return true;
159 }
160
161 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
162 const GCNSubtarget &STM);
163 };
164
165 struct BaseRegisters {
166 Register LoReg;
167 Register HiReg;
168
169 unsigned LoSubReg = 0;
170 unsigned HiSubReg = 0;
171 };
172
173 struct MemAddress {
174 BaseRegisters Base;
175 int64_t Offset = 0;
176 };
177
178 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
179
180 private:
181 const GCNSubtarget *STM = nullptr;
182 const SIInstrInfo *TII = nullptr;
183 const SIRegisterInfo *TRI = nullptr;
184 MachineRegisterInfo *MRI = nullptr;
185 AliasAnalysis *AA = nullptr;
186 bool OptimizeAgain;
187
188 static bool dmasksCanBeCombined(const CombineInfo &CI,
189 const SIInstrInfo &TII,
190 const CombineInfo &Paired);
191 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
192 CombineInfo &Paired, bool Modify = false);
193 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
194 const CombineInfo &Paired);
195 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
196 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
197 const CombineInfo &Paired);
198 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
199 const CombineInfo &Paired);
200 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
201
202 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
203 SmallVectorImpl<MachineInstr *> &InstsToMove);
204
205 unsigned read2Opcode(unsigned EltSize) const;
206 unsigned read2ST64Opcode(unsigned EltSize) const;
207 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
208 CombineInfo &Paired,
209 const SmallVectorImpl<MachineInstr *> &InstsToMove);
210
211 unsigned write2Opcode(unsigned EltSize) const;
212 unsigned write2ST64Opcode(unsigned EltSize) const;
213 MachineBasicBlock::iterator
214 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
215 const SmallVectorImpl<MachineInstr *> &InstsToMove);
216 MachineBasicBlock::iterator
217 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
218 const SmallVectorImpl<MachineInstr *> &InstsToMove);
219 MachineBasicBlock::iterator
220 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
221 const SmallVectorImpl<MachineInstr *> &InstsToMove);
222 MachineBasicBlock::iterator
223 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
224 const SmallVectorImpl<MachineInstr *> &InstsToMove);
225 MachineBasicBlock::iterator
226 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
227 const SmallVectorImpl<MachineInstr *> &InstsToMove);
228 MachineBasicBlock::iterator
229 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
230 const SmallVectorImpl<MachineInstr *> &InstsToMove);
231 MachineBasicBlock::iterator
232 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
233 const SmallVectorImpl<MachineInstr *> &InstsToMove);
234
235 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
236 int32_t NewOffset) const;
237 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
238 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
239 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
240 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
241 /// Promotes constant offset to the immediate by adjusting the base. It
242 /// tries to use a base from the nearby instructions that allows it to have
243   /// a 13-bit constant offset which gets promoted to the immediate.
244 bool promoteConstantOffsetToImm(MachineInstr &CI,
245 MemInfoMap &Visited,
246 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
247 void addInstToMergeableList(const CombineInfo &CI,
248 std::list<std::list<CombineInfo> > &MergeableInsts) const;
249
250 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
251 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
252 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
253 std::list<std::list<CombineInfo>> &MergeableInsts) const;
254
255 public:
256 static char ID;
257
258   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
259 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
260 }
261
262 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
263 bool &OptimizeListAgain);
264 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
265
266 bool runOnMachineFunction(MachineFunction &MF) override;
267
268   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
269
270   void getAnalysisUsage(AnalysisUsage &AU) const override {
271 AU.setPreservesCFG();
272 AU.addRequired<AAResultsWrapperPass>();
273
274 MachineFunctionPass::getAnalysisUsage(AU);
275 }
276
277   MachineFunctionProperties getRequiredProperties() const override {
278 return MachineFunctionProperties()
279 .set(MachineFunctionProperties::Property::IsSSA);
280 }
281 };
282
283 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
284 const unsigned Opc = MI.getOpcode();
285
286 if (TII.isMUBUF(Opc)) {
287 // FIXME: Handle d16 correctly
288 return AMDGPU::getMUBUFElements(Opc);
289 }
290 if (TII.isMIMG(MI)) {
291 uint64_t DMaskImm =
292 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
293 return countPopulation(DMaskImm);
294 }
295 if (TII.isMTBUF(Opc)) {
296 return AMDGPU::getMTBUFElements(Opc);
297 }
298
299 switch (Opc) {
300 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
301 return 1;
302 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
303 return 2;
304 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
305 return 4;
306 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
307 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
308 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
309 case AMDGPU::DS_WRITE_B32_gfx9:
310 return 1;
311 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
312 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
313 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
314 case AMDGPU::DS_WRITE_B64_gfx9:
315 return 2;
316 default:
317 return 0;
318 }
319 }
320
321 /// Maps instruction opcode to enum InstClassEnum.
322 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
323 switch (Opc) {
324 default:
325 if (TII.isMUBUF(Opc)) {
326 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
327 default:
328 return UNKNOWN;
329 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
330 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
331 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
332 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
333 return BUFFER_LOAD;
334 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
335 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
336 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
337 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
338 return BUFFER_STORE;
339 }
340 }
341 if (TII.isMIMG(Opc)) {
342 // Ignore instructions encoded without vaddr.
343 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
344 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
345 return UNKNOWN;
346 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
347 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
348 TII.isGather4(Opc))
349 return UNKNOWN;
350 return MIMG;
351 }
352 if (TII.isMTBUF(Opc)) {
353 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
354 default:
355 return UNKNOWN;
356 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
357 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
358 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
359 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
360 return TBUFFER_LOAD;
361 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
362 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
363 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
364 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
365 return TBUFFER_STORE;
366 }
367 }
368 return UNKNOWN;
369 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
370 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
371 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
372 return S_BUFFER_LOAD_IMM;
373 case AMDGPU::DS_READ_B32:
374 case AMDGPU::DS_READ_B32_gfx9:
375 case AMDGPU::DS_READ_B64:
376 case AMDGPU::DS_READ_B64_gfx9:
377 return DS_READ;
378 case AMDGPU::DS_WRITE_B32:
379 case AMDGPU::DS_WRITE_B32_gfx9:
380 case AMDGPU::DS_WRITE_B64:
381 case AMDGPU::DS_WRITE_B64_gfx9:
382 return DS_WRITE;
383 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
384 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
385 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
386 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
387 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
388 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
389 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
390 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
391 return UNKNOWN;
392 }
393 }
394
395 /// Determines instruction subclass from opcode. Only instructions
396 /// of the same subclass can be merged together.
397 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
398 switch (Opc) {
399 default:
400 if (TII.isMUBUF(Opc))
401 return AMDGPU::getMUBUFBaseOpcode(Opc);
402 if (TII.isMIMG(Opc)) {
403 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
404 assert(Info);
405 return Info->BaseOpcode;
406 }
407 if (TII.isMTBUF(Opc))
408 return AMDGPU::getMTBUFBaseOpcode(Opc);
409 return -1;
410 case AMDGPU::DS_READ_B32:
411 case AMDGPU::DS_READ_B32_gfx9:
412 case AMDGPU::DS_READ_B64:
413 case AMDGPU::DS_READ_B64_gfx9:
414 case AMDGPU::DS_WRITE_B32:
415 case AMDGPU::DS_WRITE_B32_gfx9:
416 case AMDGPU::DS_WRITE_B64:
417 case AMDGPU::DS_WRITE_B64_gfx9:
418 return Opc;
419 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
420 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
421 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
422 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
423 }
424 }
425
426 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
427 AddressRegs Result;
428
429 if (TII.isMUBUF(Opc)) {
430 if (AMDGPU::getMUBUFHasVAddr(Opc))
431 Result.VAddr = true;
432 if (AMDGPU::getMUBUFHasSrsrc(Opc))
433 Result.SRsrc = true;
434 if (AMDGPU::getMUBUFHasSoffset(Opc))
435 Result.SOffset = true;
436
437 return Result;
438 }
439
440 if (TII.isMIMG(Opc)) {
441 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
442 if (VAddr0Idx >= 0) {
443 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
444 Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
445 } else {
446 Result.VAddr = true;
447 }
448 Result.SRsrc = true;
449 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
450 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
451 Result.SSamp = true;
452
453 return Result;
454 }
455 if (TII.isMTBUF(Opc)) {
456 if (AMDGPU::getMTBUFHasVAddr(Opc))
457 Result.VAddr = true;
458 if (AMDGPU::getMTBUFHasSrsrc(Opc))
459 Result.SRsrc = true;
460 if (AMDGPU::getMTBUFHasSoffset(Opc))
461 Result.SOffset = true;
462
463 return Result;
464 }
465
466 switch (Opc) {
467 default:
468 return Result;
469 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
470 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
471 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
472 Result.SBase = true;
473 return Result;
474 case AMDGPU::DS_READ_B32:
475 case AMDGPU::DS_READ_B64:
476 case AMDGPU::DS_READ_B32_gfx9:
477 case AMDGPU::DS_READ_B64_gfx9:
478 case AMDGPU::DS_WRITE_B32:
479 case AMDGPU::DS_WRITE_B64:
480 case AMDGPU::DS_WRITE_B32_gfx9:
481 case AMDGPU::DS_WRITE_B64_gfx9:
482 Result.Addr = true;
483 return Result;
484 }
485 }
486
487 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
488 const SIInstrInfo &TII,
489 const GCNSubtarget &STM) {
490 I = MI;
491 unsigned Opc = MI->getOpcode();
492 InstClass = getInstClass(Opc, TII);
493
494 if (InstClass == UNKNOWN)
495 return;
496
497 switch (InstClass) {
498 case DS_READ:
499 EltSize =
500 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
501 : 4;
502 break;
503 case DS_WRITE:
504 EltSize =
505 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
506 : 4;
507 break;
508 case S_BUFFER_LOAD_IMM:
509 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
510 break;
511 default:
512 EltSize = 4;
513 break;
514 }
515
516 if (InstClass == MIMG) {
517 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
518 // Offset is not considered for MIMG instructions.
519 Offset = 0;
520 } else {
521 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
522 Offset = I->getOperand(OffsetIdx).getImm();
523 }
524
525 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
526 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
527
528 Width = getOpcodeWidth(*I, TII);
529
530 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
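    // The DS offset operand is an unsigned 16-bit immediate, so drop any bits
    // above that range.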
531 Offset &= 0xffff;
532 } else if (InstClass != MIMG) {
533 CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
534 }
535
536 AddressRegs Regs = getRegs(Opc, TII);
537
538 NumAddresses = 0;
539 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
540 AddrIdx[NumAddresses++] =
541 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
542 if (Regs.Addr)
543 AddrIdx[NumAddresses++] =
544 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
545 if (Regs.SBase)
546 AddrIdx[NumAddresses++] =
547 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
548 if (Regs.SRsrc)
549 AddrIdx[NumAddresses++] =
550 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
551 if (Regs.SOffset)
552 AddrIdx[NumAddresses++] =
553 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
554 if (Regs.VAddr)
555 AddrIdx[NumAddresses++] =
556 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
557 if (Regs.SSamp)
558 AddrIdx[NumAddresses++] =
559 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
560 assert(NumAddresses <= MaxAddressRegs);
561
562 for (unsigned J = 0; J < NumAddresses; J++)
563 AddrReg[J] = &I->getOperand(AddrIdx[J]);
564 }
565
566 } // end anonymous namespace.
567
568 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
569 "SI Load Store Optimizer", false, false)
570 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
571 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
572 false, false)
573
574 char SILoadStoreOptimizer::ID = 0;
575
576 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
577
578 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
579 return new SILoadStoreOptimizer();
580 }
581
582 static void moveInstsAfter(MachineBasicBlock::iterator I,
583 ArrayRef<MachineInstr *> InstsToMove) {
584 MachineBasicBlock *MBB = I->getParent();
585 ++I;
586 for (MachineInstr *MI : InstsToMove) {
587 MI->removeFromParent();
588 MBB->insert(I, MI);
589 }
590 }
591
592 static void addDefsUsesToList(const MachineInstr &MI,
593 DenseSet<Register> &RegDefs,
594 DenseSet<Register> &PhysRegUses) {
595 for (const MachineOperand &Op : MI.operands()) {
596 if (Op.isReg()) {
597 if (Op.isDef())
598 RegDefs.insert(Op.getReg());
599 else if (Op.readsReg() && Op.getReg().isPhysical())
600 PhysRegUses.insert(Op.getReg());
601 }
602 }
603 }
604
605 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
606 MachineBasicBlock::iterator B,
607 AliasAnalysis *AA) {
608 // RAW or WAR - cannot reorder
609 // WAW - cannot reorder
610 // RAR - safe to reorder
611 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
612 }
613
614 // Add MI and its defs to the lists if MI reads one of the defs that are
615 // already in the list. Returns true in that case.
616 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
617 DenseSet<Register> &PhysRegUses,
618 SmallVectorImpl<MachineInstr *> &Insts) {
619 for (MachineOperand &Use : MI.operands()) {
620 // If one of the defs is read, then there is a use of Def between I and the
621 // instruction that I will potentially be merged with. We will need to move
622 // this instruction after the merged instructions.
623 //
624 // Similarly, if there is a def which is read by an instruction that is to
625 // be moved for merging, then we need to move the def-instruction as well.
626 // This can only happen for physical registers such as M0; virtual
627 // registers are in SSA form.
628 if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
629 (Use.isDef() && RegDefs.count(Use.getReg())) ||
630 (Use.isDef() && Use.getReg().isPhysical() &&
631 PhysRegUses.count(Use.getReg())))) {
632 Insts.push_back(&MI);
633 addDefsUsesToList(MI, RegDefs, PhysRegUses);
634 return true;
635 }
636 }
637
638 return false;
639 }
640
641 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
642 ArrayRef<MachineInstr *> InstsToMove,
643 AliasAnalysis *AA) {
644 assert(MemOp.mayLoadOrStore());
645
646 for (MachineInstr *InstToMove : InstsToMove) {
647 if (!InstToMove->mayLoadOrStore())
648 continue;
649 if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
650 return false;
651 }
652 return true;
653 }
654
655 // This function assumes that \p A and \p B are identical except for
656 // size and offset, and that they reference adjacent memory.
657 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
658 const MachineMemOperand *A,
659 const MachineMemOperand *B) {
660 unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
661 unsigned Size = A->getSize() + B->getSize();
662 // This function adds the offset parameter to the existing offset for A,
663 // so we pass 0 here as the offset and then manually set it to the correct
664 // value after the call.
665 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
666 MMO->setOffset(MinOffset);
667 return MMO;
668 }
669
670 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
671 const SIInstrInfo &TII,
672 const CombineInfo &Paired) {
673 assert(CI.InstClass == MIMG);
674
675 // Ignore instructions with tfe/lwe set.
676 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
677 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
678
679 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
680 return false;
681
682 // Check other optional immediate operands for equality.
683 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
684 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
685 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
686
687 for (auto op : OperandsToMatch) {
688 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
689 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
690 return false;
691 if (Idx != -1 &&
692 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
693 return false;
694 }
695
696 // Check DMask for overlaps.
697 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
698 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
699
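  // The two dmasks are only mergeable when every bit of the smaller mask lies
  // strictly below the lowest set bit of the larger mask, i.e. the masks do
  // not interleave. For example, MaxMask = 0b1100 with MinMask = 0b0011 is
  // mergeable, while MaxMask = 0b0110 with MinMask = 0b0011 is not.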
700 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
701 if ((1u << AllowedBitsForMin) <= MinMask)
702 return false;
703
704 return true;
705 }
706
707 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
708 unsigned ComponentCount,
709 const GCNSubtarget &STI) {
710 if (ComponentCount > 4)
711 return 0;
712
713 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
714 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
715 if (!OldFormatInfo)
716 return 0;
717
718 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
719 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
720 ComponentCount,
721 OldFormatInfo->NumFormat, STI);
722
723 if (!NewFormatInfo)
724 return 0;
725
726 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
727 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
728
729 return NewFormatInfo->Format;
730 }
731
732 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
733 // highest power of two. Note that the result is well defined for all inputs
734 // including corner cases like:
735 // - if Lo == Hi, return that value
736 //  - if Lo == 0, return 0 (even though the "- 1" below underflows)
737 // - if Lo > Hi, return 0 (as if the range wrapped around)
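// For example, mostAlignedValueInRange(3, 9) == 8 (aligned to 8), and
// mostAlignedValueInRange(5, 7) == 6 (aligned to 2).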
738 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
739 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
740 }
741
742 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
743 const GCNSubtarget &STI,
744 CombineInfo &Paired,
745 bool Modify) {
746 assert(CI.InstClass != MIMG);
747
748 // XXX - Would the same offset be OK? Is there any reason this would happen or
749 // be useful?
750 if (CI.Offset == Paired.Offset)
751 return false;
752
753 // This won't be valid if the offset isn't aligned.
754 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
755 return false;
756
757 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
758
759 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
760 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
761 if (!Info0)
762 return false;
763 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
764 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
765 if (!Info1)
766 return false;
767
768 if (Info0->BitsPerComp != Info1->BitsPerComp ||
769 Info0->NumFormat != Info1->NumFormat)
770 return false;
771
772 // TODO: Should be possible to support more formats, but if format loads
773 // are not dword-aligned, the merged load might not be valid.
774 if (Info0->BitsPerComp != 32)
775 return false;
776
777 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
778 return false;
779 }
780
781 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
782 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
783 CI.UseST64 = false;
784 CI.BaseOff = 0;
785
786 // Handle all non-DS instructions.
787 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
788 return (EltOffset0 + CI.Width == EltOffset1 ||
789 EltOffset1 + Paired.Width == EltOffset0) &&
790 CI.CPol == Paired.CPol &&
791 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
792 }
793
794   // If the offset in elements doesn't fit in 8 bits, we might be able to use
795   // the stride 64 versions.
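  // For example, element offsets 64 and 8192 do not both fit in 8 bits, but
  // both are multiples of 64 and encode as ST64 offsets 1 and 128.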
796 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
797 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
798 if (Modify) {
799 CI.Offset = EltOffset0 / 64;
800 Paired.Offset = EltOffset1 / 64;
801 CI.UseST64 = true;
802 }
803 return true;
804 }
805
806 // Check if the new offsets fit in the reduced 8-bit range.
807 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
808 if (Modify) {
809 CI.Offset = EltOffset0;
810 Paired.Offset = EltOffset1;
811 }
812 return true;
813 }
814
815 // Try to shift base address to decrease offsets.
816 uint32_t Min = std::min(EltOffset0, EltOffset1);
817 uint32_t Max = std::max(EltOffset0, EltOffset1);
818
819 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
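  // The offsets can share a base iff (Max - Min) is a multiple of 64 that is
  // at most 255 * 64; both adjusted offsets then encode as 8-bit multiples
  // of 64.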
820 if (((Max - Min) & ~Mask) == 0) {
821 if (Modify) {
822 // From the range of values we could use for BaseOff, choose the one that
823 // is aligned to the highest power of two, to maximise the chance that
824 // the same offset can be reused for other load/store pairs.
825 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
826 // Copy the low bits of the offsets, so that when we adjust them by
827 // subtracting BaseOff they will be multiples of 64.
828 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
829 CI.BaseOff = BaseOff * CI.EltSize;
830 CI.Offset = (EltOffset0 - BaseOff) / 64;
831 Paired.Offset = (EltOffset1 - BaseOff) / 64;
832 CI.UseST64 = true;
833 }
834 return true;
835 }
836
837 if (isUInt<8>(Max - Min)) {
838 if (Modify) {
839 // From the range of values we could use for BaseOff, choose the one that
840 // is aligned to the highest power of two, to maximise the chance that
841 // the same offset can be reused for other load/store pairs.
842 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
843 CI.BaseOff = BaseOff * CI.EltSize;
844 CI.Offset = EltOffset0 - BaseOff;
845 Paired.Offset = EltOffset1 - BaseOff;
846 }
847 return true;
848 }
849
850 return false;
851 }
852
853 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
854 const CombineInfo &CI,
855 const CombineInfo &Paired) {
856 const unsigned Width = (CI.Width + Paired.Width);
857 switch (CI.InstClass) {
858 default:
859 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
860 case S_BUFFER_LOAD_IMM:
861 switch (Width) {
862 default:
863 return false;
864 case 2:
865 case 4:
866 return true;
867 }
868 }
869 }
870
871 const TargetRegisterClass *
872 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
873 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
874 return TRI->getRegClassForReg(*MRI, Dst->getReg());
875 }
876 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
877 return TRI->getRegClassForReg(*MRI, Src->getReg());
878 }
879 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
880 return TRI->getRegClassForReg(*MRI, Src->getReg());
881 }
882 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
883 return TRI->getRegClassForReg(*MRI, Dst->getReg());
884 }
885 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
886 return TRI->getRegClassForReg(*MRI, Src->getReg());
887 }
888 return nullptr;
889 }
890
891 /// This function assumes that CI comes before Paired in a basic block.
892 bool SILoadStoreOptimizer::checkAndPrepareMerge(
893 CombineInfo &CI, CombineInfo &Paired,
894 SmallVectorImpl<MachineInstr *> &InstsToMove) {
895
896 // Check both offsets (or masks for MIMG) can be combined and fit in the
897 // reduced range.
898 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
899 return false;
900
901 if (CI.InstClass != MIMG &&
902 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
903 return false;
904
905 const unsigned Opc = CI.I->getOpcode();
906 const InstClassEnum InstClass = getInstClass(Opc, *TII);
907
908 if (InstClass == UNKNOWN) {
909 return false;
910 }
911 const unsigned InstSubclass = getInstSubclass(Opc, *TII);
912
913 // Do not merge VMEM buffer instructions with "swizzled" bit set.
914 int Swizzled =
915 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
916 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
917 return false;
918
919 DenseSet<Register> RegDefsToMove;
920 DenseSet<Register> PhysRegUsesToMove;
921 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
922
923 const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
924 bool IsAGPR = TRI->hasAGPRs(DataRC);
925
926 MachineBasicBlock::iterator E = std::next(Paired.I);
927 MachineBasicBlock::iterator MBBI = std::next(CI.I);
928 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
929 for (; MBBI != E; ++MBBI) {
930
931 if (MBBI == MBBE) {
932       // CombineInfo::Order is a hint on the instruction ordering within the
933       // basic block. This hint suggests that CI precedes Paired, which is
934       // true most of the time. However, moveInstsAfter() processing a
935       // previous list may have changed this order when it moved an
936       // instruction that also appears in some other merge list. In that
937       // case the instruction must be dependent, so give up on this pair.
938 return false;
939 }
940
941 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
942 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
943       // This is not a matching instruction, but we can keep looking as
944       // long as one of these conditions is met:
945 // 1. It is safe to move I down past MBBI.
946 // 2. It is safe to move MBBI down past the instruction that I will
947 // be merged into.
948
949 if (MBBI->hasUnmodeledSideEffects()) {
950 // We can't re-order this instruction with respect to other memory
951 // operations, so we fail both conditions mentioned above.
952 return false;
953 }
954
955 if (MBBI->mayLoadOrStore() &&
956 (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
957 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
958 // We fail condition #1, but we may still be able to satisfy condition
959 // #2. Add this instruction to the move list and then we will check
960 // if condition #2 holds once we have selected the matching instruction.
961 InstsToMove.push_back(&*MBBI);
962 addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
963 continue;
964 }
965
966       // When we match I with another DS instruction we will be moving I down
967       // to the location of the matched instruction, so any uses of I will
968       // need to be moved down as well.
969 addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
970 InstsToMove);
971 continue;
972 }
973
974 // Don't merge volatiles.
975 if (MBBI->hasOrderedMemoryRef())
976 return false;
977
978 int Swizzled =
979 AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
980 if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
981 return false;
982
983 // Handle a case like
984 // DS_WRITE_B32 addr, v, idx0
985 // w = DS_READ_B32 addr, idx0
986 // DS_WRITE_B32 addr, f(w), idx1
987 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
988 // merging of the two writes.
989 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
990 InstsToMove))
991 continue;
992
993 if (&*MBBI == &*Paired.I) {
994 if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
995 return false;
996       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
997       //        operands. However we are reporting that ds_write2 shall have
998       //        only VGPR data so that machine copy propagation does not
999       //        create an illegal instruction with VGPR and AGPR sources.
1000       //        Consequently, if we create such an instruction the verifier
1001       //        will complain.
1002 if (IsAGPR && CI.InstClass == DS_WRITE)
1003 return false;
1004
1005 // We need to go through the list of instructions that we plan to
1006 // move and make sure they are all safe to move down past the merged
1007 // instruction.
1008 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
1009
1010 // Call offsetsCanBeCombined with modify = true so that the offsets are
1011 // correct for the new instruction. This should return true, because
1012 // this function should only be called on CombineInfo objects that
1013 // have already been confirmed to be mergeable.
1014 if (CI.InstClass != MIMG)
1015 offsetsCanBeCombined(CI, *STM, Paired, true);
1016 return true;
1017 }
1018 return false;
1019 }
1020
1021     // We've found a load/store that we couldn't merge for some reason.
1022     // We could potentially keep looking, but we'd need to make sure that
1023     // it was safe to move I and also all the instructions in InstsToMove
1024     // down past this instruction.
1025     // Check if we can move I across MBBI and if we can move all of I's users.
1026 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
1027 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
1028 break;
1029 }
1030 return false;
1031 }
1032
1033 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1034 if (STM->ldsRequiresM0Init())
1035 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1036 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1037 }
1038
1039 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1040 if (STM->ldsRequiresM0Init())
1041 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1042
1043 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1044 : AMDGPU::DS_READ2ST64_B64_gfx9;
1045 }
1046
1047 MachineBasicBlock::iterator
1048 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1049 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1050 MachineBasicBlock *MBB = CI.I->getParent();
1051
1052 // Be careful, since the addresses could be subregisters themselves in weird
1053 // cases, like vectors of pointers.
1054 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1055
1056 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1057 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1058
1059 unsigned NewOffset0 = CI.Offset;
1060 unsigned NewOffset1 = Paired.Offset;
1061 unsigned Opc =
1062 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1063
1064 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1065 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1066
1067 if (NewOffset0 > NewOffset1) {
1068 // Canonicalize the merged instruction so the smaller offset comes first.
1069 std::swap(NewOffset0, NewOffset1);
1070 std::swap(SubRegIdx0, SubRegIdx1);
1071 }
1072
1073 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1074 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1075
1076 const MCInstrDesc &Read2Desc = TII->get(Opc);
1077
1078 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1079 Register DestReg = MRI->createVirtualRegister(SuperRC);
1080
1081 DebugLoc DL = CI.I->getDebugLoc();
1082
1083 Register BaseReg = AddrReg->getReg();
1084 unsigned BaseSubReg = AddrReg->getSubReg();
1085 unsigned BaseRegFlags = 0;
1086 if (CI.BaseOff) {
1087 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1088 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1089 .addImm(CI.BaseOff);
1090
1091 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1092 BaseRegFlags = RegState::Kill;
1093
1094 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1095 .addReg(ImmReg)
1096 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1097 .addImm(0); // clamp bit
1098 BaseSubReg = 0;
1099 }
1100
1101 MachineInstrBuilder Read2 =
1102 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
1103 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1104 .addImm(NewOffset0) // offset0
1105 .addImm(NewOffset1) // offset1
1106 .addImm(0) // gds
1107 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1108
1109 (void)Read2;
1110
1111 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1112
1113 // Copy to the old destination registers.
1114 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1115 .add(*Dest0) // Copy to same destination including flags and sub reg.
1116 .addReg(DestReg, 0, SubRegIdx0);
1117 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1118 .add(*Dest1)
1119 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1120
1121 moveInstsAfter(Copy1, InstsToMove);
1122
1123 CI.I->eraseFromParent();
1124 Paired.I->eraseFromParent();
1125
1126 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1127 return Read2;
1128 }
1129
1130 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1131 if (STM->ldsRequiresM0Init())
1132 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1133 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1134 : AMDGPU::DS_WRITE2_B64_gfx9;
1135 }
1136
1137 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1138 if (STM->ldsRequiresM0Init())
1139 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1140 : AMDGPU::DS_WRITE2ST64_B64;
1141
1142 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1143 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1144 }
1145
1146 MachineBasicBlock::iterator
1147 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
1148 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1149 MachineBasicBlock *MBB = CI.I->getParent();
1150
1151 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1152 // sure we preserve the subregister index and any register flags set on them.
1153 const MachineOperand *AddrReg =
1154 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1155 const MachineOperand *Data0 =
1156 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1157 const MachineOperand *Data1 =
1158 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1159
1160 unsigned NewOffset0 = CI.Offset;
1161 unsigned NewOffset1 = Paired.Offset;
1162 unsigned Opc =
1163 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1164
1165 if (NewOffset0 > NewOffset1) {
1166 // Canonicalize the merged instruction so the smaller offset comes first.
1167 std::swap(NewOffset0, NewOffset1);
1168 std::swap(Data0, Data1);
1169 }
1170
1171 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1172 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1173
1174 const MCInstrDesc &Write2Desc = TII->get(Opc);
1175 DebugLoc DL = CI.I->getDebugLoc();
1176
1177 Register BaseReg = AddrReg->getReg();
1178 unsigned BaseSubReg = AddrReg->getSubReg();
1179 unsigned BaseRegFlags = 0;
1180 if (CI.BaseOff) {
1181 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1182 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1183 .addImm(CI.BaseOff);
1184
1185 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1186 BaseRegFlags = RegState::Kill;
1187
1188 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1189 .addReg(ImmReg)
1190 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1191 .addImm(0); // clamp bit
1192 BaseSubReg = 0;
1193 }
1194
1195 MachineInstrBuilder Write2 =
1196 BuildMI(*MBB, Paired.I, DL, Write2Desc)
1197 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1198 .add(*Data0) // data0
1199 .add(*Data1) // data1
1200 .addImm(NewOffset0) // offset0
1201 .addImm(NewOffset1) // offset1
1202 .addImm(0) // gds
1203 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1204
1205 moveInstsAfter(Write2, InstsToMove);
1206
1207 CI.I->eraseFromParent();
1208 Paired.I->eraseFromParent();
1209
1210 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1211 return Write2;
1212 }
1213
1214 MachineBasicBlock::iterator
1215 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1216 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1217 MachineBasicBlock *MBB = CI.I->getParent();
1218 DebugLoc DL = CI.I->getDebugLoc();
1219 const unsigned Opcode = getNewOpcode(CI, Paired);
1220
1221 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1222
1223 Register DestReg = MRI->createVirtualRegister(SuperRC);
1224 unsigned MergedDMask = CI.DMask | Paired.DMask;
1225 unsigned DMaskIdx =
1226 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1227
1228 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1229 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1230 if (I == DMaskIdx)
1231 MIB.addImm(MergedDMask);
1232 else
1233 MIB.add((*CI.I).getOperand(I));
1234 }
1235
1236 // It shouldn't be possible to get this far if the two instructions
1237 // don't have a single memoperand, because MachineInstr::mayAlias()
1238 // will return true if this is the case.
1239 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1240
1241 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1242 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1243
1244 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1245
1246 unsigned SubRegIdx0, SubRegIdx1;
1247 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1248
1249 // Copy to the old destination registers.
1250 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1251 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1252 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1253
1254 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1255 .add(*Dest0) // Copy to same destination including flags and sub reg.
1256 .addReg(DestReg, 0, SubRegIdx0);
1257 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1258 .add(*Dest1)
1259 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1260
1261 moveInstsAfter(Copy1, InstsToMove);
1262
1263 CI.I->eraseFromParent();
1264 Paired.I->eraseFromParent();
1265 return New;
1266 }
1267
1268 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1269 CombineInfo &CI, CombineInfo &Paired,
1270 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1271 MachineBasicBlock *MBB = CI.I->getParent();
1272 DebugLoc DL = CI.I->getDebugLoc();
1273 const unsigned Opcode = getNewOpcode(CI, Paired);
1274
1275 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1276
1277 Register DestReg = MRI->createVirtualRegister(SuperRC);
1278 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1279
1280 // It shouldn't be possible to get this far if the two instructions
1281 // don't have a single memoperand, because MachineInstr::mayAlias()
1282 // will return true if this is the case.
1283 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1284
1285 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1286 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1287
1288 MachineInstr *New =
1289 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
1290 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1291 .addImm(MergedOffset) // offset
1292 .addImm(CI.CPol) // cpol
1293 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1294
1295 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1296 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1297 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1298
1299 // Copy to the old destination registers.
1300 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1301 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1302 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1303
1304 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1305 .add(*Dest0) // Copy to same destination including flags and sub reg.
1306 .addReg(DestReg, 0, SubRegIdx0);
1307 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1308 .add(*Dest1)
1309 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1310
1311 moveInstsAfter(Copy1, InstsToMove);
1312
1313 CI.I->eraseFromParent();
1314 Paired.I->eraseFromParent();
1315 return New;
1316 }
1317
1318 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1319 CombineInfo &CI, CombineInfo &Paired,
1320 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1321 MachineBasicBlock *MBB = CI.I->getParent();
1322 DebugLoc DL = CI.I->getDebugLoc();
1323
1324 const unsigned Opcode = getNewOpcode(CI, Paired);
1325
1326 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1327
1328   // Create the wider destination register for the merged load.
1329 Register DestReg = MRI->createVirtualRegister(SuperRC);
1330 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1331
1332 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1333
1334 AddressRegs Regs = getRegs(Opcode, *TII);
1335
1336 if (Regs.VAddr)
1337 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1338
1339 // It shouldn't be possible to get this far if the two instructions
1340 // don't have a single memoperand, because MachineInstr::mayAlias()
1341 // will return true if this is the case.
1342 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1343
1344 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1345 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1346
1347 MachineInstr *New =
1348 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1349 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1350 .addImm(MergedOffset) // offset
1351 .addImm(CI.CPol) // cpol
1352 .addImm(0) // tfe
1353 .addImm(0) // swz
1354 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1355
1356 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1357 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1358 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1359
1360 // Copy to the old destination registers.
1361 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1362 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1363 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1364
1365 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1366 .add(*Dest0) // Copy to same destination including flags and sub reg.
1367 .addReg(DestReg, 0, SubRegIdx0);
1368 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1369 .add(*Dest1)
1370 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1371
1372 moveInstsAfter(Copy1, InstsToMove);
1373
1374 CI.I->eraseFromParent();
1375 Paired.I->eraseFromParent();
1376 return New;
1377 }
1378
1379 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1380 CombineInfo &CI, CombineInfo &Paired,
1381 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1382 MachineBasicBlock *MBB = CI.I->getParent();
1383 DebugLoc DL = CI.I->getDebugLoc();
1384
1385 const unsigned Opcode = getNewOpcode(CI, Paired);
1386
1387 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1388
1389   // Create the wider destination register for the merged load.
1390 Register DestReg = MRI->createVirtualRegister(SuperRC);
1391 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1392
1393 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1394
1395 AddressRegs Regs = getRegs(Opcode, *TII);
1396
1397 if (Regs.VAddr)
1398 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1399
1400 unsigned JoinedFormat =
1401 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1402
1403 // It shouldn't be possible to get this far if the two instructions
1404 // don't have a single memoperand, because MachineInstr::mayAlias()
1405 // will return true if this is the case.
1406 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1407
1408 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1409 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1410
1411 MachineInstr *New =
1412 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1413 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1414 .addImm(MergedOffset) // offset
1415 .addImm(JoinedFormat) // format
1416 .addImm(CI.CPol) // cpol
1417 .addImm(0) // tfe
1418 .addImm(0) // swz
1419 .addMemOperand(
1420 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1421
1422 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1423 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1424 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1425
1426 // Copy to the old destination registers.
1427 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1428 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1429 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1430
1431 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1432 .add(*Dest0) // Copy to same destination including flags and sub reg.
1433 .addReg(DestReg, 0, SubRegIdx0);
1434 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1435 .add(*Dest1)
1436 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1437
1438 moveInstsAfter(Copy1, InstsToMove);
1439
1440 CI.I->eraseFromParent();
1441 Paired.I->eraseFromParent();
1442 return New;
1443 }
1444
1445 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1446 CombineInfo &CI, CombineInfo &Paired,
1447 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1448 MachineBasicBlock *MBB = CI.I->getParent();
1449 DebugLoc DL = CI.I->getDebugLoc();
1450
1451 const unsigned Opcode = getNewOpcode(CI, Paired);
1452
1453 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1454 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1455 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1456
1457 // Copy to the new source register.
1458 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1459 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1460
1461 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1462 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1463
1464 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1465 .add(*Src0)
1466 .addImm(SubRegIdx0)
1467 .add(*Src1)
1468 .addImm(SubRegIdx1);
1469
1470 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1471 .addReg(SrcReg, RegState::Kill);
1472
1473 AddressRegs Regs = getRegs(Opcode, *TII);
1474
1475 if (Regs.VAddr)
1476 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1477
1478 unsigned JoinedFormat =
1479 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1480
1481 // It shouldn't be possible to get this far if the two instructions
1482 // don't have a single memoperand, because MachineInstr::mayAlias()
1483 // will return true if this is the case.
1484 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1485
1486 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1487 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1488
1489 MachineInstr *New =
1490 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1491 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1492 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1493 .addImm(JoinedFormat) // format
1494 .addImm(CI.CPol) // cpol
1495 .addImm(0) // tfe
1496 .addImm(0) // swz
1497 .addMemOperand(
1498 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1499
1500 moveInstsAfter(MIB, InstsToMove);
1501
1502 CI.I->eraseFromParent();
1503 Paired.I->eraseFromParent();
1504 return New;
1505 }
1506
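// Maps a pair of mergeable instructions to the opcode of the wider combined
// instruction. For example (schematic): merging two dword buffer loads
// (Width 1 + 1) selects the dwordx2 variant of the same base opcode, and
// merging two dwordx2 accesses (2 + 2) selects the dwordx4 variant.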
1507 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1508 const CombineInfo &Paired) {
1509 const unsigned Width = CI.Width + Paired.Width;
1510
1511 switch (CI.InstClass) {
1512 default:
1513 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1514 // FIXME: Handle d16 correctly
1515 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1516 Width);
1517 case TBUFFER_LOAD:
1518 case TBUFFER_STORE:
1519 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1520 Width);
1521
1522 case UNKNOWN:
1523 llvm_unreachable("Unknown instruction class");
1524 case S_BUFFER_LOAD_IMM:
1525 switch (Width) {
1526 default:
1527 return 0;
1528 case 2:
1529 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1530 case 4:
1531 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1532 }
1533 case MIMG:
1534 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
1535 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1536 }
1537 }
1538
1539 std::pair<unsigned, unsigned>
1540 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
1541
1542 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
1543 return std::make_pair(0, 0);
1544
1545 bool ReverseOrder;
1546 if (CI.InstClass == MIMG) {
1547 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1548 "No overlaps");
1549 ReverseOrder = CI.DMask > Paired.DMask;
1550 } else
1551 ReverseOrder = CI.Offset > Paired.Offset;
1552
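// Idxs[Row][Col] is the subregister index covering Col+1 consecutive dwords
// starting at dword Row of the merged register. Worked example (illustrative):
// with CI.Width == 2, Paired.Width == 1 and no reordering, CI gets sub0_sub1
// and Paired gets sub2.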
1553 static const unsigned Idxs[4][4] = {
1554 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1555 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1556 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1557 {AMDGPU::sub3, 0, 0, 0},
1558 };
1559 unsigned Idx0;
1560 unsigned Idx1;
1561
1562 assert(CI.Width >= 1 && CI.Width <= 3);
1563 assert(Paired.Width >= 1 && Paired.Width <= 3);
1564
1565 if (ReverseOrder) {
1566 Idx1 = Idxs[0][Paired.Width - 1];
1567 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1568 } else {
1569 Idx0 = Idxs[0][CI.Width - 1];
1570 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1571 }
1572
1573 return std::make_pair(Idx0, Idx1);
1574 }
1575
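// Register class selection for the merged access, schematically: a combined
// width of N dwords needs an N*32-bit class -- an SGPR class for
// S_BUFFER_LOAD merges, otherwise a VGPR or AGPR class matching the class of
// the original data register.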
1576 const TargetRegisterClass *
1577 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1578 const CombineInfo &Paired) {
1579 if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1580 switch (CI.Width + Paired.Width) {
1581 default:
1582 return nullptr;
1583 case 2:
1584 return &AMDGPU::SReg_64_XEXECRegClass;
1585 case 4:
1586 return &AMDGPU::SGPR_128RegClass;
1587 case 8:
1588 return &AMDGPU::SGPR_256RegClass;
1589 case 16:
1590 return &AMDGPU::SGPR_512RegClass;
1591 }
1592 }
1593
1594 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1595 return TRI->hasAGPRs(getDataRegClass(*CI.I))
1596 ? TRI->getAGPRClassForBitWidth(BitWidth)
1597 : TRI->getVGPRClassForBitWidth(BitWidth);
1598 }
1599
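// Illustrative sketch of the rewrite performed below, assuming two adjacent
// dword buffer stores (register numbers are made up):
//   buffer_store_dword v0, off, s[0:3], 0 offset:4
//   buffer_store_dword v1, off, s[0:3], 0 offset:8
//   ==>
//   buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:4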
1600 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1601 CombineInfo &CI, CombineInfo &Paired,
1602 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1603 MachineBasicBlock *MBB = CI.I->getParent();
1604 DebugLoc DL = CI.I->getDebugLoc();
1605
1606 const unsigned Opcode = getNewOpcode(CI, Paired);
1607
1608 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1609 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1610 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1611
1612 // Copy to the new source register.
1613 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1614 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1615
1616 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1617 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1618
1619 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1620 .add(*Src0)
1621 .addImm(SubRegIdx0)
1622 .add(*Src1)
1623 .addImm(SubRegIdx1);
1624
1625 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1626 .addReg(SrcReg, RegState::Kill);
1627
1628 AddressRegs Regs = getRegs(Opcode, *TII);
1629
1630 if (Regs.VAddr)
1631 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1632
1633
1634 // It shouldn't be possible to get this far if the two instructions
1635 // don't have a single memoperand, because MachineInstr::mayAlias()
1636 // will return true if this is the case.
1637 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1638
1639 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1640 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1641
1642 MachineInstr *New =
1643 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1644 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1645 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1646 .addImm(CI.CPol) // cpol
1647 .addImm(0) // tfe
1648 .addImm(0) // swz
1649 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1650
1651 moveInstsAfter(MIB, InstsToMove);
1652
1653 CI.I->eraseFromParent();
1654 Paired.I->eraseFromParent();
1655 return New;
1656 }
1657
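// For example (illustrative): a value such as 8 is encodable as an inline
// constant and is returned as an immediate operand, while 8000 is not and is
// first materialized into a fresh SGPR with S_MOV_B32.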
1658 MachineOperand
1659 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1660 APInt V(32, Val, true);
1661 if (TII->isInlineConstant(V))
1662 return MachineOperand::CreateImm(Val);
1663
1664 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1665 MachineInstr *Mov =
1666 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1667 TII->get(AMDGPU::S_MOV_B32), Reg)
1668 .addImm(Val);
1669 (void)Mov;
1670 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1671 return MachineOperand::CreateReg(Reg, false);
1672 }
1673
1674 // Compute base address using Addr and return the final register.
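// Rough sketch of the emitted sequence (virtual register names illustrative;
// the carry register class depends on the wavefront size):
//   %lo, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, <low 32 bits of Addr.Offset>, 0
//   %hi, %dead  = V_ADDC_U32_e64  Addr.Base.HiReg, <high 32 bits of Addr.Offset>, %carry, 0
//   %base       = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1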
1675 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1676 const MemAddress &Addr) const {
1677 MachineBasicBlock *MBB = MI.getParent();
1678 MachineBasicBlock::iterator MBBI = MI.getIterator();
1679 DebugLoc DL = MI.getDebugLoc();
1680
1681 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1682 Addr.Base.LoSubReg) &&
1683 "Expected 32-bit Base-Register-Low!!");
1684
1685 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1686 Addr.Base.HiSubReg) &&
1687 "Expected 32-bit Base-Register-Hi!!");
1688
1689 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1690 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1691 MachineOperand OffsetHi =
1692 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1693
1694 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1695 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1696 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1697
1698 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1699 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1700 MachineInstr *LoHalf =
1701 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1702 .addReg(CarryReg, RegState::Define)
1703 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1704 .add(OffsetLo)
1705 .addImm(0); // clamp bit
1706 (void)LoHalf;
1707 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1708
1709 MachineInstr *HiHalf =
1710 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1711 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1712 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1713 .add(OffsetHi)
1714 .addReg(CarryReg, RegState::Kill)
1715 .addImm(0); // clamp bit
1716 (void)HiHalf;
1717 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1718
1719 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1720 MachineInstr *FullBase =
1721 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1722 .addReg(DestSub0)
1723 .addImm(AMDGPU::sub0)
1724 .addReg(DestSub1)
1725 .addImm(AMDGPU::sub1);
1726 (void)FullBase;
1727 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1728
1729 return FullDestReg;
1730 }
1731
1732 // Update base and offset with the NewBase and NewOffset in MI.
1733 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1734 Register NewBase,
1735 int32_t NewOffset) const {
1736 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1737 Base->setReg(NewBase);
1738 Base->setIsKill(false);
1739 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1740 }
1741
1742 Optional<int32_t>
1743 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1744 if (Op.isImm())
1745 return Op.getImm();
1746
1747 if (!Op.isReg())
1748 return None;
1749
1750 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1751 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1752 !Def->getOperand(1).isImm())
1753 return None;
1754
1755 return Def->getOperand(1).getImm();
1756 }
1757
1758 // Analyze Base and extract:
1759 // - 32-bit base registers and subregisters
1760 // - a 64-bit constant offset
1761 // Expecting the base computation as:
1762 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1763 // %LO:vgpr_32, %c:sreg_64_xexec =
1764 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1765 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1766 // %Base:vreg_64 =
1767 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
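// For the pattern above, the extracted fields would be (illustrative):
//   Addr.Base.LoReg/LoSubReg = %BASE_LO, Addr.Base.HiReg/HiSubReg = %BASE_HI,
//   Addr.Offset = 8000 (low 32 bits from the V_ADD_CO_U32 immediate, high
//   32 bits from the V_ADDC_U32 immediate, here 0).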
1768 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1769 MemAddress &Addr) const {
1770 if (!Base.isReg())
1771 return;
1772
1773 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1774 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1775 || Def->getNumOperands() != 5)
1776 return;
1777
1778 MachineOperand BaseLo = Def->getOperand(1);
1779 MachineOperand BaseHi = Def->getOperand(3);
1780 if (!BaseLo.isReg() || !BaseHi.isReg())
1781 return;
1782
1783 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1784 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1785
1786 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1787 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1788 return;
1789
1790 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1791 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1792
1793 auto Offset0P = extractConstOffset(*Src0);
1794 if (Offset0P)
1795 BaseLo = *Src1;
1796 else {
1797 if (!(Offset0P = extractConstOffset(*Src1)))
1798 return;
1799 BaseLo = *Src0;
1800 }
1801
1802 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1803 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1804
1805 if (Src0->isImm())
1806 std::swap(Src0, Src1);
1807
1808 if (!Src1->isImm())
1809 return;
1810
1811 uint64_t Offset1 = Src1->getImm();
1812 BaseHi = *Src0;
1813
1814 Addr.Base.LoReg = BaseLo.getReg();
1815 Addr.Base.HiReg = BaseHi.getReg();
1816 Addr.Base.LoSubReg = BaseLo.getSubReg();
1817 Addr.Base.HiSubReg = BaseHi.getSubReg();
1818 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1819 }
1820
1821 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1822 MachineInstr &MI,
1823 MemInfoMap &Visited,
1824 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1825
1826 if (!(MI.mayLoad() ^ MI.mayStore()))
1827 return false;
1828
1829 // TODO: Support flat and scratch.
1830 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1831 return false;
1832
1833 if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1834 return false;
1835
1836 if (AnchorList.count(&MI))
1837 return false;
1838
1839 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1840
1841 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1842 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1843 return false;
1844 }
1845
1846 // Step 1: Find the base registers and a 64-bit constant offset.
1847 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1848 MemAddress MAddr;
1849 if (Visited.find(&MI) == Visited.end()) {
1850 processBaseWithConstOffset(Base, MAddr);
1851 Visited[&MI] = MAddr;
1852 } else
1853 MAddr = Visited[&MI];
1854
1855 if (MAddr.Offset == 0) {
1856 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1857 " constant offsets that can be promoted.\n";);
1858 return false;
1859 }
1860
1861 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1862 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1863
1864 // Step 2: Traverse MI's basic block and find an anchor (with the same base
1865 // registers) whose offset has the largest 13-bit-representable distance from MI's offset.
1866 // E.g. (64bit loads)
1867 // bb:
1868 // addr1 = &a + 4096; load1 = load(addr1, 0)
1869 // addr2 = &a + 6144; load2 = load(addr2, 0)
1870 // addr3 = &a + 8192; load3 = load(addr3, 0)
1871 // addr4 = &a + 10240; load4 = load(addr4, 0)
1872 // addr5 = &a + 12288; load5 = load(addr5, 0)
1873 //
1874 // Starting from the first load, the optimization will try to find a new base
1875 // from which (&a + 4096) is within 13-bit offset distance. Both &a + 6144 and
1876 // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
1877 // &a + 8192 as the new base (anchor) because choosing the maximum distance
1878 // can presumably accommodate more intermediate bases.
1879 //
1880 // Step 3: move (&a + 8192) above load1. Compute and promote offsets from
1881 // (&a + 8192) for load1, load2, load4.
1882 // addr = &a + 8192
1883 // load1 = load(addr, -4096)
1884 // load2 = load(addr, -2048)
1885 // load3 = load(addr, 0)
1886 // load4 = load(addr, 2048)
1887 // addr5 = &a + 12288; load5 = load(addr5, 0)
1888 //
1889 MachineInstr *AnchorInst = nullptr;
1890 MemAddress AnchorAddr;
1891 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1892 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1893
1894 MachineBasicBlock *MBB = MI.getParent();
1895 MachineBasicBlock::iterator E = MBB->end();
1896 MachineBasicBlock::iterator MBBI = MI.getIterator();
1897 ++MBBI;
1898 const SITargetLowering *TLI =
1899 static_cast<const SITargetLowering *>(STM->getTargetLowering());
1900
1901 for ( ; MBBI != E; ++MBBI) {
1902 MachineInstr &MINext = *MBBI;
1903 // TODO: Support finding an anchor (with the same base) from store addresses or
1904 // any other load addresses where the opcodes are different.
1905 if (MINext.getOpcode() != MI.getOpcode() ||
1906 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1907 continue;
1908
1909 const MachineOperand &BaseNext =
1910 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1911 MemAddress MAddrNext;
1912 if (Visited.find(&MINext) == Visited.end()) {
1913 processBaseWithConstOffset(BaseNext, MAddrNext);
1914 Visited[&MINext] = MAddrNext;
1915 } else
1916 MAddrNext = Visited[&MINext];
1917
1918 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1919 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1920 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1921 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1922 continue;
1923
1924 InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1925
1926 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1927 TargetLoweringBase::AddrMode AM;
1928 AM.HasBaseReg = true;
1929 AM.BaseOffs = Dist;
1930 if (TLI->isLegalGlobalAddressingMode(AM) &&
1931 (uint32_t)std::abs(Dist) > MaxDist) {
1932 MaxDist = std::abs(Dist);
1933
1934 AnchorAddr = MAddrNext;
1935 AnchorInst = &MINext;
1936 }
1937 }
1938
1939 if (AnchorInst) {
1940 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1941 AnchorInst->dump());
1942 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1943 << AnchorAddr.Offset << "\n\n");
1944
1945 // Instead of moving up, just re-compute anchor-instruction's base address.
1946 Register Base = computeBase(MI, AnchorAddr);
1947
1948 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1949 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1950
1951 for (auto P : InstsWCommonBase) {
1952 TargetLoweringBase::AddrMode AM;
1953 AM.HasBaseReg = true;
1954 AM.BaseOffs = P.second - AnchorAddr.Offset;
1955
1956 if (TLI->isLegalGlobalAddressingMode(AM)) {
1957 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1958 dbgs() << ")"; P.first->dump());
1959 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1960 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1961 }
1962 }
1963 AnchorList.insert(AnchorInst);
1964 return true;
1965 }
1966
1967 return false;
1968 }
1969
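// For example (illustrative): loads based on %ptr_a are appended to one list
// and loads based on %ptr_b to another, so later pairing only ever compares
// instructions that share a base address.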
1970 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1971 std::list<std::list<CombineInfo> > &MergeableInsts) const {
1972 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1973 if (AddrList.front().InstClass == CI.InstClass &&
1974 AddrList.front().hasSameBaseAddress(*CI.I)) {
1975 AddrList.emplace_back(CI);
1976 return;
1977 }
1978 }
1979
1980 // Base address not found, so add a new list.
1981 MergeableInsts.emplace_back(1, CI);
1982 }
1983
1984 std::pair<MachineBasicBlock::iterator, bool>
1985 SILoadStoreOptimizer::collectMergeableInsts(
1986 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
1987 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
1988 std::list<std::list<CombineInfo>> &MergeableInsts) const {
1989 bool Modified = false;
1990
1991 // Sort potentially mergeable instructions into lists, one list per base address.
1992 unsigned Order = 0;
1993 MachineBasicBlock::iterator BlockI = Begin;
1994 for (; BlockI != End; ++BlockI) {
1995 MachineInstr &MI = *BlockI;
1996
1997 // We run this before checking if an address is mergeable, because it can produce
1998 // better code even if the instructions aren't mergeable.
1999 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2000 Modified = true;
2001
2002 // Don't combine if volatile. We also won't be able to merge across this, so
2003 // break the search. We can resume after this barrier and do separate merges.
2004 if (MI.hasOrderedMemoryRef()) {
2005 LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
2006
2007 // Search will resume after this instruction in a separate merge list.
2008 ++BlockI;
2009 break;
2010 }
2011
2012 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2013 if (InstClass == UNKNOWN)
2014 continue;
2015
2016 CombineInfo CI;
2017 CI.setMI(MI, *TII, *STM);
2018 CI.Order = Order++;
2019
2020 if (!CI.hasMergeableAddress(*MRI))
2021 continue;
2022
2023 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2024
2025 addInstToMergeableList(CI, MergeableInsts);
2026 }
2027
2028 // At this point we have lists of Mergeable instructions.
2029 //
2030 // Part 2: Sort each list by offset so that merge candidates end up adjacent,
2031 // and discard lists with fewer than two entries, since a merge needs at
2032 // least two instructions. The actual pairing of instructions happens later,
2033 // in optimizeInstsWithSameBaseAddr().
2034
2035 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2036 E = MergeableInsts.end(); I != E;) {
2037
2038 std::list<CombineInfo> &MergeList = *I;
2039 if (MergeList.size() <= 1) {
2040 // This means we have found only one instruction with a given address
2041 // that can be merged, and we need at least 2 instructions to do a merge,
2042 // so this list can be discarded.
2043 I = MergeableInsts.erase(I);
2044 continue;
2045 }
2046
2047 // Sort the lists by offset; this way, mergeable instructions will be
2048 // adjacent to each other in the list, which will make it easier to find
2049 // matches.
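// For example (illustrative): offsets {8, 16, 4} become {4, 8, 16}, so the
// pairs considered later are (4, 8) first and then (8, 16).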
2050 MergeList.sort(
2051 [] (const CombineInfo &A, const CombineInfo &B) {
2052 return A.Offset < B.Offset;
2053 });
2054 ++I;
2055 }
2056
2057 return std::make_pair(BlockI, Modified);
2058 }
2059
2060 // Scan through looking for adjacent memory operations with constant offsets
2061 // from the same base register. We rely on the scheduler to do the hard work
2062 // of clustering nearby loads, and assume these are all adjacent.
2063 bool SILoadStoreOptimizer::optimizeBlock(
2064 std::list<std::list<CombineInfo> > &MergeableInsts) {
2065 bool Modified = false;
2066
2067 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2068 E = MergeableInsts.end(); I != E;) {
2069 std::list<CombineInfo> &MergeList = *I;
2070
2071 bool OptimizeListAgain = false;
2072 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2073 // We weren't able to make any changes, so delete the list so we don't
2074 // process the same instructions the next time we try to optimize this
2075 // block.
2076 I = MergeableInsts.erase(I);
2077 continue;
2078 }
2079
2080 Modified = true;
2081
2082 // We made changes, but also determined that there were no more optimization
2083 // opportunities, so we don't need to reprocess the list.
2084 if (!OptimizeListAgain) {
2085 I = MergeableInsts.erase(I);
2086 continue;
2087 }
2088 OptimizeAgain = true;
2089 }
2090 return Modified;
2091 }
2092
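// Illustrative example of how OptimizeListAgain drives re-processing: two
// s_buffer_load_dword entries (width 1 + 1) merge into a dwordx2; since the
// combined width is still below the limit for that instruction class (16
// dwords for S_BUFFER loads), the list is optimized again, allowing two
// dwordx2 results to later merge into a dwordx4.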
2093 bool
2094 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2095 std::list<CombineInfo> &MergeList,
2096 bool &OptimizeListAgain) {
2097 if (MergeList.empty())
2098 return false;
2099
2100 bool Modified = false;
2101
2102 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2103 Next = std::next(I)) {
2104
2105 auto First = I;
2106 auto Second = Next;
2107
2108 if ((*First).Order > (*Second).Order)
2109 std::swap(First, Second);
2110 CombineInfo &CI = *First;
2111 CombineInfo &Paired = *Second;
2112
2113 SmallVector<MachineInstr *, 8> InstsToMove;
2114 if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
2115 ++I;
2116 continue;
2117 }
2118
2119 Modified = true;
2120
2121 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2122
2123 switch (CI.InstClass) {
2124 default:
2125 llvm_unreachable("unknown InstClass");
2126 break;
2127 case DS_READ: {
2128 MachineBasicBlock::iterator NewMI =
2129 mergeRead2Pair(CI, Paired, InstsToMove);
2130 CI.setMI(NewMI, *TII, *STM);
2131 break;
2132 }
2133 case DS_WRITE: {
2134 MachineBasicBlock::iterator NewMI =
2135 mergeWrite2Pair(CI, Paired, InstsToMove);
2136 CI.setMI(NewMI, *TII, *STM);
2137 break;
2138 }
2139 case S_BUFFER_LOAD_IMM: {
2140 MachineBasicBlock::iterator NewMI =
2141 mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
2142 CI.setMI(NewMI, *TII, *STM);
2143 OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
2144 break;
2145 }
2146 case BUFFER_LOAD: {
2147 MachineBasicBlock::iterator NewMI =
2148 mergeBufferLoadPair(CI, Paired, InstsToMove);
2149 CI.setMI(NewMI, *TII, *STM);
2150 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2151 break;
2152 }
2153 case BUFFER_STORE: {
2154 MachineBasicBlock::iterator NewMI =
2155 mergeBufferStorePair(CI, Paired, InstsToMove);
2156 CI.setMI(NewMI, *TII, *STM);
2157 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2158 break;
2159 }
2160 case MIMG: {
2161 MachineBasicBlock::iterator NewMI =
2162 mergeImagePair(CI, Paired, InstsToMove);
2163 CI.setMI(NewMI, *TII, *STM);
2164 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2165 break;
2166 }
2167 case TBUFFER_LOAD: {
2168 MachineBasicBlock::iterator NewMI =
2169 mergeTBufferLoadPair(CI, Paired, InstsToMove);
2170 CI.setMI(NewMI, *TII, *STM);
2171 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2172 break;
2173 }
2174 case TBUFFER_STORE: {
2175 MachineBasicBlock::iterator NewMI =
2176 mergeTBufferStorePair(CI, Paired, InstsToMove);
2177 CI.setMI(NewMI, *TII, *STM);
2178 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2179 break;
2180 }
2181 }
2182 CI.Order = Paired.Order;
2183 if (I == Second)
2184 I = Next;
2185
2186 MergeList.erase(Second);
2187 }
2188
2189 return Modified;
2190 }
2191
2192 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2193 if (skipFunction(MF.getFunction()))
2194 return false;
2195
2196 STM = &MF.getSubtarget<GCNSubtarget>();
2197 if (!STM->loadStoreOptEnabled())
2198 return false;
2199
2200 TII = STM->getInstrInfo();
2201 TRI = &TII->getRegisterInfo();
2202
2203 MRI = &MF.getRegInfo();
2204 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2205
2206 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2207
2208 bool Modified = false;
2209
2210 // Contains the list of instructions for which constant offsets are being
2211 // promoted to the IMM. This is tracked for an entire block at a time.
2212 SmallPtrSet<MachineInstr *, 4> AnchorList;
2213 MemInfoMap Visited;
2214
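// Each block is processed in sections: collectMergeableInsts() stops at the
// first instruction with an ordered memory reference (e.g. a volatile access),
// the collected section is optimized until no further merges are found, and
// the scan then resumes after that barrier.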
2215 for (MachineBasicBlock &MBB : MF) {
2216 MachineBasicBlock::iterator SectionEnd;
2217 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2218 I = SectionEnd) {
2219 bool CollectModified;
2220 std::list<std::list<CombineInfo>> MergeableInsts;
2221
2222 // First pass: Collect list of all instructions we know how to merge in a
2223 // subset of the block.
2224 std::tie(SectionEnd, CollectModified) =
2225 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2226
2227 Modified |= CollectModified;
2228
2229 do {
2230 OptimizeAgain = false;
2231 Modified |= optimizeBlock(MergeableInsts);
2232 } while (OptimizeAgain);
2233 }
2234
2235 Visited.clear();
2236 AnchorList.clear();
2237 }
2238
2239 return Modified;
2240 }
2241