1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25 // to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently misses stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Recomputing live intervals seems inefficient. This currently only matches
50 //   one pair, recomputes live intervals, and then moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but are close enough to each other, we can add to the base
56 //   pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   S_BUFFER_LOAD_SGPR_IMM,
78   S_LOAD_IMM,
79   BUFFER_LOAD,
80   BUFFER_STORE,
81   MIMG,
82   TBUFFER_LOAD,
83   TBUFFER_STORE,
84   GLOBAL_LOAD_SADDR,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE,
88   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89   GLOBAL_STORE // any CombineInfo, they are only ever returned by
90                // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94   unsigned char NumVAddrs = 0;
95   bool SBase = false;
96   bool SRsrc = false;
97   bool SOffset = false;
98   bool SAddr = false;
99   bool VAddr = false;
100   bool Addr = false;
101   bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
123     unsigned Order;
124 
125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145           return false;
146         }
147       }
148       return true;
149     }
150 
151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that aren't either immediates or registers.
159         // TODO: Should be possible to merge FrameIndexes and maybe some other
160         // non-register operands.
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge instructions with other physical reg
165         // addresses too.
166         if (AddrOp->getReg().isPhysical() &&
167             AddrOp->getReg() != AMDGPU::SGPR_NULL)
168           return false;
169 
170         // If an address has only one use then there will be no other
171         // instructions with the same address, so we can't merge this one.
172         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173           return false;
174       }
175       return true;
176     }
177 
178     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179 
180     // Order by DMask for MIMG instructions and by offset otherwise.
181     bool operator<(const CombineInfo& Other) const {
182       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183     }
184   };
185 
186   struct BaseRegisters {
187     Register LoReg;
188     Register HiReg;
189 
190     unsigned LoSubReg = 0;
191     unsigned HiSubReg = 0;
192   };
193 
194   struct MemAddress {
195     BaseRegisters Base;
196     int64_t Offset = 0;
197   };
198 
199   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200 
201 private:
202   const GCNSubtarget *STM = nullptr;
203   const SIInstrInfo *TII = nullptr;
204   const SIRegisterInfo *TRI = nullptr;
205   MachineRegisterInfo *MRI = nullptr;
206   AliasAnalysis *AA = nullptr;
207   bool OptimizeAgain;
208 
209   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210                            const DenseSet<Register> &ARegUses,
211                            const MachineInstr &A, const MachineInstr &B) const;
212   static bool dmasksCanBeCombined(const CombineInfo &CI,
213                                   const SIInstrInfo &TII,
214                                   const CombineInfo &Paired);
215   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216                                    CombineInfo &Paired, bool Modify = false);
217   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218                         const CombineInfo &Paired);
219   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221                                                      const CombineInfo &Paired);
222   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
223                                                     const CombineInfo &Paired);
224   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
225 
226   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
227 
228   unsigned read2Opcode(unsigned EltSize) const;
229   unsigned read2ST64Opcode(unsigned EltSize) const;
230   MachineBasicBlock::iterator
231   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
232                  MachineBasicBlock::iterator InsertBefore);
233 
234   unsigned write2Opcode(unsigned EltSize) const;
235   unsigned write2ST64Opcode(unsigned EltSize) const;
236   MachineBasicBlock::iterator
237   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
238                   MachineBasicBlock::iterator InsertBefore);
239   MachineBasicBlock::iterator
240   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
241                  MachineBasicBlock::iterator InsertBefore);
242   MachineBasicBlock::iterator
243   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
244                        MachineBasicBlock::iterator InsertBefore);
245   MachineBasicBlock::iterator
246   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
247                       MachineBasicBlock::iterator InsertBefore);
248   MachineBasicBlock::iterator
249   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
250                        MachineBasicBlock::iterator InsertBefore);
251   MachineBasicBlock::iterator
252   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
253                        MachineBasicBlock::iterator InsertBefore);
254   MachineBasicBlock::iterator
255   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
256                         MachineBasicBlock::iterator InsertBefore);
257   MachineBasicBlock::iterator
258   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
259                     MachineBasicBlock::iterator InsertBefore);
260   MachineBasicBlock::iterator
261   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
262                      MachineBasicBlock::iterator InsertBefore);
263 
264   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
265                            int32_t NewOffset) const;
266   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
267   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
268   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
269   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
270   /// Promotes constant offset to the immediate by adjusting the base. It
271   /// tries to use a base from the nearby instructions that allows it to have
272   /// a 13bit constant offset which gets promoted to the immediate.
273   bool promoteConstantOffsetToImm(MachineInstr &CI,
274                                   MemInfoMap &Visited,
275                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
276   void addInstToMergeableList(const CombineInfo &CI,
277                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
278 
279   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
280       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
281       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
282       std::list<std::list<CombineInfo>> &MergeableInsts) const;
283 
284   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
285                                                      const CombineInfo &Paired);
286 
287   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
288                                           const CombineInfo &Paired);
289 
290 public:
291   static char ID;
292 
293   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
294     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
295   }
296 
297   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
298                                      bool &OptimizeListAgain);
299   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
300 
301   bool runOnMachineFunction(MachineFunction &MF) override;
302 
303   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
304 
305   void getAnalysisUsage(AnalysisUsage &AU) const override {
306     AU.setPreservesCFG();
307     AU.addRequired<AAResultsWrapperPass>();
308 
309     MachineFunctionPass::getAnalysisUsage(AU);
310   }
311 
312   MachineFunctionProperties getRequiredProperties() const override {
313     return MachineFunctionProperties()
314       .set(MachineFunctionProperties::Property::IsSSA);
315   }
316 };
317 
318 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
319   const unsigned Opc = MI.getOpcode();
320 
321   if (TII.isMUBUF(Opc)) {
322     // FIXME: Handle d16 correctly
323     return AMDGPU::getMUBUFElements(Opc);
324   }
325   if (TII.isImage(MI)) {
326     uint64_t DMaskImm =
327         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
328     return llvm::popcount(DMaskImm);
329   }
330   if (TII.isMTBUF(Opc)) {
331     return AMDGPU::getMTBUFElements(Opc);
332   }
333 
334   switch (Opc) {
335   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
336   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
337   case AMDGPU::S_LOAD_DWORD_IMM:
338   case AMDGPU::GLOBAL_LOAD_DWORD:
339   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
340   case AMDGPU::GLOBAL_STORE_DWORD:
341   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
342   case AMDGPU::FLAT_LOAD_DWORD:
343   case AMDGPU::FLAT_STORE_DWORD:
344     return 1;
345   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
346   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347   case AMDGPU::S_LOAD_DWORDX2_IMM:
348   case AMDGPU::GLOBAL_LOAD_DWORDX2:
349   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350   case AMDGPU::GLOBAL_STORE_DWORDX2:
351   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352   case AMDGPU::FLAT_LOAD_DWORDX2:
353   case AMDGPU::FLAT_STORE_DWORDX2:
354     return 2;
355   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
356   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
357   case AMDGPU::S_LOAD_DWORDX3_IMM:
358   case AMDGPU::GLOBAL_LOAD_DWORDX3:
359   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
360   case AMDGPU::GLOBAL_STORE_DWORDX3:
361   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
362   case AMDGPU::FLAT_LOAD_DWORDX3:
363   case AMDGPU::FLAT_STORE_DWORDX3:
364     return 3;
365   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
366   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
367   case AMDGPU::S_LOAD_DWORDX4_IMM:
368   case AMDGPU::GLOBAL_LOAD_DWORDX4:
369   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
370   case AMDGPU::GLOBAL_STORE_DWORDX4:
371   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
372   case AMDGPU::FLAT_LOAD_DWORDX4:
373   case AMDGPU::FLAT_STORE_DWORDX4:
374     return 4;
375   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
376   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
377   case AMDGPU::S_LOAD_DWORDX8_IMM:
378     return 8;
379   case AMDGPU::DS_READ_B32:      [[fallthrough]];
380   case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
381   case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
382   case AMDGPU::DS_WRITE_B32_gfx9:
383     return 1;
384   case AMDGPU::DS_READ_B64:      [[fallthrough]];
385   case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
386   case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
387   case AMDGPU::DS_WRITE_B64_gfx9:
388     return 2;
389   default:
390     return 0;
391   }
392 }
393 
394 /// Maps instruction opcode to enum InstClassEnum.
395 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
396   switch (Opc) {
397   default:
398     if (TII.isMUBUF(Opc)) {
399       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
400       default:
401         return UNKNOWN;
402       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
403       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
404       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
405       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
406       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
407       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
408       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
409       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
410         return BUFFER_LOAD;
411       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
412       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
413       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
414       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
415       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
416       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
417       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
418       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
419         return BUFFER_STORE;
420       }
421     }
422     if (TII.isImage(Opc)) {
423       // Ignore instructions encoded without vaddr.
424       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
425           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
426         return UNKNOWN;
427       // Ignore BVH instructions
428       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
429         return UNKNOWN;
430       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
431       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
432           TII.isGather4(Opc))
433         return UNKNOWN;
434       return MIMG;
435     }
436     if (TII.isMTBUF(Opc)) {
437       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
438       default:
439         return UNKNOWN;
440       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
441       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
442       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
443       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
444       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
445       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
446       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
447       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
448       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
449       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
450       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
451       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
452       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
453       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
454       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
455       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
456         return TBUFFER_LOAD;
457       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
458       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
459       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
460       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
461       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
462       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
463       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
464       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
465         return TBUFFER_STORE;
466       }
467     }
468     return UNKNOWN;
469   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
470   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
471   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
472   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
473   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
474     return S_BUFFER_LOAD_IMM;
475   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
476   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
477   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
478   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
479   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
480     return S_BUFFER_LOAD_SGPR_IMM;
481   case AMDGPU::S_LOAD_DWORD_IMM:
482   case AMDGPU::S_LOAD_DWORDX2_IMM:
483   case AMDGPU::S_LOAD_DWORDX3_IMM:
484   case AMDGPU::S_LOAD_DWORDX4_IMM:
485   case AMDGPU::S_LOAD_DWORDX8_IMM:
486     return S_LOAD_IMM;
487   case AMDGPU::DS_READ_B32:
488   case AMDGPU::DS_READ_B32_gfx9:
489   case AMDGPU::DS_READ_B64:
490   case AMDGPU::DS_READ_B64_gfx9:
491     return DS_READ;
492   case AMDGPU::DS_WRITE_B32:
493   case AMDGPU::DS_WRITE_B32_gfx9:
494   case AMDGPU::DS_WRITE_B64:
495   case AMDGPU::DS_WRITE_B64_gfx9:
496     return DS_WRITE;
497   case AMDGPU::GLOBAL_LOAD_DWORD:
498   case AMDGPU::GLOBAL_LOAD_DWORDX2:
499   case AMDGPU::GLOBAL_LOAD_DWORDX3:
500   case AMDGPU::GLOBAL_LOAD_DWORDX4:
501   case AMDGPU::FLAT_LOAD_DWORD:
502   case AMDGPU::FLAT_LOAD_DWORDX2:
503   case AMDGPU::FLAT_LOAD_DWORDX3:
504   case AMDGPU::FLAT_LOAD_DWORDX4:
505     return FLAT_LOAD;
506   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
507   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
508   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
509   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
510     return GLOBAL_LOAD_SADDR;
511   case AMDGPU::GLOBAL_STORE_DWORD:
512   case AMDGPU::GLOBAL_STORE_DWORDX2:
513   case AMDGPU::GLOBAL_STORE_DWORDX3:
514   case AMDGPU::GLOBAL_STORE_DWORDX4:
515   case AMDGPU::FLAT_STORE_DWORD:
516   case AMDGPU::FLAT_STORE_DWORDX2:
517   case AMDGPU::FLAT_STORE_DWORDX3:
518   case AMDGPU::FLAT_STORE_DWORDX4:
519     return FLAT_STORE;
520   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
521   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
522   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
523   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
524     return GLOBAL_STORE_SADDR;
525   }
526 }
527 
528 /// Determines instruction subclass from opcode. Only instructions
529 /// of the same subclass can be merged together. The merged instruction may have
530 /// a different subclass but must have the same class.
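/// For example, S_LOAD_DWORD_IMM and S_LOAD_DWORDX2_IMM both map to the
/// S_LOAD_DWORD_IMM subclass and may therefore be considered for pairing,
/// whereas each DS opcode maps to itself and only pairs with an identical
/// opcode.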
531 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
532   switch (Opc) {
533   default:
534     if (TII.isMUBUF(Opc))
535       return AMDGPU::getMUBUFBaseOpcode(Opc);
536     if (TII.isImage(Opc)) {
537       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
538       assert(Info);
539       return Info->BaseOpcode;
540     }
541     if (TII.isMTBUF(Opc))
542       return AMDGPU::getMTBUFBaseOpcode(Opc);
543     return -1;
544   case AMDGPU::DS_READ_B32:
545   case AMDGPU::DS_READ_B32_gfx9:
546   case AMDGPU::DS_READ_B64:
547   case AMDGPU::DS_READ_B64_gfx9:
548   case AMDGPU::DS_WRITE_B32:
549   case AMDGPU::DS_WRITE_B32_gfx9:
550   case AMDGPU::DS_WRITE_B64:
551   case AMDGPU::DS_WRITE_B64_gfx9:
552     return Opc;
553   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
554   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
555   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
556   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
557   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
558     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
559   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
560   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
561   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
562   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
563   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
564     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
565   case AMDGPU::S_LOAD_DWORD_IMM:
566   case AMDGPU::S_LOAD_DWORDX2_IMM:
567   case AMDGPU::S_LOAD_DWORDX3_IMM:
568   case AMDGPU::S_LOAD_DWORDX4_IMM:
569   case AMDGPU::S_LOAD_DWORDX8_IMM:
570     return AMDGPU::S_LOAD_DWORD_IMM;
571   case AMDGPU::GLOBAL_LOAD_DWORD:
572   case AMDGPU::GLOBAL_LOAD_DWORDX2:
573   case AMDGPU::GLOBAL_LOAD_DWORDX3:
574   case AMDGPU::GLOBAL_LOAD_DWORDX4:
575   case AMDGPU::FLAT_LOAD_DWORD:
576   case AMDGPU::FLAT_LOAD_DWORDX2:
577   case AMDGPU::FLAT_LOAD_DWORDX3:
578   case AMDGPU::FLAT_LOAD_DWORDX4:
579     return AMDGPU::FLAT_LOAD_DWORD;
580   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
581   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
582   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
583   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
584     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
585   case AMDGPU::GLOBAL_STORE_DWORD:
586   case AMDGPU::GLOBAL_STORE_DWORDX2:
587   case AMDGPU::GLOBAL_STORE_DWORDX3:
588   case AMDGPU::GLOBAL_STORE_DWORDX4:
589   case AMDGPU::FLAT_STORE_DWORD:
590   case AMDGPU::FLAT_STORE_DWORDX2:
591   case AMDGPU::FLAT_STORE_DWORDX3:
592   case AMDGPU::FLAT_STORE_DWORDX4:
593     return AMDGPU::FLAT_STORE_DWORD;
594   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
595   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
596   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
597   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
598     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
599   }
600 }
601 
602 // GLOBAL loads and stores are classified as FLAT initially. If both combined
603 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
604 // If either or both instructions are non-segment-specific FLAT, the resulting
605 // combined operation will be FLAT, potentially promoting one of the GLOBAL
606 // operations to FLAT.
607 // For other instructions, return the original, unmodified class.
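// For example, two FLAT_LOAD_DWORDs that both address the global segment are
// reported as GLOBAL_LOAD, while a global load paired with a generic flat load
// remains FLAT_LOAD.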
608 InstClassEnum
609 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
610                                          const CombineInfo &Paired) {
611   assert(CI.InstClass == Paired.InstClass);
612 
613   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
614       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
615     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
616 
617   return CI.InstClass;
618 }
619 
620 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
621   AddressRegs Result;
622 
623   if (TII.isMUBUF(Opc)) {
624     if (AMDGPU::getMUBUFHasVAddr(Opc))
625       Result.VAddr = true;
626     if (AMDGPU::getMUBUFHasSrsrc(Opc))
627       Result.SRsrc = true;
628     if (AMDGPU::getMUBUFHasSoffset(Opc))
629       Result.SOffset = true;
630 
631     return Result;
632   }
633 
634   if (TII.isImage(Opc)) {
635     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
636     if (VAddr0Idx >= 0) {
637       int RsrcName =
638           TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
639       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
640       Result.NumVAddrs = RsrcIdx - VAddr0Idx;
641     } else {
642       Result.VAddr = true;
643     }
644     Result.SRsrc = true;
645     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
646     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
647       Result.SSamp = true;
648 
649     return Result;
650   }
651   if (TII.isMTBUF(Opc)) {
652     if (AMDGPU::getMTBUFHasVAddr(Opc))
653       Result.VAddr = true;
654     if (AMDGPU::getMTBUFHasSrsrc(Opc))
655       Result.SRsrc = true;
656     if (AMDGPU::getMTBUFHasSoffset(Opc))
657       Result.SOffset = true;
658 
659     return Result;
660   }
661 
662   switch (Opc) {
663   default:
664     return Result;
665   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
666   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
667   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
668   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
669   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
670     Result.SOffset = true;
671     [[fallthrough]];
672   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
673   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
674   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
675   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
676   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
677   case AMDGPU::S_LOAD_DWORD_IMM:
678   case AMDGPU::S_LOAD_DWORDX2_IMM:
679   case AMDGPU::S_LOAD_DWORDX3_IMM:
680   case AMDGPU::S_LOAD_DWORDX4_IMM:
681   case AMDGPU::S_LOAD_DWORDX8_IMM:
682     Result.SBase = true;
683     return Result;
684   case AMDGPU::DS_READ_B32:
685   case AMDGPU::DS_READ_B64:
686   case AMDGPU::DS_READ_B32_gfx9:
687   case AMDGPU::DS_READ_B64_gfx9:
688   case AMDGPU::DS_WRITE_B32:
689   case AMDGPU::DS_WRITE_B64:
690   case AMDGPU::DS_WRITE_B32_gfx9:
691   case AMDGPU::DS_WRITE_B64_gfx9:
692     Result.Addr = true;
693     return Result;
694   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
695   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
696   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
697   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
698   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
699   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
700   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
701   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
702     Result.SAddr = true;
703     [[fallthrough]];
704   case AMDGPU::GLOBAL_LOAD_DWORD:
705   case AMDGPU::GLOBAL_LOAD_DWORDX2:
706   case AMDGPU::GLOBAL_LOAD_DWORDX3:
707   case AMDGPU::GLOBAL_LOAD_DWORDX4:
708   case AMDGPU::GLOBAL_STORE_DWORD:
709   case AMDGPU::GLOBAL_STORE_DWORDX2:
710   case AMDGPU::GLOBAL_STORE_DWORDX3:
711   case AMDGPU::GLOBAL_STORE_DWORDX4:
712   case AMDGPU::FLAT_LOAD_DWORD:
713   case AMDGPU::FLAT_LOAD_DWORDX2:
714   case AMDGPU::FLAT_LOAD_DWORDX3:
715   case AMDGPU::FLAT_LOAD_DWORDX4:
716   case AMDGPU::FLAT_STORE_DWORD:
717   case AMDGPU::FLAT_STORE_DWORDX2:
718   case AMDGPU::FLAT_STORE_DWORDX3:
719   case AMDGPU::FLAT_STORE_DWORDX4:
720     Result.VAddr = true;
721     return Result;
722   }
723 }
724 
725 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
726                                               const SILoadStoreOptimizer &LSO) {
727   I = MI;
728   unsigned Opc = MI->getOpcode();
729   InstClass = getInstClass(Opc, *LSO.TII);
730 
731   if (InstClass == UNKNOWN)
732     return;
733 
734   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
735 
736   switch (InstClass) {
737   case DS_READ:
738     EltSize =
739           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
740                                                                           : 4;
741     break;
742   case DS_WRITE:
743     EltSize =
744           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
745                                                                             : 4;
746     break;
747   case S_BUFFER_LOAD_IMM:
748   case S_BUFFER_LOAD_SGPR_IMM:
749   case S_LOAD_IMM:
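    // SMEM offsets are encoded in dwords on some subtargets and in bytes on
    // others; convertSMRDOffsetUnits folds that difference into EltSize for
    // the 4-byte element used here.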
750     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
751     break;
752   default:
753     EltSize = 4;
754     break;
755   }
756 
757   if (InstClass == MIMG) {
758     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
759     // Offset is not considered for MIMG instructions.
760     Offset = 0;
761   } else {
762     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
763     Offset = I->getOperand(OffsetIdx).getImm();
764   }
765 
766   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
767     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
768 
769   Width = getOpcodeWidth(*I, *LSO.TII);
770 
771   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
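    // DS instructions have a 16-bit offset field, so only the low 16 bits are
    // meaningful.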
772     Offset &= 0xffff;
773   } else if (InstClass != MIMG) {
774     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
775   }
776 
777   AddressRegs Regs = getRegs(Opc, *LSO.TII);
778   bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
779 
780   NumAddresses = 0;
781   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
782     AddrIdx[NumAddresses++] =
783         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
784   if (Regs.Addr)
785     AddrIdx[NumAddresses++] =
786         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
787   if (Regs.SBase)
788     AddrIdx[NumAddresses++] =
789         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
790   if (Regs.SRsrc)
791     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
792         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
793   if (Regs.SOffset)
794     AddrIdx[NumAddresses++] =
795         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
796   if (Regs.SAddr)
797     AddrIdx[NumAddresses++] =
798         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
799   if (Regs.VAddr)
800     AddrIdx[NumAddresses++] =
801         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
802   if (Regs.SSamp)
803     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
804         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
805   assert(NumAddresses <= MaxAddressRegs);
806 
807   for (unsigned J = 0; J < NumAddresses; J++)
808     AddrReg[J] = &I->getOperand(AddrIdx[J]);
809 }
810 
811 } // end anonymous namespace.
812 
813 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
814                       "SI Load Store Optimizer", false, false)
815 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
816 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
817                     false, false)
818 
819 char SILoadStoreOptimizer::ID = 0;
820 
821 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
822 
823 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
824   return new SILoadStoreOptimizer();
825 }
826 
827 static void addDefsUsesToList(const MachineInstr &MI,
828                               DenseSet<Register> &RegDefs,
829                               DenseSet<Register> &RegUses) {
830   for (const auto &Op : MI.operands()) {
831     if (!Op.isReg())
832       continue;
833     if (Op.isDef())
834       RegDefs.insert(Op.getReg());
835     if (Op.readsReg())
836       RegUses.insert(Op.getReg());
837   }
838 }
839 
840 bool SILoadStoreOptimizer::canSwapInstructions(
841     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
842     const MachineInstr &A, const MachineInstr &B) const {
843   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
844       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
845     return false;
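  // Reordering is only safe if B neither reads nor writes a register defined
  // by A and does not define a register that A reads.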
846   for (const auto &BOp : B.operands()) {
847     if (!BOp.isReg())
848       continue;
849     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
850       return false;
851     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
852       return false;
853   }
854   return true;
855 }
856 
857 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
858 // MMO for the combined operation with a new access size.
859 MachineMemOperand *
860 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
861                                                const CombineInfo &Paired) {
862   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
863   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
864 
865   unsigned Size = MMOa->getSize() + MMOb->getSize();
866 
867   // A base pointer for the combined operation is the same as the leading
868   // operation's pointer.
869   if (Paired < CI)
870     std::swap(MMOa, MMOb);
871 
872   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
873   // If merging FLAT and GLOBAL set address space to FLAT.
874   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
875     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
876 
877   MachineFunction *MF = CI.I->getMF();
878   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
879 }
880 
881 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
882                                                const SIInstrInfo &TII,
883                                                const CombineInfo &Paired) {
884   assert(CI.InstClass == MIMG);
885 
886   // Ignore instructions with tfe/lwe set.
887   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
888   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
889 
890   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
891     return false;
892 
893   // Check other optional immediate operands for equality.
894   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
895                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
896                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
897 
898   for (auto op : OperandsToMatch) {
899     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
900     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
901       return false;
902     if (Idx != -1 &&
903         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
904       return false;
905   }
906 
907   // Check DMask for overlaps.
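  // For example, dmasks 0x3 and 0xc can be combined (the merged dmask is 0xf),
  // but 0x5 and 0x2 cannot because their enabled components interleave.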
908   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
909   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
910 
911   if (!MaxMask)
912     return false;
913 
914   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
915   if ((1u << AllowedBitsForMin) <= MinMask)
916     return false;
917 
918   return true;
919 }
920 
921 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
922                                        unsigned ComponentCount,
923                                        const GCNSubtarget &STI) {
924   if (ComponentCount > 4)
925     return 0;
926 
927   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
928       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
929   if (!OldFormatInfo)
930     return 0;
931 
932   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
933       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
934                                            ComponentCount,
935                                            OldFormatInfo->NumFormat, STI);
936 
937   if (!NewFormatInfo)
938     return 0;
939 
940   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
941          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
942 
943   return NewFormatInfo->Format;
944 }
945 
946 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
947 // highest power of two. Note that the result is well defined for all inputs
948 // including corner cases like:
949 // - if Lo == Hi, return that value
950 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
951 // - if Lo > Hi, return 0 (as if the range wrapped around)
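// For illustration: mostAlignedValueInRange(5, 11) == 8 and
// mostAlignedValueInRange(0x121, 0x234) == 0x200.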
952 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
953   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
954 }
955 
956 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
957                                                 const GCNSubtarget &STI,
958                                                 CombineInfo &Paired,
959                                                 bool Modify) {
960   assert(CI.InstClass != MIMG);
961 
962   // XXX - Would the same offset be OK? Is there any reason this would happen or
963   // be useful?
964   if (CI.Offset == Paired.Offset)
965     return false;
966 
967   // This won't be valid if the offset isn't aligned.
968   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
969     return false;
970 
971   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
972 
973     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
974         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
975     if (!Info0)
976       return false;
977     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
978         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
979     if (!Info1)
980       return false;
981 
982     if (Info0->BitsPerComp != Info1->BitsPerComp ||
983         Info0->NumFormat != Info1->NumFormat)
984       return false;
985 
986     // TODO: Should be possible to support more formats, but if format loads
987     // are not dword-aligned, the merged load might not be valid.
988     if (Info0->BitsPerComp != 32)
989       return false;
990 
991     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
992       return false;
993   }
994 
995   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
996   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
997   CI.UseST64 = false;
998   CI.BaseOff = 0;
999 
1000   // Handle all non-DS instructions.
1001   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1002     if (EltOffset0 + CI.Width != EltOffset1 &&
1003             EltOffset1 + Paired.Width != EltOffset0)
1004       return false;
1005     if (CI.CPol != Paired.CPol)
1006       return false;
1007     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1008         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1009       // Reject cases like:
1010       //   dword + dwordx2 -> dwordx3
1011       //   dword + dwordx3 -> dwordx4
1012       // If we tried to combine these cases, we would fail to extract a subreg
1013       // for the result of the second load due to SGPR alignment requirements.
1014       if (CI.Width != Paired.Width &&
1015           (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1016         return false;
1017     }
1018     return true;
1019   }
1020 
1021   // If the offset in elements doesn't fit in 8 bits, we might be able to use
1022   // the stride 64 versions.
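  // For example, two ds_read_b32 at byte offsets 0x0 and 0x4000 have element
  // offsets 0 and 0x1000; both are multiples of 64 and 0x1000/64 == 64 fits in
  // 8 bits, so they can form a single ds_read2st64_b32 with offsets 0 and 64.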
1023   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1024       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1025     if (Modify) {
1026       CI.Offset = EltOffset0 / 64;
1027       Paired.Offset = EltOffset1 / 64;
1028       CI.UseST64 = true;
1029     }
1030     return true;
1031   }
1032 
1033   // Check if the new offsets fit in the reduced 8-bit range.
1034   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1035     if (Modify) {
1036       CI.Offset = EltOffset0;
1037       Paired.Offset = EltOffset1;
1038     }
1039     return true;
1040   }
1041 
1042   // Try to shift base address to decrease offsets.
1043   uint32_t Min = std::min(EltOffset0, EltOffset1);
1044   uint32_t Max = std::max(EltOffset0, EltOffset1);
1045 
1046   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
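  // Mask == 0x3fc0, so this path is taken when the two element offsets differ
  // by a multiple of 64 that still fits in an 8-bit stride-64 offset.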
1047   if (((Max - Min) & ~Mask) == 0) {
1048     if (Modify) {
1049       // From the range of values we could use for BaseOff, choose the one that
1050       // is aligned to the highest power of two, to maximise the chance that
1051       // the same offset can be reused for other load/store pairs.
1052       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1053       // Copy the low bits of the offsets, so that when we adjust them by
1054       // subtracting BaseOff they will be multiples of 64.
1055       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1056       CI.BaseOff = BaseOff * CI.EltSize;
1057       CI.Offset = (EltOffset0 - BaseOff) / 64;
1058       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1059       CI.UseST64 = true;
1060     }
1061     return true;
1062   }
1063 
1064   if (isUInt<8>(Max - Min)) {
1065     if (Modify) {
1066       // From the range of values we could use for BaseOff, choose the one that
1067       // is aligned to the highest power of two, to maximise the chance that
1068       // the same offset can be reused for other load/store pairs.
1069       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1070       CI.BaseOff = BaseOff * CI.EltSize;
1071       CI.Offset = EltOffset0 - BaseOff;
1072       Paired.Offset = EltOffset1 - BaseOff;
1073     }
1074     return true;
1075   }
1076 
1077   return false;
1078 }
1079 
1080 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1081                                      const CombineInfo &CI,
1082                                      const CombineInfo &Paired) {
1083   const unsigned Width = (CI.Width + Paired.Width);
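  // E.g. merging a dword access with a dwordx2 access gives Width == 3, which
  // is only allowed on subtargets with dwordx3 load/store support.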
1084   switch (CI.InstClass) {
1085   default:
1086     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1087   case S_BUFFER_LOAD_IMM:
1088   case S_BUFFER_LOAD_SGPR_IMM:
1089   case S_LOAD_IMM:
1090     switch (Width) {
1091     default:
1092       return false;
1093     case 2:
1094     case 4:
1095     case 8:
1096       return true;
1097     case 3:
1098       return STM.hasScalarDwordx3Loads();
1099     }
1100   }
1101 }
1102 
1103 const TargetRegisterClass *
1104 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1105   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1106     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1107   }
1108   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1109     return TRI->getRegClassForReg(*MRI, Src->getReg());
1110   }
1111   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1112     return TRI->getRegClassForReg(*MRI, Src->getReg());
1113   }
1114   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1115     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1116   }
1117   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1118     return TRI->getRegClassForReg(*MRI, Src->getReg());
1119   }
1120   return nullptr;
1121 }
1122 
1123 /// This function assumes that CI comes before Paired in a basic block. Return
1124 /// an insertion point for the merged instruction or nullptr on failure.
1125 SILoadStoreOptimizer::CombineInfo *
1126 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1127                                            CombineInfo &Paired) {
1128   // If another instruction has already been merged into CI, it may now be a
1129   // type that we can't do any further merging into.
1130   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1131     return nullptr;
1132   assert(CI.InstClass == Paired.InstClass);
1133 
1134   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1135       getInstSubclass(Paired.I->getOpcode(), *TII))
1136     return nullptr;
1137 
1138   // Check both offsets (or masks for MIMG) can be combined and fit in the
1139   // reduced range.
1140   if (CI.InstClass == MIMG) {
1141     if (!dmasksCanBeCombined(CI, *TII, Paired))
1142       return nullptr;
1143   } else {
1144     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1145       return nullptr;
1146   }
1147 
1148   DenseSet<Register> RegDefs;
1149   DenseSet<Register> RegUses;
1150   CombineInfo *Where;
1151   if (CI.I->mayLoad()) {
1152     // Try to hoist Paired up to CI.
1153     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1154     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1155       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1156         return nullptr;
1157     }
1158     Where = &CI;
1159   } else {
1160     // Try to sink CI down to Paired.
1161     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1162     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1163       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1164         return nullptr;
1165     }
1166     Where = &Paired;
1167   }
1168 
1169   // Call offsetsCanBeCombined with modify = true so that the offsets are
1170   // correct for the new instruction.  This should return true, because
1171   // this function should only be called on CombineInfo objects that
1172   // have already been confirmed to be mergeable.
1173   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1174     offsetsCanBeCombined(CI, *STM, Paired, true);
1175   return Where;
1176 }
1177 
1178 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1179   if (STM->ldsRequiresM0Init())
1180     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1181   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1182 }
1183 
1184 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1185   if (STM->ldsRequiresM0Init())
1186     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1187 
1188   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1189                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1190 }
1191 
1192 MachineBasicBlock::iterator
1193 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1194                                      MachineBasicBlock::iterator InsertBefore) {
1195   MachineBasicBlock *MBB = CI.I->getParent();
1196 
1197   // Be careful, since the addresses could be subregisters themselves in weird
1198   // cases, like vectors of pointers.
1199   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1200 
1201   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1202   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1203 
1204   unsigned NewOffset0 = CI.Offset;
1205   unsigned NewOffset1 = Paired.Offset;
1206   unsigned Opc =
1207       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1208 
1209   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1210   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
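  // With 4-byte elements the merged read2 defines a 64-bit register (sub0 and
  // sub1); with 8-byte elements it defines a 128-bit register (sub0_sub1 and
  // sub2_sub3).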
1211 
1212   if (NewOffset0 > NewOffset1) {
1213     // Canonicalize the merged instruction so the smaller offset comes first.
1214     std::swap(NewOffset0, NewOffset1);
1215     std::swap(SubRegIdx0, SubRegIdx1);
1216   }
1217 
1218   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1219          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1220 
1221   const MCInstrDesc &Read2Desc = TII->get(Opc);
1222 
1223   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1224   Register DestReg = MRI->createVirtualRegister(SuperRC);
1225 
1226   DebugLoc DL = CI.I->getDebugLoc();
1227 
1228   Register BaseReg = AddrReg->getReg();
1229   unsigned BaseSubReg = AddrReg->getSubReg();
1230   unsigned BaseRegFlags = 0;
1231   if (CI.BaseOff) {
1232     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1233     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1234         .addImm(CI.BaseOff);
1235 
1236     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1237     BaseRegFlags = RegState::Kill;
1238 
1239     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1240         .addReg(ImmReg)
1241         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1242         .addImm(0); // clamp bit
1243     BaseSubReg = 0;
1244   }
1245 
1246   MachineInstrBuilder Read2 =
1247       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1248           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1249           .addImm(NewOffset0)                        // offset0
1250           .addImm(NewOffset1)                        // offset1
1251           .addImm(0)                                 // gds
1252           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1253 
1254   (void)Read2;
1255 
1256   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1257 
1258   // Copy to the old destination registers.
1259   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1260       .add(*Dest0) // Copy to same destination including flags and sub reg.
1261       .addReg(DestReg, 0, SubRegIdx0);
1262   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1263       .add(*Dest1)
1264       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1265 
1266   CI.I->eraseFromParent();
1267   Paired.I->eraseFromParent();
1268 
1269   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1270   return Read2;
1271 }
1272 
1273 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1274   if (STM->ldsRequiresM0Init())
1275     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1276   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1277                         : AMDGPU::DS_WRITE2_B64_gfx9;
1278 }
1279 
1280 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1281   if (STM->ldsRequiresM0Init())
1282     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1283                           : AMDGPU::DS_WRITE2ST64_B64;
1284 
1285   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1286                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1287 }
1288 
1289 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1290     CombineInfo &CI, CombineInfo &Paired,
1291     MachineBasicBlock::iterator InsertBefore) {
1292   MachineBasicBlock *MBB = CI.I->getParent();
1293 
1294   // Be sure to use .add(), and not .addReg(), with these. We want to be
1295   // sure we preserve the subregister index and any register flags set on them.
1296   const MachineOperand *AddrReg =
1297       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1298   const MachineOperand *Data0 =
1299       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1300   const MachineOperand *Data1 =
1301       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1302 
1303   unsigned NewOffset0 = CI.Offset;
1304   unsigned NewOffset1 = Paired.Offset;
1305   unsigned Opc =
1306       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1307 
1308   if (NewOffset0 > NewOffset1) {
1309     // Canonicalize the merged instruction so the smaller offset comes first.
1310     std::swap(NewOffset0, NewOffset1);
1311     std::swap(Data0, Data1);
1312   }
1313 
1314   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1315          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1316 
1317   const MCInstrDesc &Write2Desc = TII->get(Opc);
1318   DebugLoc DL = CI.I->getDebugLoc();
1319 
1320   Register BaseReg = AddrReg->getReg();
1321   unsigned BaseSubReg = AddrReg->getSubReg();
1322   unsigned BaseRegFlags = 0;
1323   if (CI.BaseOff) {
1324     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1325     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1326         .addImm(CI.BaseOff);
1327 
1328     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1329     BaseRegFlags = RegState::Kill;
1330 
1331     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1332         .addReg(ImmReg)
1333         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1334         .addImm(0); // clamp bit
1335     BaseSubReg = 0;
1336   }
1337 
1338   MachineInstrBuilder Write2 =
1339       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1340           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1341           .add(*Data0)                               // data0
1342           .add(*Data1)                               // data1
1343           .addImm(NewOffset0)                        // offset0
1344           .addImm(NewOffset1)                        // offset1
1345           .addImm(0)                                 // gds
1346           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1347 
1348   CI.I->eraseFromParent();
1349   Paired.I->eraseFromParent();
1350 
1351   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1352   return Write2;
1353 }
1354 
1355 MachineBasicBlock::iterator
1356 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1357                                      MachineBasicBlock::iterator InsertBefore) {
1358   MachineBasicBlock *MBB = CI.I->getParent();
1359   DebugLoc DL = CI.I->getDebugLoc();
1360   const unsigned Opcode = getNewOpcode(CI, Paired);
1361 
1362   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1363 
1364   Register DestReg = MRI->createVirtualRegister(SuperRC);
1365   unsigned MergedDMask = CI.DMask | Paired.DMask;
1366   unsigned DMaskIdx =
1367       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1368 
1369   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1370   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1371     if (I == DMaskIdx)
1372       MIB.addImm(MergedDMask);
1373     else
1374       MIB.add((*CI.I).getOperand(I));
1375   }
1376 
1377   // It shouldn't be possible to get this far if the two instructions
1378   // don't have a single memoperand, because MachineInstr::mayAlias()
1379   // will return true if this is the case.
1380   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1381 
1382   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1383 
1384   unsigned SubRegIdx0, SubRegIdx1;
1385   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1386 
1387   // Copy to the old destination registers.
1388   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1389   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1390   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1391 
1392   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1393       .add(*Dest0) // Copy to same destination including flags and sub reg.
1394       .addReg(DestReg, 0, SubRegIdx0);
1395   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1396       .add(*Dest1)
1397       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1398 
1399   CI.I->eraseFromParent();
1400   Paired.I->eraseFromParent();
1401   return New;
1402 }
1403 
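// Merge a pair of scalar memory loads into a single wider load. Illustrative
// sketch (registers and offsets are made up, not taken from a test):
//   s_load_dword s4, s[0:1], 0x10
//   s_load_dword s5, s[0:1], 0x14
// ==>
//   s_load_dwordx2 s[4:5], s[0:1], 0x10
// The merged load uses the smaller of the two offsets; the original
// destinations are rewritten as COPYs from sub-registers of the new result.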
1404 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1405     CombineInfo &CI, CombineInfo &Paired,
1406     MachineBasicBlock::iterator InsertBefore) {
1407   MachineBasicBlock *MBB = CI.I->getParent();
1408   DebugLoc DL = CI.I->getDebugLoc();
1409   const unsigned Opcode = getNewOpcode(CI, Paired);
1410 
1411   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1412 
1413   Register DestReg = MRI->createVirtualRegister(SuperRC);
1414   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1415 
1416   // It shouldn't be possible to get this far if the two instructions
1417   // don't have a single memoperand, because MachineInstr::mayAlias()
1418   // will return true if this is the case.
1419   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1420 
1421   MachineInstrBuilder New =
1422       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1423           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1424   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1425     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1426   New.addImm(MergedOffset);
1427   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1428 
1429   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1430   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1431   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1432 
1433   // Copy to the old destination registers.
1434   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1435   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1436   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1437 
1438   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1439       .add(*Dest0) // Copy to same destination including flags and sub reg.
1440       .addReg(DestReg, 0, SubRegIdx0);
1441   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1442       .add(*Dest1)
1443       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1444 
1445   CI.I->eraseFromParent();
1446   Paired.I->eraseFromParent();
1447   return New;
1448 }
1449 
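// Merge a pair of MUBUF buffer loads into one wider load, e.g. two
// buffer_load_dword instructions can become a buffer_load_dwordx2 (the exact
// opcode is chosen by getNewOpcode). The merged instruction reuses CI's
// vaddr/srsrc/soffset operands and the minimum of the two immediate offsets.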
1450 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1451     CombineInfo &CI, CombineInfo &Paired,
1452     MachineBasicBlock::iterator InsertBefore) {
1453   MachineBasicBlock *MBB = CI.I->getParent();
1454   DebugLoc DL = CI.I->getDebugLoc();
1455 
1456   const unsigned Opcode = getNewOpcode(CI, Paired);
1457 
1458   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1459 
1460   // Create the new, wider destination register for the merged load.
1461   Register DestReg = MRI->createVirtualRegister(SuperRC);
1462   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1463 
1464   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1465 
1466   AddressRegs Regs = getRegs(Opcode, *TII);
1467 
1468   if (Regs.VAddr)
1469     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1470 
1471   // It shouldn't be possible to get this far if the two instructions
1472   // don't have a single memoperand, because MachineInstr::mayAlias()
1473   // will return true if this is the case.
1474   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1475 
1476   MachineInstr *New =
1477     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1478         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1479         .addImm(MergedOffset) // offset
1480         .addImm(CI.CPol)      // cpol
1481         .addImm(0)            // swz
1482         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1483 
1484   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1485   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1486   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1487 
1488   // Copy to the old destination registers.
1489   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1490   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1491   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1492 
1493   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1494       .add(*Dest0) // Copy to same destination including flags and sub reg.
1495       .addReg(DestReg, 0, SubRegIdx0);
1496   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1497       .add(*Dest1)
1498       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1499 
1500   CI.I->eraseFromParent();
1501   Paired.I->eraseFromParent();
1502   return New;
1503 }
1504 
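// Merge a pair of MTBUF (typed buffer) loads. In addition to widening the
// destination, the buffer format is recomputed so that its component count
// matches the combined width (see getBufferFormatWithCompCount).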
1505 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1506     CombineInfo &CI, CombineInfo &Paired,
1507     MachineBasicBlock::iterator InsertBefore) {
1508   MachineBasicBlock *MBB = CI.I->getParent();
1509   DebugLoc DL = CI.I->getDebugLoc();
1510 
1511   const unsigned Opcode = getNewOpcode(CI, Paired);
1512 
1513   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1514 
1515   // Create the new, wider destination register for the merged load.
1516   Register DestReg = MRI->createVirtualRegister(SuperRC);
1517   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1518 
1519   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1520 
1521   AddressRegs Regs = getRegs(Opcode, *TII);
1522 
1523   if (Regs.VAddr)
1524     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1525 
1526   unsigned JoinedFormat =
1527       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1528 
1529   // It shouldn't be possible to get this far if the two instructions
1530   // don't have a single memoperand, because MachineInstr::mayAlias()
1531   // will return true if this is the case.
1532   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1533 
1534   MachineInstr *New =
1535       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1536           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1537           .addImm(MergedOffset) // offset
1538           .addImm(JoinedFormat) // format
1539           .addImm(CI.CPol)      // cpol
1540           .addImm(0)            // swz
1541           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1542 
1543   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1544   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1545   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1546 
1547   // Copy to the old destination registers.
1548   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1549   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1550   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1551 
1552   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1553       .add(*Dest0) // Copy to same destination including flags and sub reg.
1554       .addReg(DestReg, 0, SubRegIdx0);
1555   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1556       .add(*Dest1)
1557       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1558 
1559   CI.I->eraseFromParent();
1560   Paired.I->eraseFromParent();
1561   return New;
1562 }
1563 
1564 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1565     CombineInfo &CI, CombineInfo &Paired,
1566     MachineBasicBlock::iterator InsertBefore) {
1567   MachineBasicBlock *MBB = CI.I->getParent();
1568   DebugLoc DL = CI.I->getDebugLoc();
1569 
1570   const unsigned Opcode = getNewOpcode(CI, Paired);
1571 
1572   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1573   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1574   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1575 
1576   // Copy to the new source register.
1577   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1578   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1579 
1580   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1581   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1582 
1583   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1584       .add(*Src0)
1585       .addImm(SubRegIdx0)
1586       .add(*Src1)
1587       .addImm(SubRegIdx1);
1588 
1589   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1590                  .addReg(SrcReg, RegState::Kill);
1591 
1592   AddressRegs Regs = getRegs(Opcode, *TII);
1593 
1594   if (Regs.VAddr)
1595     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1596 
1597   unsigned JoinedFormat =
1598       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1599 
1600   // It shouldn't be possible to get this far if the two instructions
1601   // don't have a single memoperand, because MachineInstr::mayAlias()
1602   // will return true if this is the case.
1603   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1604 
1605   MachineInstr *New =
1606       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1607           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1608           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1609           .addImm(JoinedFormat)                       // format
1610           .addImm(CI.CPol)                            // cpol
1611           .addImm(0)                                  // swz
1612           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1613 
1614   CI.I->eraseFromParent();
1615   Paired.I->eraseFromParent();
1616   return New;
1617 }
1618 
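// Merge a pair of FLAT/GLOBAL loads into one wider load. Illustrative sketch
// (registers and offsets are made up):
//   global_load_dword v0, v[2:3], off
//   global_load_dword v1, v[2:3], off offset:4
// ==>
//   global_load_dwordx2 v[0:1], v[2:3], off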
1619 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1620     CombineInfo &CI, CombineInfo &Paired,
1621     MachineBasicBlock::iterator InsertBefore) {
1622   MachineBasicBlock *MBB = CI.I->getParent();
1623   DebugLoc DL = CI.I->getDebugLoc();
1624 
1625   const unsigned Opcode = getNewOpcode(CI, Paired);
1626 
1627   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1628   Register DestReg = MRI->createVirtualRegister(SuperRC);
1629 
1630   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1631 
1632   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1633     MIB.add(*SAddr);
1634 
1635   MachineInstr *New =
1636     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1637        .addImm(std::min(CI.Offset, Paired.Offset))
1638        .addImm(CI.CPol)
1639        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1640 
1641   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1642   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1643   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1644 
1645   // Copy to the old destination registers.
1646   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1647   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1648   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1649 
1650   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1651       .add(*Dest0) // Copy to same destination including flags and sub reg.
1652       .addReg(DestReg, 0, SubRegIdx0);
1653   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1654       .add(*Dest1)
1655       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1656 
1657   CI.I->eraseFromParent();
1658   Paired.I->eraseFromParent();
1659   return New;
1660 }
1661 
1662 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1663     CombineInfo &CI, CombineInfo &Paired,
1664     MachineBasicBlock::iterator InsertBefore) {
1665   MachineBasicBlock *MBB = CI.I->getParent();
1666   DebugLoc DL = CI.I->getDebugLoc();
1667 
1668   const unsigned Opcode = getNewOpcode(CI, Paired);
1669 
1670   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1671   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1672   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1673 
1674   // Copy to the new source register.
1675   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1676   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1677 
1678   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1679   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1680 
1681   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1682       .add(*Src0)
1683       .addImm(SubRegIdx0)
1684       .add(*Src1)
1685       .addImm(SubRegIdx1);
1686 
1687   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1688                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1689                  .addReg(SrcReg, RegState::Kill);
1690 
1691   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1692     MIB.add(*SAddr);
1693 
1694   MachineInstr *New =
1695     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1696        .addImm(CI.CPol)
1697        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1698 
1699   CI.I->eraseFromParent();
1700   Paired.I->eraseFromParent();
1701   return New;
1702 }
1703 
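// Select the opcode of the merged instruction from the combined width of the
// pair. For example (a sketch of the mapping below), two single-dword
// S_LOAD_*_IMM loads (Width 1 + 1) map to S_LOAD_DWORDX2_IMM, and a
// dwordx2 + dword pair maps to the DWORDX3 variant. A return value of 0 means
// the width combination is not supported for that instruction class.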
1704 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1705                                             const CombineInfo &Paired) {
1706   const unsigned Width = CI.Width + Paired.Width;
1707 
1708   switch (getCommonInstClass(CI, Paired)) {
1709   default:
1710     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1711     // FIXME: Handle d16 correctly
1712     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1713                                   Width);
1714   case TBUFFER_LOAD:
1715   case TBUFFER_STORE:
1716     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1717                                   Width);
1718 
1719   case UNKNOWN:
1720     llvm_unreachable("Unknown instruction class");
1721   case S_BUFFER_LOAD_IMM:
1722     switch (Width) {
1723     default:
1724       return 0;
1725     case 2:
1726       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1727     case 3:
1728       return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1729     case 4:
1730       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1731     case 8:
1732       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1733     }
1734   case S_BUFFER_LOAD_SGPR_IMM:
1735     switch (Width) {
1736     default:
1737       return 0;
1738     case 2:
1739       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1740     case 3:
1741       return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1742     case 4:
1743       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1744     case 8:
1745       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1746     }
1747   case S_LOAD_IMM:
1748     switch (Width) {
1749     default:
1750       return 0;
1751     case 2:
1752       return AMDGPU::S_LOAD_DWORDX2_IMM;
1753     case 3:
1754       return AMDGPU::S_LOAD_DWORDX3_IMM;
1755     case 4:
1756       return AMDGPU::S_LOAD_DWORDX4_IMM;
1757     case 8:
1758       return AMDGPU::S_LOAD_DWORDX8_IMM;
1759     }
1760   case GLOBAL_LOAD:
1761     switch (Width) {
1762     default:
1763       return 0;
1764     case 2:
1765       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1766     case 3:
1767       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1768     case 4:
1769       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1770     }
1771   case GLOBAL_LOAD_SADDR:
1772     switch (Width) {
1773     default:
1774       return 0;
1775     case 2:
1776       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1777     case 3:
1778       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1779     case 4:
1780       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1781     }
1782   case GLOBAL_STORE:
1783     switch (Width) {
1784     default:
1785       return 0;
1786     case 2:
1787       return AMDGPU::GLOBAL_STORE_DWORDX2;
1788     case 3:
1789       return AMDGPU::GLOBAL_STORE_DWORDX3;
1790     case 4:
1791       return AMDGPU::GLOBAL_STORE_DWORDX4;
1792     }
1793   case GLOBAL_STORE_SADDR:
1794     switch (Width) {
1795     default:
1796       return 0;
1797     case 2:
1798       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1799     case 3:
1800       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1801     case 4:
1802       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1803     }
1804   case FLAT_LOAD:
1805     switch (Width) {
1806     default:
1807       return 0;
1808     case 2:
1809       return AMDGPU::FLAT_LOAD_DWORDX2;
1810     case 3:
1811       return AMDGPU::FLAT_LOAD_DWORDX3;
1812     case 4:
1813       return AMDGPU::FLAT_LOAD_DWORDX4;
1814     }
1815   case FLAT_STORE:
1816     switch (Width) {
1817     default:
1818       return 0;
1819     case 2:
1820       return AMDGPU::FLAT_STORE_DWORDX2;
1821     case 3:
1822       return AMDGPU::FLAT_STORE_DWORDX3;
1823     case 4:
1824       return AMDGPU::FLAT_STORE_DWORDX4;
1825     }
1826   case MIMG:
1827     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1828            "No overlaps");
1829     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1830   }
1831 }
1832 
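// Compute the sub-register indices used to extract the two original values
// from the merged super-register. Worked example (a sketch): if CI comes
// first with Width == 1 and Paired has Width == 2, the result is
// (AMDGPU::sub0, AMDGPU::sub1_sub2), i.e. CI keeps dword 0 and Paired gets
// dwords 1-2 of the merged register.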
1833 std::pair<unsigned, unsigned>
1834 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1835                                     const CombineInfo &Paired) {
1836   assert((CI.InstClass != MIMG ||
1837           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1838            CI.Width + Paired.Width)) &&
1839          "No overlaps");
1840 
1841   unsigned Idx0;
1842   unsigned Idx1;
1843 
1844   static const unsigned Idxs[5][4] = {
1845       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1846       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1847       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1848       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1849       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1850   };
1851 
1852   assert(CI.Width >= 1 && CI.Width <= 4);
1853   assert(Paired.Width >= 1 && Paired.Width <= 4);
1854 
1855   if (Paired < CI) {
1856     Idx1 = Idxs[0][Paired.Width - 1];
1857     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1858   } else {
1859     Idx0 = Idxs[0][CI.Width - 1];
1860     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1861   }
1862 
1863   return std::pair(Idx0, Idx1);
1864 }
1865 
1866 const TargetRegisterClass *
1867 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1868                                              const CombineInfo &Paired) {
1869   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1870       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1871     switch (CI.Width + Paired.Width) {
1872     default:
1873       return nullptr;
1874     case 2:
1875       return &AMDGPU::SReg_64_XEXECRegClass;
1876     case 3:
1877       return &AMDGPU::SGPR_96RegClass;
1878     case 4:
1879       return &AMDGPU::SGPR_128RegClass;
1880     case 8:
1881       return &AMDGPU::SGPR_256RegClass;
1882     case 16:
1883       return &AMDGPU::SGPR_512RegClass;
1884     }
1885   }
1886 
1887   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1888   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1889              ? TRI->getAGPRClassForBitWidth(BitWidth)
1890              : TRI->getVGPRClassForBitWidth(BitWidth);
1891 }
1892 
1893 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1894     CombineInfo &CI, CombineInfo &Paired,
1895     MachineBasicBlock::iterator InsertBefore) {
1896   MachineBasicBlock *MBB = CI.I->getParent();
1897   DebugLoc DL = CI.I->getDebugLoc();
1898 
1899   const unsigned Opcode = getNewOpcode(CI, Paired);
1900 
1901   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1902   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1903   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1904 
1905   // Copy to the new source register.
1906   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1907   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1908 
1909   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1910   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1911 
1912   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1913       .add(*Src0)
1914       .addImm(SubRegIdx0)
1915       .add(*Src1)
1916       .addImm(SubRegIdx1);
1917 
1918   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1919                  .addReg(SrcReg, RegState::Kill);
1920 
1921   AddressRegs Regs = getRegs(Opcode, *TII);
1922 
1923   if (Regs.VAddr)
1924     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1925 
1926 
1927   // It shouldn't be possible to get this far if the two instructions
1928   // don't have a single memoperand, because MachineInstr::mayAlias()
1929   // will return true if this is the case.
1930   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1931 
1932   MachineInstr *New =
1933     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1934         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1935         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1936         .addImm(CI.CPol)                            // cpol
1937         .addImm(0)                                  // swz
1938         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1939 
1940   CI.I->eraseFromParent();
1941   Paired.I->eraseFromParent();
1942   return New;
1943 }
1944 
1945 MachineOperand
1946 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1947   APInt V(32, Val, true);
1948   if (TII->isInlineConstant(V))
1949     return MachineOperand::CreateImm(Val);
1950 
1951   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1952   MachineInstr *Mov =
1953   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1954           TII->get(AMDGPU::S_MOV_B32), Reg)
1955     .addImm(Val);
1956   (void)Mov;
1957   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1958   return MachineOperand::CreateReg(Reg, false);
1959 }
1960 
1961 // Compute base address using Addr and return the final register.
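// A sketch of the expansion (register names are illustrative only):
//   %OffLo = S_MOV_B32 lo32(Addr.Offset)   ; or an inline immediate
//   %OffHi = S_MOV_B32 hi32(Addr.Offset)   ; or an inline immediate
//   %Lo, %Carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, %OffLo
//   %Hi = V_ADDC_U32_e64 Addr.Base.HiReg, %OffHi, %Carry
//   %NewBase = REG_SEQUENCE %Lo, %subreg.sub0, %Hi, %subreg.sub1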
1962 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1963                                            const MemAddress &Addr) const {
1964   MachineBasicBlock *MBB = MI.getParent();
1965   MachineBasicBlock::iterator MBBI = MI.getIterator();
1966   DebugLoc DL = MI.getDebugLoc();
1967 
1968   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1969           Addr.Base.LoSubReg) &&
1970          "Expected 32-bit Base-Register-Low!!");
1971 
1972   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1973           Addr.Base.HiSubReg) &&
1974          "Expected 32-bit Base-Register-Hi!!");
1975 
1976   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1977   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1978   MachineOperand OffsetHi =
1979     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1980 
1981   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1982   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1983   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1984 
1985   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1986   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1987   MachineInstr *LoHalf =
1988     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1989       .addReg(CarryReg, RegState::Define)
1990       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1991       .add(OffsetLo)
1992       .addImm(0); // clamp bit
1993   (void)LoHalf;
1994   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1995 
1996   MachineInstr *HiHalf =
1997   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1998     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1999     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2000     .add(OffsetHi)
2001     .addReg(CarryReg, RegState::Kill)
2002     .addImm(0); // clamp bit
2003   (void)HiHalf;
2004   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
2005 
2006   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2007   MachineInstr *FullBase =
2008     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2009       .addReg(DestSub0)
2010       .addImm(AMDGPU::sub0)
2011       .addReg(DestSub1)
2012       .addImm(AMDGPU::sub1);
2013   (void)FullBase;
2014   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
2015 
2016   return FullDestReg;
2017 }
2018 
2019 // Update base and offset with the NewBase and NewOffset in MI.
2020 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2021                                                Register NewBase,
2022                                                int32_t NewOffset) const {
2023   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2024   Base->setReg(NewBase);
2025   Base->setIsKill(false);
2026   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2027 }
2028 
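// Return the constant offset represented by Op: either an immediate operand,
// or the immediate materialized by a unique S_MOV_B32 definition of the
// register; std::nullopt otherwise.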
2029 std::optional<int32_t>
2030 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2031   if (Op.isImm())
2032     return Op.getImm();
2033 
2034   if (!Op.isReg())
2035     return std::nullopt;
2036 
2037   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2038   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2039       !Def->getOperand(1).isImm())
2040     return std::nullopt;
2041 
2042   return Def->getOperand(1).getImm();
2043 }
2044 
2045 // Analyze Base and extract:
2046 //  - 32-bit base registers and subregisters
2047 //  - 64-bit constant offset
2048 // Expecting base computation as:
2049 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2050 //   %LO:vgpr_32, %c:sreg_64_xexec =
2051 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
2052 //   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2053 //   %Base:vreg_64 =
2054 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2055 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2056                                                       MemAddress &Addr) const {
2057   if (!Base.isReg())
2058     return;
2059 
2060   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2061   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2062       || Def->getNumOperands() != 5)
2063     return;
2064 
2065   MachineOperand BaseLo = Def->getOperand(1);
2066   MachineOperand BaseHi = Def->getOperand(3);
2067   if (!BaseLo.isReg() || !BaseHi.isReg())
2068     return;
2069 
2070   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2071   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2072 
2073   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2074       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2075     return;
2076 
2077   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2078   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2079 
2080   auto Offset0P = extractConstOffset(*Src0);
2081   if (Offset0P)
2082     BaseLo = *Src1;
2083   else {
2084     if (!(Offset0P = extractConstOffset(*Src1)))
2085       return;
2086     BaseLo = *Src0;
2087   }
2088 
2089   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2090   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2091 
2092   if (Src0->isImm())
2093     std::swap(Src0, Src1);
2094 
2095   if (!Src1->isImm())
2096     return;
2097 
2098   uint64_t Offset1 = Src1->getImm();
2099   BaseHi = *Src0;
2100 
2101   Addr.Base.LoReg = BaseLo.getReg();
2102   Addr.Base.HiReg = BaseHi.getReg();
2103   Addr.Base.LoSubReg = BaseLo.getSubReg();
2104   Addr.Base.HiSubReg = BaseHi.getSubReg();
2105   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2106 }
2107 
2108 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2109     MachineInstr &MI,
2110     MemInfoMap &Visited,
2111     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2112 
2113   if (!(MI.mayLoad() ^ MI.mayStore()))
2114     return false;
2115 
2116   // TODO: Support flat and scratch.
2117   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2118     return false;
2119 
2120   if (MI.mayLoad() &&
2121       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2122     return false;
2123 
2124   if (AnchorList.count(&MI))
2125     return false;
2126 
2127   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2128 
2129   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2130     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2131     return false;
2132   }
2133 
2134   // Step1: Find the base-registers and a 64bit constant offset.
2135   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2136   MemAddress MAddr;
2137   if (!Visited.contains(&MI)) {
2138     processBaseWithConstOffset(Base, MAddr);
2139     Visited[&MI] = MAddr;
2140   } else
2141     MAddr = Visited[&MI];
2142 
2143   if (MAddr.Offset == 0) {
2144     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2145                          " constant offsets that can be promoted.\n";);
2146     return false;
2147   }
2148 
2149   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2150              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2151 
2152   // Step2: Traverse MI's basic block and find an anchor (an instruction with
2153   // the same base registers) at the largest 13-bit-reachable distance from
2154   // MI's offset. E.g. (64-bit loads)
2155   // bb:
2156   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2157   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2158   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2159   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2160   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2161   //
2162   // Starting from the first load, the optimization will try to find a new base
2163   // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2164   // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2165   // &a + 8192 as the new base (anchor) because the larger distance can
2166   // presumably accommodate more intermediate bases.
2167   //
2168   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2169   // (&a + 8192) for load1, load2, load4.
2170   //   addr = &a + 8192
2171   //   load1 = load(addr,       -4096)
2172   //   load2 = load(addr,       -2048)
2173   //   load3 = load(addr,       0)
2174   //   load4 = load(addr,       2048)
2175   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2176   //
2177   MachineInstr *AnchorInst = nullptr;
2178   MemAddress AnchorAddr;
2179   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2180   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2181 
2182   MachineBasicBlock *MBB = MI.getParent();
2183   MachineBasicBlock::iterator E = MBB->end();
2184   MachineBasicBlock::iterator MBBI = MI.getIterator();
2185   ++MBBI;
2186   const SITargetLowering *TLI =
2187     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2188 
2189   for ( ; MBBI != E; ++MBBI) {
2190     MachineInstr &MINext = *MBBI;
2191     // TODO: Support finding an anchor(with same base) from store addresses or
2192     // any other load addresses where the opcodes are different.
2193     if (MINext.getOpcode() != MI.getOpcode() ||
2194         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2195       continue;
2196 
2197     const MachineOperand &BaseNext =
2198       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2199     MemAddress MAddrNext;
2200     if (!Visited.contains(&MINext)) {
2201       processBaseWithConstOffset(BaseNext, MAddrNext);
2202       Visited[&MINext] = MAddrNext;
2203     } else
2204       MAddrNext = Visited[&MINext];
2205 
2206     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2207         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2208         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2209         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2210       continue;
2211 
2212     InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2213 
2214     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2215     TargetLoweringBase::AddrMode AM;
2216     AM.HasBaseReg = true;
2217     AM.BaseOffs = Dist;
2218     if (TLI->isLegalGlobalAddressingMode(AM) &&
2219         (uint32_t)std::abs(Dist) > MaxDist) {
2220       MaxDist = std::abs(Dist);
2221 
2222       AnchorAddr = MAddrNext;
2223       AnchorInst = &MINext;
2224     }
2225   }
2226 
2227   if (AnchorInst) {
2228     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2229                AnchorInst->dump());
2230     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2231                <<  AnchorAddr.Offset << "\n\n");
2232 
2233     // Instead of moving up, just re-compute anchor-instruction's base address.
2234     Register Base = computeBase(MI, AnchorAddr);
2235 
2236     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2237     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2238 
2239     for (auto P : InstsWCommonBase) {
2240       TargetLoweringBase::AddrMode AM;
2241       AM.HasBaseReg = true;
2242       AM.BaseOffs = P.second - AnchorAddr.Offset;
2243 
2244       if (TLI->isLegalGlobalAddressingMode(AM)) {
2245         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
2246                    dbgs() << ")"; P.first->dump());
2247         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2248         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
2249       }
2250     }
2251     AnchorList.insert(AnchorInst);
2252     return true;
2253   }
2254 
2255   return false;
2256 }
2257 
2258 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2259                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2260   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2261     if (AddrList.front().InstClass == CI.InstClass &&
2262         AddrList.front().IsAGPR == CI.IsAGPR &&
2263         AddrList.front().hasSameBaseAddress(CI)) {
2264       AddrList.emplace_back(CI);
2265       return;
2266     }
2267   }
2268 
2269   // Base address not found, so add a new list.
2270   MergeableInsts.emplace_back(1, CI);
2271 }
2272 
2273 std::pair<MachineBasicBlock::iterator, bool>
2274 SILoadStoreOptimizer::collectMergeableInsts(
2275     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2276     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2277     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2278   bool Modified = false;
2279 
2280   // Sort potentially mergeable instructions into lists, one per base address.
2281   unsigned Order = 0;
2282   MachineBasicBlock::iterator BlockI = Begin;
2283   for (; BlockI != End; ++BlockI) {
2284     MachineInstr &MI = *BlockI;
2285 
2286     // We run this before checking if an address is mergeable, because it can produce
2287     // better code even if the instructions aren't mergeable.
2288     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2289       Modified = true;
2290 
2291     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2292     // barriers. Merging resumes after the barrier with a separate merge list.
2293     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2294       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2295 
2296       // Search will resume after this instruction in a separate merge list.
2297       ++BlockI;
2298       break;
2299     }
2300 
2301     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2302     if (InstClass == UNKNOWN)
2303       continue;
2304 
2305     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2306     int Swizzled =
2307         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2308     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2309       continue;
2310 
2311     CombineInfo CI;
2312     CI.setMI(MI, *this);
2313     CI.Order = Order++;
2314 
2315     if (!CI.hasMergeableAddress(*MRI))
2316       continue;
2317 
2318     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2319       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2320       //        operands. However, we report that ds_write2 takes only VGPR
2321       //        data so that machine copy propagation does not create an
2322       //        illegal instruction with mixed VGPR and AGPR sources.
2323       //        Consequently, if we created such an instruction, the verifier
2324       //        would complain.
2325       continue;
2326     }
2327 
2328     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2329 
2330     addInstToMergeableList(CI, MergeableInsts);
2331   }
2332 
2333   // At this point we have lists of mergeable instructions.
2334   //
2335   // Part 2: Sort each list by offset so that merge candidates end up adjacent.
2336   // The actual pairing and merging is done later, per list, in
2337   // optimizeInstsWithSameBaseAddr(). Lists with fewer than two entries can
2338   // never produce a merge, so they are discarded here.
2339 
2340   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2341                                                    E = MergeableInsts.end(); I != E;) {
2342 
2343     std::list<CombineInfo> &MergeList = *I;
2344     if (MergeList.size() <= 1) {
2345       // This means we have found only one instruction with a given address
2346       // that can be merged, and we need at least 2 instructions to do a merge,
2347       // so this list can be discarded.
2348       I = MergeableInsts.erase(I);
2349       continue;
2350     }
2351 
2352     // Sort the lists by offset; this way mergeable instructions will be
2353     // adjacent to each other in the list, which will make it easier to find
2354     // matches.
2355     MergeList.sort(
2356         [] (const CombineInfo &A, const CombineInfo &B) {
2357           return A.Offset < B.Offset;
2358         });
2359     ++I;
2360   }
2361 
2362   return std::pair(BlockI, Modified);
2363 }
2364 
2365 // Scan through looking for adjacent LDS operations with constant offsets from
2366 // the same base register. We rely on the scheduler to do the hard work of
2367 // clustering nearby loads, and assume these are all adjacent.
2368 bool SILoadStoreOptimizer::optimizeBlock(
2369                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2370   bool Modified = false;
2371 
2372   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2373                                                    E = MergeableInsts.end(); I != E;) {
2374     std::list<CombineInfo> &MergeList = *I;
2375 
2376     bool OptimizeListAgain = false;
2377     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2378       // We weren't able to make any changes, so delete the list so we don't
2379       // process the same instructions the next time we try to optimize this
2380       // block.
2381       I = MergeableInsts.erase(I);
2382       continue;
2383     }
2384 
2385     Modified = true;
2386 
2387     // We made changes, but also determined that there were no more optimization
2388     // opportunities, so we don't need to reprocess the list
2389     if (!OptimizeListAgain) {
2390       I = MergeableInsts.erase(I);
2391       continue;
2392     }
2393     OptimizeAgain = true;
2394   }
2395   return Modified;
2396 }
2397 
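// Walk a list of CombineInfos that share a base address and merge pairs in
// place. After a successful merge the new (wider) instruction replaces CI in
// the list and may itself be merged again on a later iteration, which is why
// the caller loops while OptimizeAgain is set.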
2398 bool
2399 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2400                                           std::list<CombineInfo> &MergeList,
2401                                           bool &OptimizeListAgain) {
2402   if (MergeList.empty())
2403     return false;
2404 
2405   bool Modified = false;
2406 
2407   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2408        Next = std::next(I)) {
2409 
2410     auto First = I;
2411     auto Second = Next;
2412 
2413     if ((*First).Order > (*Second).Order)
2414       std::swap(First, Second);
2415     CombineInfo &CI = *First;
2416     CombineInfo &Paired = *Second;
2417 
2418     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2419     if (!Where) {
2420       ++I;
2421       continue;
2422     }
2423 
2424     Modified = true;
2425 
2426     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2427 
2428     MachineBasicBlock::iterator NewMI;
2429     switch (CI.InstClass) {
2430     default:
2431       llvm_unreachable("unknown InstClass");
2432       break;
2433     case DS_READ:
2434       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2435       break;
2436     case DS_WRITE:
2437       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2438       break;
2439     case S_BUFFER_LOAD_IMM:
2440     case S_BUFFER_LOAD_SGPR_IMM:
2441     case S_LOAD_IMM:
2442       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2443       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2444       break;
2445     case BUFFER_LOAD:
2446       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2447       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2448       break;
2449     case BUFFER_STORE:
2450       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2451       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2452       break;
2453     case MIMG:
2454       NewMI = mergeImagePair(CI, Paired, Where->I);
2455       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2456       break;
2457     case TBUFFER_LOAD:
2458       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2459       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2460       break;
2461     case TBUFFER_STORE:
2462       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2463       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2464       break;
2465     case FLAT_LOAD:
2466     case GLOBAL_LOAD:
2467     case GLOBAL_LOAD_SADDR:
2468       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2469       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2470       break;
2471     case FLAT_STORE:
2472     case GLOBAL_STORE:
2473     case GLOBAL_STORE_SADDR:
2474       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2475       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2476       break;
2477     }
2478     CI.setMI(NewMI, *this);
2479     CI.Order = Where->Order;
2480     if (I == Second)
2481       I = Next;
2482 
2483     MergeList.erase(Second);
2484   }
2485 
2486   return Modified;
2487 }
2488 
2489 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2490   if (skipFunction(MF.getFunction()))
2491     return false;
2492 
2493   STM = &MF.getSubtarget<GCNSubtarget>();
2494   if (!STM->loadStoreOptEnabled())
2495     return false;
2496 
2497   TII = STM->getInstrInfo();
2498   TRI = &TII->getRegisterInfo();
2499 
2500   MRI = &MF.getRegInfo();
2501   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2502 
2503   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2504 
2505   bool Modified = false;
2506 
2507   // Contains the list of instructions for which constant offsets are being
2508   // promoted to the immediate. This is tracked for an entire block at a time.
2509   SmallPtrSet<MachineInstr *, 4> AnchorList;
2510   MemInfoMap Visited;
2511 
2512   for (MachineBasicBlock &MBB : MF) {
2513     MachineBasicBlock::iterator SectionEnd;
2514     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2515          I = SectionEnd) {
2516       bool CollectModified;
2517       std::list<std::list<CombineInfo>> MergeableInsts;
2518 
2519       // First pass: Collect list of all instructions we know how to merge in a
2520       // subset of the block.
2521       std::tie(SectionEnd, CollectModified) =
2522           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2523 
2524       Modified |= CollectModified;
2525 
2526       do {
2527         OptimizeAgain = false;
2528         Modified |= optimizeBlock(MergeableInsts);
2529       } while (OptimizeAgain);
2530     }
2531 
2532     Visited.clear();
2533     AnchorList.clear();
2534   }
2535 
2536   return Modified;
2537 }
2538