1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from nearby instructions that
// allows it to have a 13-bit constant offset, and then promotes that 13-bit
// offset to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
// - Recomputing live intervals seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
52 //
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together that their differences fit
//   in 8 bits, we can add to the base pointer and use the new, reduced
//   offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   BUFFER_LOAD,
78   BUFFER_STORE,
79   MIMG,
80   TBUFFER_LOAD,
81   TBUFFER_STORE,
82   GLOBAL_LOAD_SADDR,
83   GLOBAL_STORE_SADDR,
84   FLAT_LOAD,
85   FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo; they are only ever returned by
               // getCommonInstClass.
89 };
90 
91 struct AddressRegs {
92   unsigned char NumVAddrs = 0;
93   bool SBase = false;
94   bool SRsrc = false;
95   bool SOffset = false;
96   bool SAddr = false;
97   bool VAddr = false;
98   bool Addr = false;
99   bool SSamp = false;
100 };
101 
102 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
103 const unsigned MaxAddressRegs = 12 + 1 + 1;
104 
105 class SILoadStoreOptimizer : public MachineFunctionPass {
106   struct CombineInfo {
107     MachineBasicBlock::iterator I;
108     unsigned EltSize;
109     unsigned Offset;
110     unsigned Width;
111     unsigned Format;
112     unsigned BaseOff;
113     unsigned DMask;
114     InstClassEnum InstClass;
115     unsigned CPol = 0;
116     bool IsAGPR;
117     bool UseST64;
118     int AddrIdx[MaxAddressRegs];
119     const MachineOperand *AddrReg[MaxAddressRegs];
120     unsigned NumAddresses;
121     unsigned Order;
122 
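    // Return true if every address operand recorded for this CombineInfo
    // matches the corresponding operand of \p MI: equal immediates, or the
    // same register and subregister.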
123     bool hasSameBaseAddress(const MachineInstr &MI) {
124       for (unsigned i = 0; i < NumAddresses; i++) {
125         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
126 
127         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
128           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
129               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
130             return false;
131           }
132           continue;
133         }
134 
135         // Check same base pointer. Be careful of subregisters, which can occur
136         // with vectors of pointers.
137         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
138             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
140         }
141       }
142       return true;
143     }
144 
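    // Return true if every address operand is an immediate or a virtual
    // register with more than one non-debug use, i.e. an address that some
    // other memory instruction might share.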
145     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
146       for (unsigned i = 0; i < NumAddresses; ++i) {
147         const MachineOperand *AddrOp = AddrReg[i];
148         // Immediates are always OK.
149         if (AddrOp->isImm())
150           continue;
151 
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
155         if (!AddrOp->isReg())
156           return false;
157 
158         // TODO: We should be able to merge physical reg addresses.
159         if (AddrOp->getReg().isPhysical())
160           return false;
161 
        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
164         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
165           return false;
166       }
167       return true;
168     }
169 
170     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
171 
172     // Compare by pointer order.
173     bool operator<(const CombineInfo& Other) const {
174       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
175     }
176   };
177 
178   struct BaseRegisters {
179     Register LoReg;
180     Register HiReg;
181 
182     unsigned LoSubReg = 0;
183     unsigned HiSubReg = 0;
184   };
185 
186   struct MemAddress {
187     BaseRegisters Base;
188     int64_t Offset = 0;
189   };
190 
191   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
192 
193 private:
194   const GCNSubtarget *STM = nullptr;
195   const SIInstrInfo *TII = nullptr;
196   const SIRegisterInfo *TRI = nullptr;
197   MachineRegisterInfo *MRI = nullptr;
198   AliasAnalysis *AA = nullptr;
199   bool OptimizeAgain;
200 
201   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
202                            const DenseSet<Register> &ARegUses,
203                            const MachineInstr &A, const MachineInstr &B) const;
204   static bool dmasksCanBeCombined(const CombineInfo &CI,
205                                   const SIInstrInfo &TII,
206                                   const CombineInfo &Paired);
207   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
208                                    CombineInfo &Paired, bool Modify = false);
209   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
210                         const CombineInfo &Paired);
211   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
212   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
213                                                      const CombineInfo &Paired);
214   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
215                                                     const CombineInfo &Paired);
216   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
217 
218   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
219 
220   unsigned read2Opcode(unsigned EltSize) const;
221   unsigned read2ST64Opcode(unsigned EltSize) const;
222   MachineBasicBlock::iterator
223   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
224                  MachineBasicBlock::iterator InsertBefore);
225 
226   unsigned write2Opcode(unsigned EltSize) const;
227   unsigned write2ST64Opcode(unsigned EltSize) const;
228   MachineBasicBlock::iterator
229   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
230                   MachineBasicBlock::iterator InsertBefore);
231   MachineBasicBlock::iterator
232   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
233                  MachineBasicBlock::iterator InsertBefore);
234   MachineBasicBlock::iterator
235   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
236                           MachineBasicBlock::iterator InsertBefore);
237   MachineBasicBlock::iterator
238   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
239                       MachineBasicBlock::iterator InsertBefore);
240   MachineBasicBlock::iterator
241   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
242                        MachineBasicBlock::iterator InsertBefore);
243   MachineBasicBlock::iterator
244   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
245                        MachineBasicBlock::iterator InsertBefore);
246   MachineBasicBlock::iterator
247   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
248                         MachineBasicBlock::iterator InsertBefore);
249   MachineBasicBlock::iterator
250   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
251                     MachineBasicBlock::iterator InsertBefore);
252   MachineBasicBlock::iterator
253   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
254                      MachineBasicBlock::iterator InsertBefore);
255 
256   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
257                            int32_t NewOffset) const;
258   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
259   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
260   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
261   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from nearby instructions that allows it to have a
  /// 13-bit constant offset, which is then promoted to the immediate.
265   bool promoteConstantOffsetToImm(MachineInstr &CI,
266                                   MemInfoMap &Visited,
267                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
268   void addInstToMergeableList(const CombineInfo &CI,
269                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
270 
271   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
272       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
273       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
274       std::list<std::list<CombineInfo>> &MergeableInsts) const;
275 
276   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
277                                                      const CombineInfo &Paired);
278 
279   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
280                                           const CombineInfo &Paired);
281 
282 public:
283   static char ID;
284 
285   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
286     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
287   }
288 
289   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
290                                      bool &OptimizeListAgain);
291   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
292 
293   bool runOnMachineFunction(MachineFunction &MF) override;
294 
295   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
296 
297   void getAnalysisUsage(AnalysisUsage &AU) const override {
298     AU.setPreservesCFG();
299     AU.addRequired<AAResultsWrapperPass>();
300 
301     MachineFunctionPass::getAnalysisUsage(AU);
302   }
303 
304   MachineFunctionProperties getRequiredProperties() const override {
305     return MachineFunctionProperties()
306       .set(MachineFunctionProperties::Property::IsSSA);
307   }
308 };
309 
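// Return the access width used for merging: the number of components/dwords
// accessed, or the number of enabled dmask components for MIMG. Returns 0 for
// opcodes this pass does not handle.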
310 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
311   const unsigned Opc = MI.getOpcode();
312 
313   if (TII.isMUBUF(Opc)) {
314     // FIXME: Handle d16 correctly
315     return AMDGPU::getMUBUFElements(Opc);
316   }
317   if (TII.isMIMG(MI)) {
318     uint64_t DMaskImm =
319         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
320     return countPopulation(DMaskImm);
321   }
322   if (TII.isMTBUF(Opc)) {
323     return AMDGPU::getMTBUFElements(Opc);
324   }
325 
326   switch (Opc) {
327   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
328   case AMDGPU::GLOBAL_LOAD_DWORD:
329   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
330   case AMDGPU::GLOBAL_STORE_DWORD:
331   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
332   case AMDGPU::FLAT_LOAD_DWORD:
333   case AMDGPU::FLAT_STORE_DWORD:
334     return 1;
335   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
336   case AMDGPU::GLOBAL_LOAD_DWORDX2:
337   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
338   case AMDGPU::GLOBAL_STORE_DWORDX2:
339   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
340   case AMDGPU::FLAT_LOAD_DWORDX2:
341   case AMDGPU::FLAT_STORE_DWORDX2:
342     return 2;
343   case AMDGPU::GLOBAL_LOAD_DWORDX3:
344   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
345   case AMDGPU::GLOBAL_STORE_DWORDX3:
346   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
347   case AMDGPU::FLAT_LOAD_DWORDX3:
348   case AMDGPU::FLAT_STORE_DWORDX3:
349     return 3;
350   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
351   case AMDGPU::GLOBAL_LOAD_DWORDX4:
352   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
353   case AMDGPU::GLOBAL_STORE_DWORDX4:
354   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
355   case AMDGPU::FLAT_LOAD_DWORDX4:
356   case AMDGPU::FLAT_STORE_DWORDX4:
357     return 4;
358   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
359     return 8;
360   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
361   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
362   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
363   case AMDGPU::DS_WRITE_B32_gfx9:
364     return 1;
365   case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
366   case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
367   case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
368   case AMDGPU::DS_WRITE_B64_gfx9:
369     return 2;
370   default:
371     return 0;
372   }
373 }
374 
375 /// Maps instruction opcode to enum InstClassEnum.
376 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
377   switch (Opc) {
378   default:
379     if (TII.isMUBUF(Opc)) {
380       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
381       default:
382         return UNKNOWN;
383       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
384       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
385       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
386       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
387         return BUFFER_LOAD;
388       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
389       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
390       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
391       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
392         return BUFFER_STORE;
393       }
394     }
395     if (TII.isMIMG(Opc)) {
396       // Ignore instructions encoded without vaddr.
397       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
398           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
399         return UNKNOWN;
400       // Ignore BVH instructions
401       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
402         return UNKNOWN;
403       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
404       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
405           TII.isGather4(Opc))
406         return UNKNOWN;
407       return MIMG;
408     }
409     if (TII.isMTBUF(Opc)) {
410       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
411       default:
412         return UNKNOWN;
413       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
414       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
415       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
416       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
417         return TBUFFER_LOAD;
418       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
419       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
420       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
421       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
422         return TBUFFER_STORE;
423       }
424     }
425     return UNKNOWN;
426   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
427   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
428   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
429   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
430     return S_BUFFER_LOAD_IMM;
431   case AMDGPU::DS_READ_B32:
432   case AMDGPU::DS_READ_B32_gfx9:
433   case AMDGPU::DS_READ_B64:
434   case AMDGPU::DS_READ_B64_gfx9:
435     return DS_READ;
436   case AMDGPU::DS_WRITE_B32:
437   case AMDGPU::DS_WRITE_B32_gfx9:
438   case AMDGPU::DS_WRITE_B64:
439   case AMDGPU::DS_WRITE_B64_gfx9:
440     return DS_WRITE;
441   case AMDGPU::GLOBAL_LOAD_DWORD:
442   case AMDGPU::GLOBAL_LOAD_DWORDX2:
443   case AMDGPU::GLOBAL_LOAD_DWORDX3:
444   case AMDGPU::GLOBAL_LOAD_DWORDX4:
445   case AMDGPU::FLAT_LOAD_DWORD:
446   case AMDGPU::FLAT_LOAD_DWORDX2:
447   case AMDGPU::FLAT_LOAD_DWORDX3:
448   case AMDGPU::FLAT_LOAD_DWORDX4:
449     return FLAT_LOAD;
450   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
451   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
452   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
453   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
454     return GLOBAL_LOAD_SADDR;
455   case AMDGPU::GLOBAL_STORE_DWORD:
456   case AMDGPU::GLOBAL_STORE_DWORDX2:
457   case AMDGPU::GLOBAL_STORE_DWORDX3:
458   case AMDGPU::GLOBAL_STORE_DWORDX4:
459   case AMDGPU::FLAT_STORE_DWORD:
460   case AMDGPU::FLAT_STORE_DWORDX2:
461   case AMDGPU::FLAT_STORE_DWORDX3:
462   case AMDGPU::FLAT_STORE_DWORDX4:
463     return FLAT_STORE;
464   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
465   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
466   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
467   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
468     return GLOBAL_STORE_SADDR;
469   }
470 }
471 
472 /// Determines instruction subclass from opcode. Only instructions
473 /// of the same subclass can be merged together. The merged instruction may have
474 /// a different subclass but must have the same class.
475 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
476   switch (Opc) {
477   default:
478     if (TII.isMUBUF(Opc))
479       return AMDGPU::getMUBUFBaseOpcode(Opc);
480     if (TII.isMIMG(Opc)) {
481       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
482       assert(Info);
483       return Info->BaseOpcode;
484     }
485     if (TII.isMTBUF(Opc))
486       return AMDGPU::getMTBUFBaseOpcode(Opc);
487     return -1;
488   case AMDGPU::DS_READ_B32:
489   case AMDGPU::DS_READ_B32_gfx9:
490   case AMDGPU::DS_READ_B64:
491   case AMDGPU::DS_READ_B64_gfx9:
492   case AMDGPU::DS_WRITE_B32:
493   case AMDGPU::DS_WRITE_B32_gfx9:
494   case AMDGPU::DS_WRITE_B64:
495   case AMDGPU::DS_WRITE_B64_gfx9:
496     return Opc;
497   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
498   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
499   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
500   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
501     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
502   case AMDGPU::GLOBAL_LOAD_DWORD:
503   case AMDGPU::GLOBAL_LOAD_DWORDX2:
504   case AMDGPU::GLOBAL_LOAD_DWORDX3:
505   case AMDGPU::GLOBAL_LOAD_DWORDX4:
506   case AMDGPU::FLAT_LOAD_DWORD:
507   case AMDGPU::FLAT_LOAD_DWORDX2:
508   case AMDGPU::FLAT_LOAD_DWORDX3:
509   case AMDGPU::FLAT_LOAD_DWORDX4:
510     return AMDGPU::FLAT_LOAD_DWORD;
511   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
512   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
513   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
514   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
515     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
516   case AMDGPU::GLOBAL_STORE_DWORD:
517   case AMDGPU::GLOBAL_STORE_DWORDX2:
518   case AMDGPU::GLOBAL_STORE_DWORDX3:
519   case AMDGPU::GLOBAL_STORE_DWORDX4:
520   case AMDGPU::FLAT_STORE_DWORD:
521   case AMDGPU::FLAT_STORE_DWORDX2:
522   case AMDGPU::FLAT_STORE_DWORDX3:
523   case AMDGPU::FLAT_STORE_DWORDX4:
524     return AMDGPU::FLAT_STORE_DWORD;
525   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
526   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
527   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
528   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
529     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
530   }
531 }
532 
// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions, return the original class unmodified.
539 InstClassEnum
540 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
541                                          const CombineInfo &Paired) {
542   assert(CI.InstClass == Paired.InstClass);
543 
544   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
545       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
546     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
547 
548   return CI.InstClass;
549 }
550 
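// Determine which kinds of address operands (vaddr, srsrc, soffset, saddr,
// addr, sbase, ssamp) the given opcode carries.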
551 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
552   AddressRegs Result;
553 
554   if (TII.isMUBUF(Opc)) {
555     if (AMDGPU::getMUBUFHasVAddr(Opc))
556       Result.VAddr = true;
557     if (AMDGPU::getMUBUFHasSrsrc(Opc))
558       Result.SRsrc = true;
559     if (AMDGPU::getMUBUFHasSoffset(Opc))
560       Result.SOffset = true;
561 
562     return Result;
563   }
564 
565   if (TII.isMIMG(Opc)) {
566     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
567     if (VAddr0Idx >= 0) {
568       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
569       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
570     } else {
571       Result.VAddr = true;
572     }
573     Result.SRsrc = true;
574     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
575     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
576       Result.SSamp = true;
577 
578     return Result;
579   }
580   if (TII.isMTBUF(Opc)) {
581     if (AMDGPU::getMTBUFHasVAddr(Opc))
582       Result.VAddr = true;
583     if (AMDGPU::getMTBUFHasSrsrc(Opc))
584       Result.SRsrc = true;
585     if (AMDGPU::getMTBUFHasSoffset(Opc))
586       Result.SOffset = true;
587 
588     return Result;
589   }
590 
591   switch (Opc) {
592   default:
593     return Result;
594   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
595   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
596   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
597   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
598     Result.SBase = true;
599     return Result;
600   case AMDGPU::DS_READ_B32:
601   case AMDGPU::DS_READ_B64:
602   case AMDGPU::DS_READ_B32_gfx9:
603   case AMDGPU::DS_READ_B64_gfx9:
604   case AMDGPU::DS_WRITE_B32:
605   case AMDGPU::DS_WRITE_B64:
606   case AMDGPU::DS_WRITE_B32_gfx9:
607   case AMDGPU::DS_WRITE_B64_gfx9:
608     Result.Addr = true;
609     return Result;
610   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
611   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
612   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
613   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
614   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
615   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
616   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
617   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
618     Result.SAddr = true;
619     LLVM_FALLTHROUGH;
620   case AMDGPU::GLOBAL_LOAD_DWORD:
621   case AMDGPU::GLOBAL_LOAD_DWORDX2:
622   case AMDGPU::GLOBAL_LOAD_DWORDX3:
623   case AMDGPU::GLOBAL_LOAD_DWORDX4:
624   case AMDGPU::GLOBAL_STORE_DWORD:
625   case AMDGPU::GLOBAL_STORE_DWORDX2:
626   case AMDGPU::GLOBAL_STORE_DWORDX3:
627   case AMDGPU::GLOBAL_STORE_DWORDX4:
628   case AMDGPU::FLAT_LOAD_DWORD:
629   case AMDGPU::FLAT_LOAD_DWORDX2:
630   case AMDGPU::FLAT_LOAD_DWORDX3:
631   case AMDGPU::FLAT_LOAD_DWORDX4:
632   case AMDGPU::FLAT_STORE_DWORD:
633   case AMDGPU::FLAT_STORE_DWORDX2:
634   case AMDGPU::FLAT_STORE_DWORDX3:
635   case AMDGPU::FLAT_STORE_DWORDX4:
636     Result.VAddr = true;
637     return Result;
638   }
639 }
640 
641 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
642                                               const SILoadStoreOptimizer &LSO) {
643   I = MI;
644   unsigned Opc = MI->getOpcode();
645   InstClass = getInstClass(Opc, *LSO.TII);
646 
647   if (InstClass == UNKNOWN)
648     return;
649 
650   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
651 
652   switch (InstClass) {
653   case DS_READ:
654    EltSize =
655           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
656                                                                           : 4;
657    break;
658   case DS_WRITE:
659     EltSize =
660           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
661                                                                             : 4;
662     break;
663   case S_BUFFER_LOAD_IMM:
664     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
665     break;
666   default:
667     EltSize = 4;
668     break;
669   }
670 
671   if (InstClass == MIMG) {
672     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
673     // Offset is not considered for MIMG instructions.
674     Offset = 0;
675   } else {
676     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
677     Offset = I->getOperand(OffsetIdx).getImm();
678   }
679 
680   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
681     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
682 
683   Width = getOpcodeWidth(*I, *LSO.TII);
684 
685   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
686     Offset &= 0xffff;
687   } else if (InstClass != MIMG) {
688     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
689   }
690 
691   AddressRegs Regs = getRegs(Opc, *LSO.TII);
692 
693   NumAddresses = 0;
694   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
695     AddrIdx[NumAddresses++] =
696         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
697   if (Regs.Addr)
698     AddrIdx[NumAddresses++] =
699         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
700   if (Regs.SBase)
701     AddrIdx[NumAddresses++] =
702         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
703   if (Regs.SRsrc)
704     AddrIdx[NumAddresses++] =
705         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
706   if (Regs.SOffset)
707     AddrIdx[NumAddresses++] =
708         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
709   if (Regs.SAddr)
710     AddrIdx[NumAddresses++] =
711         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
712   if (Regs.VAddr)
713     AddrIdx[NumAddresses++] =
714         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
715   if (Regs.SSamp)
716     AddrIdx[NumAddresses++] =
717         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
718   assert(NumAddresses <= MaxAddressRegs);
719 
720   for (unsigned J = 0; J < NumAddresses; J++)
721     AddrReg[J] = &I->getOperand(AddrIdx[J]);
722 }
723 
724 } // end anonymous namespace.
725 
726 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
727                       "SI Load Store Optimizer", false, false)
728 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
729 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
730                     false, false)
731 
732 char SILoadStoreOptimizer::ID = 0;
733 
734 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
735 
736 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
737   return new SILoadStoreOptimizer();
738 }
739 
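// Collect the registers defined and the registers read by \p MI into
// \p RegDefs and \p RegUses respectively.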
740 static void addDefsUsesToList(const MachineInstr &MI,
741                               DenseSet<Register> &RegDefs,
742                               DenseSet<Register> &RegUses) {
743   for (const auto &Op : MI.operands()) {
744     if (!Op.isReg())
745       continue;
746     if (Op.isDef())
747       RegDefs.insert(Op.getReg());
748     if (Op.readsReg())
749       RegUses.insert(Op.getReg());
750   }
751 }
752 
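// Return true if instruction \p A (with register defs \p ARegDefs and uses
// \p ARegUses) can be moved past instruction \p B: the two must not be
// potentially aliasing memory accesses when either may store, and \p B must
// not touch a register \p A defines nor define a register \p A reads.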
753 bool SILoadStoreOptimizer::canSwapInstructions(
754     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
755     const MachineInstr &A, const MachineInstr &B) const {
756   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
757       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
758     return false;
759   for (const auto &BOp : B.operands()) {
760     if (!BOp.isReg())
761       continue;
762     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
763       return false;
764     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
765       return false;
766   }
767   return true;
768 }
769 
// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
772 MachineMemOperand *
773 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
774                                                const CombineInfo &Paired) {
775   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
776   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
777 
778   unsigned Size = MMOa->getSize() + MMOb->getSize();
779 
  // The base pointer for the combined operation is the same as the leading
  // operation's pointer.
782   if (Paired < CI)
783     std::swap(MMOa, MMOb);
784 
785   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
786   // If merging FLAT and GLOBAL set address space to FLAT.
787   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
788     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
789 
790   MachineFunction *MF = CI.I->getMF();
791   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
792 }
793 
794 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
795                                                const SIInstrInfo &TII,
796                                                const CombineInfo &Paired) {
797   assert(CI.InstClass == MIMG);
798 
799   // Ignore instructions with tfe/lwe set.
800   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
801   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
802 
803   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
804     return false;
805 
806   // Check other optional immediate operands for equality.
807   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
808                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
809                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
810 
811   for (auto op : OperandsToMatch) {
812     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
813     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
814       return false;
815     if (Idx != -1 &&
816         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
817       return false;
818   }
819 
820   // Check DMask for overlaps.
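  // The pair is only mergeable when every bit of the smaller dmask lies
  // strictly below the lowest set bit of the larger one; e.g. dmasks 0b0011
  // and 0b1100 can be combined, whereas 0b0011 and 0b0110 cannot.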
821   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
822   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
823 
824   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
825   if ((1u << AllowedBitsForMin) <= MinMask)
826     return false;
827 
828   return true;
829 }
830 
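// Look up the buffer format that has the same bits-per-component and numeric
// format as \p OldFormat but \p ComponentCount components. Returns 0 if no
// such format exists or more than 4 components are requested.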
831 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
832                                        unsigned ComponentCount,
833                                        const GCNSubtarget &STI) {
834   if (ComponentCount > 4)
835     return 0;
836 
837   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
838       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
839   if (!OldFormatInfo)
840     return 0;
841 
842   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
843       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
844                                            ComponentCount,
845                                            OldFormatInfo->NumFormat, STI);
846 
847   if (!NewFormatInfo)
848     return 0;
849 
850   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
851          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
852 
853   return NewFormatInfo->Format;
854 }
855 
// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
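// For example, mostAlignedValueInRange(0x81, 0x10f) returns 0x100: it is the
// only value in that range aligned to 256, and nothing in the range is
// aligned to 512 or more.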
862 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
863   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
864 }
865 
866 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
867                                                 const GCNSubtarget &STI,
868                                                 CombineInfo &Paired,
869                                                 bool Modify) {
870   assert(CI.InstClass != MIMG);
871 
872   // XXX - Would the same offset be OK? Is there any reason this would happen or
873   // be useful?
874   if (CI.Offset == Paired.Offset)
875     return false;
876 
877   // This won't be valid if the offset isn't aligned.
878   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
879     return false;
880 
881   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
882 
883     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
884         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
885     if (!Info0)
886       return false;
887     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
888         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
889     if (!Info1)
890       return false;
891 
892     if (Info0->BitsPerComp != Info1->BitsPerComp ||
893         Info0->NumFormat != Info1->NumFormat)
894       return false;
895 
896     // TODO: Should be possible to support more formats, but if format loads
897     // are not dword-aligned, the merged load might not be valid.
898     if (Info0->BitsPerComp != 32)
899       return false;
900 
901     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
902       return false;
903   }
904 
905   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
906   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
907   CI.UseST64 = false;
908   CI.BaseOff = 0;
909 
910   // Handle all non-DS instructions.
911   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
912     return (EltOffset0 + CI.Width == EltOffset1 ||
913             EltOffset1 + Paired.Width == EltOffset0) &&
914            CI.CPol == Paired.CPol;
915   }
916 
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
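  // E.g. element offsets 0 and 6400 are encoded as offset0:0 and offset1:100
  // in the ST64 form (6400 / 64 == 100 fits in 8 bits).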
919   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
920       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
921     if (Modify) {
922       CI.Offset = EltOffset0 / 64;
923       Paired.Offset = EltOffset1 / 64;
924       CI.UseST64 = true;
925     }
926     return true;
927   }
928 
929   // Check if the new offsets fit in the reduced 8-bit range.
930   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
931     if (Modify) {
932       CI.Offset = EltOffset0;
933       Paired.Offset = EltOffset1;
934     }
935     return true;
936   }
937 
938   // Try to shift base address to decrease offsets.
939   uint32_t Min = std::min(EltOffset0, EltOffset1);
940   uint32_t Max = std::max(EltOffset0, EltOffset1);
941 
942   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
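  // The offsets can share a rebased ST64 encoding if their difference is a
  // multiple of 64 no larger than 255 * 64.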
943   if (((Max - Min) & ~Mask) == 0) {
944     if (Modify) {
945       // From the range of values we could use for BaseOff, choose the one that
946       // is aligned to the highest power of two, to maximise the chance that
947       // the same offset can be reused for other load/store pairs.
948       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
949       // Copy the low bits of the offsets, so that when we adjust them by
950       // subtracting BaseOff they will be multiples of 64.
951       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
952       CI.BaseOff = BaseOff * CI.EltSize;
953       CI.Offset = (EltOffset0 - BaseOff) / 64;
954       Paired.Offset = (EltOffset1 - BaseOff) / 64;
955       CI.UseST64 = true;
956     }
957     return true;
958   }
959 
960   if (isUInt<8>(Max - Min)) {
961     if (Modify) {
962       // From the range of values we could use for BaseOff, choose the one that
963       // is aligned to the highest power of two, to maximise the chance that
964       // the same offset can be reused for other load/store pairs.
965       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
966       CI.BaseOff = BaseOff * CI.EltSize;
967       CI.Offset = EltOffset0 - BaseOff;
968       Paired.Offset = EltOffset1 - BaseOff;
969     }
970     return true;
971   }
972 
973   return false;
974 }
975 
976 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
977                                      const CombineInfo &CI,
978                                      const CombineInfo &Paired) {
979   const unsigned Width = (CI.Width + Paired.Width);
980   switch (CI.InstClass) {
981   default:
982     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
983   case S_BUFFER_LOAD_IMM:
984     switch (Width) {
985     default:
986       return false;
987     case 2:
988     case 4:
989     case 8:
990       return true;
991     }
992   }
993 }
994 
995 const TargetRegisterClass *
996 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
997   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
998     return TRI->getRegClassForReg(*MRI, Dst->getReg());
999   }
1000   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1001     return TRI->getRegClassForReg(*MRI, Src->getReg());
1002   }
1003   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1004     return TRI->getRegClassForReg(*MRI, Src->getReg());
1005   }
1006   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1007     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1008   }
1009   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1010     return TRI->getRegClassForReg(*MRI, Src->getReg());
1011   }
1012   return nullptr;
1013 }
1014 
1015 /// This function assumes that CI comes before Paired in a basic block. Return
1016 /// an insertion point for the merged instruction or nullptr on failure.
1017 SILoadStoreOptimizer::CombineInfo *
1018 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1019                                            CombineInfo &Paired) {
1020   // If another instruction has already been merged into CI, it may now be a
1021   // type that we can't do any further merging into.
1022   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1023     return nullptr;
1024   assert(CI.InstClass == Paired.InstClass);
1025 
1026   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1027       getInstSubclass(Paired.I->getOpcode(), *TII))
1028     return nullptr;
1029 
  // Check that both offsets (or dmasks for MIMG) can be combined and fit in
  // the reduced range.
1032   if (CI.InstClass == MIMG) {
1033     if (!dmasksCanBeCombined(CI, *TII, Paired))
1034       return nullptr;
1035   } else {
1036     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1037       return nullptr;
1038   }
1039 
1040   DenseSet<Register> RegDefs;
1041   DenseSet<Register> RegUses;
1042   CombineInfo *Where;
1043   if (CI.I->mayLoad()) {
1044     // Try to hoist Paired up to CI.
1045     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1046     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1047       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1048         return nullptr;
1049     }
1050     Where = &CI;
1051   } else {
1052     // Try to sink CI down to Paired.
1053     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1054     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1055       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1056         return nullptr;
1057     }
1058     Where = &Paired;
1059   }
1060 
1061   // Call offsetsCanBeCombined with modify = true so that the offsets are
1062   // correct for the new instruction.  This should return true, because
1063   // this function should only be called on CombineInfo objects that
1064   // have already been confirmed to be mergeable.
1065   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1066     offsetsCanBeCombined(CI, *STM, Paired, true);
1067   return Where;
1068 }
1069 
1070 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1071   if (STM->ldsRequiresM0Init())
1072     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1073   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1074 }
1075 
1076 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1077   if (STM->ldsRequiresM0Init())
1078     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1079 
1080   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1081                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1082 }
1083 
1084 MachineBasicBlock::iterator
1085 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1086                                      MachineBasicBlock::iterator InsertBefore) {
1087   MachineBasicBlock *MBB = CI.I->getParent();
1088 
1089   // Be careful, since the addresses could be subregisters themselves in weird
1090   // cases, like vectors of pointers.
1091   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1092 
1093   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1094   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1095 
1096   unsigned NewOffset0 = CI.Offset;
1097   unsigned NewOffset1 = Paired.Offset;
1098   unsigned Opc =
1099       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1100 
1101   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1102   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1103 
1104   if (NewOffset0 > NewOffset1) {
1105     // Canonicalize the merged instruction so the smaller offset comes first.
1106     std::swap(NewOffset0, NewOffset1);
1107     std::swap(SubRegIdx0, SubRegIdx1);
1108   }
1109 
1110   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1111          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1112 
1113   const MCInstrDesc &Read2Desc = TII->get(Opc);
1114 
1115   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1116   Register DestReg = MRI->createVirtualRegister(SuperRC);
1117 
1118   DebugLoc DL = CI.I->getDebugLoc();
1119 
1120   Register BaseReg = AddrReg->getReg();
1121   unsigned BaseSubReg = AddrReg->getSubReg();
1122   unsigned BaseRegFlags = 0;
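  // If offsetsCanBeCombined picked a non-zero base offset, materialize it and
  // add it to the address so the rebased offsets fit the 8-bit encoding.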
1123   if (CI.BaseOff) {
1124     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1125     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1126         .addImm(CI.BaseOff);
1127 
1128     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1129     BaseRegFlags = RegState::Kill;
1130 
1131     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1132         .addReg(ImmReg)
1133         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1134         .addImm(0); // clamp bit
1135     BaseSubReg = 0;
1136   }
1137 
1138   MachineInstrBuilder Read2 =
1139       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1140           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1141           .addImm(NewOffset0)                        // offset0
1142           .addImm(NewOffset1)                        // offset1
1143           .addImm(0)                                 // gds
1144           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1145 
1146   (void)Read2;
1147 
1148   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1149 
1150   // Copy to the old destination registers.
1151   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1152       .add(*Dest0) // Copy to same destination including flags and sub reg.
1153       .addReg(DestReg, 0, SubRegIdx0);
1154   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1155       .add(*Dest1)
1156       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1157 
1158   CI.I->eraseFromParent();
1159   Paired.I->eraseFromParent();
1160 
1161   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1162   return Read2;
1163 }
1164 
1165 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1166   if (STM->ldsRequiresM0Init())
1167     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1168   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1169                         : AMDGPU::DS_WRITE2_B64_gfx9;
1170 }
1171 
1172 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1173   if (STM->ldsRequiresM0Init())
1174     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1175                           : AMDGPU::DS_WRITE2ST64_B64;
1176 
1177   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1178                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1179 }
1180 
1181 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1182     CombineInfo &CI, CombineInfo &Paired,
1183     MachineBasicBlock::iterator InsertBefore) {
1184   MachineBasicBlock *MBB = CI.I->getParent();
1185 
  // Be sure to use .add(), and not .addReg(), with these. We want to be sure
  // we preserve the subregister index and any register flags set on them.
1188   const MachineOperand *AddrReg =
1189       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1190   const MachineOperand *Data0 =
1191       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1192   const MachineOperand *Data1 =
1193       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1194 
1195   unsigned NewOffset0 = CI.Offset;
1196   unsigned NewOffset1 = Paired.Offset;
1197   unsigned Opc =
1198       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1199 
1200   if (NewOffset0 > NewOffset1) {
1201     // Canonicalize the merged instruction so the smaller offset comes first.
1202     std::swap(NewOffset0, NewOffset1);
1203     std::swap(Data0, Data1);
1204   }
1205 
1206   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1207          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1208 
1209   const MCInstrDesc &Write2Desc = TII->get(Opc);
1210   DebugLoc DL = CI.I->getDebugLoc();
1211 
1212   Register BaseReg = AddrReg->getReg();
1213   unsigned BaseSubReg = AddrReg->getSubReg();
1214   unsigned BaseRegFlags = 0;
1215   if (CI.BaseOff) {
1216     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1217     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1218         .addImm(CI.BaseOff);
1219 
1220     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1221     BaseRegFlags = RegState::Kill;
1222 
1223     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1224         .addReg(ImmReg)
1225         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1226         .addImm(0); // clamp bit
1227     BaseSubReg = 0;
1228   }
1229 
1230   MachineInstrBuilder Write2 =
1231       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1232           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1233           .add(*Data0)                               // data0
1234           .add(*Data1)                               // data1
1235           .addImm(NewOffset0)                        // offset0
1236           .addImm(NewOffset1)                        // offset1
1237           .addImm(0)                                 // gds
1238           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1239 
1240   CI.I->eraseFromParent();
1241   Paired.I->eraseFromParent();
1242 
1243   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1244   return Write2;
1245 }
1246 
1247 MachineBasicBlock::iterator
1248 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1249                                      MachineBasicBlock::iterator InsertBefore) {
1250   MachineBasicBlock *MBB = CI.I->getParent();
1251   DebugLoc DL = CI.I->getDebugLoc();
1252   const unsigned Opcode = getNewOpcode(CI, Paired);
1253 
1254   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1255 
1256   Register DestReg = MRI->createVirtualRegister(SuperRC);
1257   unsigned MergedDMask = CI.DMask | Paired.DMask;
1258   unsigned DMaskIdx =
1259       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1260 
1261   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1262   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1263     if (I == DMaskIdx)
1264       MIB.addImm(MergedDMask);
1265     else
1266       MIB.add((*CI.I).getOperand(I));
1267   }
1268 
1269   // It shouldn't be possible to get this far if the two instructions
1270   // don't have a single memoperand, because MachineInstr::mayAlias()
1271   // will return true if this is the case.
1272   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1273 
1274   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1275 
1276   unsigned SubRegIdx0, SubRegIdx1;
1277   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1278 
1279   // Copy to the old destination registers.
1280   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1281   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1282   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1283 
1284   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1285       .add(*Dest0) // Copy to same destination including flags and sub reg.
1286       .addReg(DestReg, 0, SubRegIdx0);
1287   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1288       .add(*Dest1)
1289       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1290 
1291   CI.I->eraseFromParent();
1292   Paired.I->eraseFromParent();
1293   return New;
1294 }
1295 
1296 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1297     CombineInfo &CI, CombineInfo &Paired,
1298     MachineBasicBlock::iterator InsertBefore) {
1299   MachineBasicBlock *MBB = CI.I->getParent();
1300   DebugLoc DL = CI.I->getDebugLoc();
1301   const unsigned Opcode = getNewOpcode(CI, Paired);
1302 
1303   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1304 
1305   Register DestReg = MRI->createVirtualRegister(SuperRC);
1306   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1307 
1308   // It shouldn't be possible to get this far if the two instructions
1309   // don't have a single memoperand, because MachineInstr::mayAlias()
1310   // will return true if this is the case.
1311   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1312 
1313   MachineInstr *New =
1314       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1315           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1316           .addImm(MergedOffset) // offset
1317           .addImm(CI.CPol)      // cpol
1318           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1319 
1320   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1321   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1322   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1323 
1324   // Copy to the old destination registers.
1325   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1326   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1327   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1328 
1329   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1330       .add(*Dest0) // Copy to same destination including flags and sub reg.
1331       .addReg(DestReg, 0, SubRegIdx0);
1332   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1333       .add(*Dest1)
1334       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1335 
1336   CI.I->eraseFromParent();
1337   Paired.I->eraseFromParent();
1338   return New;
1339 }
1340 
1341 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1342     CombineInfo &CI, CombineInfo &Paired,
1343     MachineBasicBlock::iterator InsertBefore) {
1344   MachineBasicBlock *MBB = CI.I->getParent();
1345   DebugLoc DL = CI.I->getDebugLoc();
1346 
1347   const unsigned Opcode = getNewOpcode(CI, Paired);
1348 
1349   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1350 
  // Create the merged destination register.
1352   Register DestReg = MRI->createVirtualRegister(SuperRC);
1353   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1354 
1355   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1356 
1357   AddressRegs Regs = getRegs(Opcode, *TII);
1358 
1359   if (Regs.VAddr)
1360     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1361 
1362   // It shouldn't be possible to get this far if the two instructions
1363   // don't have a single memoperand, because MachineInstr::mayAlias()
1364   // will return true if this is the case.
1365   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1366 
1367   MachineInstr *New =
1368     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1369         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1370         .addImm(MergedOffset) // offset
1371         .addImm(CI.CPol)      // cpol
1372         .addImm(0)            // tfe
1373         .addImm(0)            // swz
1374         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1375 
1376   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1377   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1378   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1379 
1380   // Copy to the old destination registers.
1381   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1382   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1383   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1384 
1385   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1386       .add(*Dest0) // Copy to same destination including flags and sub reg.
1387       .addReg(DestReg, 0, SubRegIdx0);
1388   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1389       .add(*Dest1)
1390       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1391 
1392   CI.I->eraseFromParent();
1393   Paired.I->eraseFromParent();
1394   return New;
1395 }
1396 
1397 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1398     CombineInfo &CI, CombineInfo &Paired,
1399     MachineBasicBlock::iterator InsertBefore) {
1400   MachineBasicBlock *MBB = CI.I->getParent();
1401   DebugLoc DL = CI.I->getDebugLoc();
1402 
1403   const unsigned Opcode = getNewOpcode(CI, Paired);
1404 
1405   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1406 
  // Create the merged destination register.
1408   Register DestReg = MRI->createVirtualRegister(SuperRC);
1409   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1410 
1411   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1412 
1413   AddressRegs Regs = getRegs(Opcode, *TII);
1414 
1415   if (Regs.VAddr)
1416     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1417 
1418   unsigned JoinedFormat =
1419       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1420 
1421   // It shouldn't be possible to get this far if the two instructions
1422   // don't have a single memoperand, because MachineInstr::mayAlias()
1423   // will return true if this is the case.
1424   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1425 
1426   MachineInstr *New =
1427       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1428           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1429           .addImm(MergedOffset) // offset
1430           .addImm(JoinedFormat) // format
1431           .addImm(CI.CPol)      // cpol
1432           .addImm(0)            // tfe
1433           .addImm(0)            // swz
1434           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1435 
1436   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1437   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1438   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1439 
1440   // Copy to the old destination registers.
1441   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1442   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1443   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1444 
1445   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1446       .add(*Dest0) // Copy to same destination including flags and sub reg.
1447       .addReg(DestReg, 0, SubRegIdx0);
1448   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1449       .add(*Dest1)
1450       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1451 
1452   CI.I->eraseFromParent();
1453   Paired.I->eraseFromParent();
1454   return New;
1455 }
1456 
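// Merge a pair of MTBUF (typed buffer) stores: pack the two data operands
// into a superregister with a REG_SEQUENCE and emit a single wider store
// using a joined buffer format.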
1457 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1458     CombineInfo &CI, CombineInfo &Paired,
1459     MachineBasicBlock::iterator InsertBefore) {
1460   MachineBasicBlock *MBB = CI.I->getParent();
1461   DebugLoc DL = CI.I->getDebugLoc();
1462 
1463   const unsigned Opcode = getNewOpcode(CI, Paired);
1464 
1465   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1466   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1467   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1468 
1469   // Copy to the new source register.
1470   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1471   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1472 
1473   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1474   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1475 
1476   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1477       .add(*Src0)
1478       .addImm(SubRegIdx0)
1479       .add(*Src1)
1480       .addImm(SubRegIdx1);
1481 
1482   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1483                  .addReg(SrcReg, RegState::Kill);
1484 
1485   AddressRegs Regs = getRegs(Opcode, *TII);
1486 
1487   if (Regs.VAddr)
1488     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1489 
1490   unsigned JoinedFormat =
1491       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1492 
1493   // It shouldn't be possible to get this far if the two instructions
1494   // don't have a single memoperand, because MachineInstr::mayAlias()
1495   // will return true if this is the case.
1496   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1497 
1498   MachineInstr *New =
1499       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1500           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
1506           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1507 
1508   CI.I->eraseFromParent();
1509   Paired.I->eraseFromParent();
1510   return New;
1511 }
1512 
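// Merge a pair of FLAT or GLOBAL loads (with an optional SGPR base, saddr)
// into one wider load, e.g. (illustrative, as it would look after register
// allocation):
//  global_load_dword v0, v[2:3], off offset:4
//  global_load_dword v1, v[2:3], off offset:8
// ==>
//  global_load_dwordx2 v[0:1], v[2:3], off offset:4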
1513 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1514     CombineInfo &CI, CombineInfo &Paired,
1515     MachineBasicBlock::iterator InsertBefore) {
1516   MachineBasicBlock *MBB = CI.I->getParent();
1517   DebugLoc DL = CI.I->getDebugLoc();
1518 
1519   const unsigned Opcode = getNewOpcode(CI, Paired);
1520 
1521   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1522   Register DestReg = MRI->createVirtualRegister(SuperRC);
1523 
1524   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1525 
1526   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1527     MIB.add(*SAddr);
1528 
1529   MachineInstr *New =
1530     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1531        .addImm(std::min(CI.Offset, Paired.Offset))
1532        .addImm(CI.CPol)
1533        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1534 
1535   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1536   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1537   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1538 
1539   // Copy to the old destination registers.
1540   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1541   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1542   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1543 
1544   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1545       .add(*Dest0) // Copy to same destination including flags and sub reg.
1546       .addReg(DestReg, 0, SubRegIdx0);
1547   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1548       .add(*Dest1)
1549       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1550 
1551   CI.I->eraseFromParent();
1552   Paired.I->eraseFromParent();
1553   return New;
1554 }
1555 
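// Merge a pair of FLAT or GLOBAL stores: pack the two data operands into a
// superregister with a REG_SEQUENCE and emit a single wider store.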
1556 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1557     CombineInfo &CI, CombineInfo &Paired,
1558     MachineBasicBlock::iterator InsertBefore) {
1559   MachineBasicBlock *MBB = CI.I->getParent();
1560   DebugLoc DL = CI.I->getDebugLoc();
1561 
1562   const unsigned Opcode = getNewOpcode(CI, Paired);
1563 
1564   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1565   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1566   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1567 
1568   // Copy to the new source register.
1569   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1570   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1571 
1572   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1573   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1574 
1575   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1576       .add(*Src0)
1577       .addImm(SubRegIdx0)
1578       .add(*Src1)
1579       .addImm(SubRegIdx1);
1580 
1581   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1582                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1583                  .addReg(SrcReg, RegState::Kill);
1584 
1585   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1586     MIB.add(*SAddr);
1587 
1588   MachineInstr *New =
1589     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1590        .addImm(CI.CPol)
1591        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1592 
1593   CI.I->eraseFromParent();
1594   Paired.I->eraseFromParent();
1595   return New;
1596 }
1597 
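// Return the opcode for a merged access covering CI.Width + Paired.Width
// dwords in the common instruction class of CI and Paired. For the classes
// handled by explicit switches below, 0 is returned when no opcode of the
// combined width exists.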
1598 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1599                                             const CombineInfo &Paired) {
1600   const unsigned Width = CI.Width + Paired.Width;
1601 
1602   switch (getCommonInstClass(CI, Paired)) {
1603   default:
1604     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1605     // FIXME: Handle d16 correctly
1606     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1607                                   Width);
1608   case TBUFFER_LOAD:
1609   case TBUFFER_STORE:
1610     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1611                                   Width);
1612 
1613   case UNKNOWN:
1614     llvm_unreachable("Unknown instruction class");
1615   case S_BUFFER_LOAD_IMM:
1616     switch (Width) {
1617     default:
1618       return 0;
1619     case 2:
1620       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1621     case 4:
1622       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1623     case 8:
1624       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1625     }
1626   case GLOBAL_LOAD:
1627     switch (Width) {
1628     default:
1629       return 0;
1630     case 2:
1631       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1632     case 3:
1633       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1634     case 4:
1635       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1636     }
1637   case GLOBAL_LOAD_SADDR:
1638     switch (Width) {
1639     default:
1640       return 0;
1641     case 2:
1642       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1643     case 3:
1644       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1645     case 4:
1646       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1647     }
1648   case GLOBAL_STORE:
1649     switch (Width) {
1650     default:
1651       return 0;
1652     case 2:
1653       return AMDGPU::GLOBAL_STORE_DWORDX2;
1654     case 3:
1655       return AMDGPU::GLOBAL_STORE_DWORDX3;
1656     case 4:
1657       return AMDGPU::GLOBAL_STORE_DWORDX4;
1658     }
1659   case GLOBAL_STORE_SADDR:
1660     switch (Width) {
1661     default:
1662       return 0;
1663     case 2:
1664       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1665     case 3:
1666       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1667     case 4:
1668       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1669     }
1670   case FLAT_LOAD:
1671     switch (Width) {
1672     default:
1673       return 0;
1674     case 2:
1675       return AMDGPU::FLAT_LOAD_DWORDX2;
1676     case 3:
1677       return AMDGPU::FLAT_LOAD_DWORDX3;
1678     case 4:
1679       return AMDGPU::FLAT_LOAD_DWORDX4;
1680     }
1681   case FLAT_STORE:
1682     switch (Width) {
1683     default:
1684       return 0;
1685     case 2:
1686       return AMDGPU::FLAT_STORE_DWORDX2;
1687     case 3:
1688       return AMDGPU::FLAT_STORE_DWORDX3;
1689     case 4:
1690       return AMDGPU::FLAT_STORE_DWORDX4;
1691     }
1692   case MIMG:
1693     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1694            "No overlaps");
1695     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1696   }
1697 }
1698 
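// Return the subregister indices at which CI's and Paired's data live inside
// the merged superregister. Whichever of the two compares lower takes the low
// subregisters; e.g. if CI compares lower with width 1 and Paired has width
// 2, the result is (sub0, sub1_sub2).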
1699 std::pair<unsigned, unsigned>
1700 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1701                                     const CombineInfo &Paired) {
1702   assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
1703                                    CI.Width + Paired.Width)) &&
1704          "No overlaps");
1705 
1706   unsigned Idx0;
1707   unsigned Idx1;
1708 
1709   static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
       AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
       AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
       AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
       AMDGPU::sub4_sub5_sub6_sub7},
1715   };
1716 
1717   assert(CI.Width >= 1 && CI.Width <= 4);
1718   assert(Paired.Width >= 1 && Paired.Width <= 4);
1719 
1720   if (Paired < CI) {
1721     Idx1 = Idxs[0][Paired.Width - 1];
1722     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1723   } else {
1724     Idx0 = Idxs[0][CI.Width - 1];
1725     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1726   }
1727 
1728   return std::make_pair(Idx0, Idx1);
1729 }
1730 
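// Pick the register class for the merged value: an SGPR class sized by the
// combined dword count for S_BUFFER_LOAD_IMM, otherwise an AGPR or VGPR class
// wide enough for 32 * (CI.Width + Paired.Width) bits, matching the data
// register class of the original instruction.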
1731 const TargetRegisterClass *
1732 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1733                                              const CombineInfo &Paired) {
1734   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1735     switch (CI.Width + Paired.Width) {
1736     default:
1737       return nullptr;
1738     case 2:
1739       return &AMDGPU::SReg_64_XEXECRegClass;
1740     case 4:
1741       return &AMDGPU::SGPR_128RegClass;
1742     case 8:
1743       return &AMDGPU::SGPR_256RegClass;
1744     case 16:
1745       return &AMDGPU::SGPR_512RegClass;
1746     }
1747   }
1748 
1749   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1750   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1751              ? TRI->getAGPRClassForBitWidth(BitWidth)
1752              : TRI->getVGPRClassForBitWidth(BitWidth);
1753 }
1754 
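// Merge a pair of MUBUF buffer stores: pack the two data operands into a
// superregister with a REG_SEQUENCE and emit a single wider store.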
1755 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1756     CombineInfo &CI, CombineInfo &Paired,
1757     MachineBasicBlock::iterator InsertBefore) {
1758   MachineBasicBlock *MBB = CI.I->getParent();
1759   DebugLoc DL = CI.I->getDebugLoc();
1760 
1761   const unsigned Opcode = getNewOpcode(CI, Paired);
1762 
1763   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1764   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1765   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1766 
1767   // Copy to the new source register.
1768   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1769   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1770 
1771   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1772   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1773 
1774   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1775       .add(*Src0)
1776       .addImm(SubRegIdx0)
1777       .add(*Src1)
1778       .addImm(SubRegIdx1);
1779 
1780   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1781                  .addReg(SrcReg, RegState::Kill);
1782 
1783   AddressRegs Regs = getRegs(Opcode, *TII);
1784 
1785   if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

1789   // It shouldn't be possible to get this far if the two instructions
1790   // don't have a single memoperand, because MachineInstr::mayAlias()
1791   // will return true if this is the case.
1792   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1793 
1794   MachineInstr *New =
1795     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1796         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
        .addImm(std::min(CI.Offset, Paired.Offset)) // offset
        .addImm(CI.CPol)                            // cpol
        .addImm(0)                                  // tfe
        .addImm(0)                                  // swz
1801         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1802 
1803   CI.I->eraseFromParent();
1804   Paired.I->eraseFromParent();
1805   return New;
1806 }
1807 
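// Return Val as an immediate operand if it is an inline constant; otherwise
// materialize it into a fresh SGPR with S_MOV_B32 and return that register.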
1808 MachineOperand
1809 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1810   APInt V(32, Val, true);
1811   if (TII->isInlineConstant(V))
1812     return MachineOperand::CreateImm(Val);
1813 
1814   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1815   MachineInstr *Mov =
1816   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1817           TII->get(AMDGPU::S_MOV_B32), Reg)
1818     .addImm(Val);
1819   (void)Mov;
1820   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1821   return MachineOperand::CreateReg(Reg, false);
1822 }
1823 
1824 // Compute base address using Addr and return the final register.
1825 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1826                                            const MemAddress &Addr) const {
1827   MachineBasicBlock *MBB = MI.getParent();
1828   MachineBasicBlock::iterator MBBI = MI.getIterator();
1829   DebugLoc DL = MI.getDebugLoc();
1830 
1831   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1832           Addr.Base.LoSubReg) &&
1833          "Expected 32-bit Base-Register-Low!!");
1834 
1835   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1836           Addr.Base.HiSubReg) &&
1837          "Expected 32-bit Base-Register-Hi!!");
1838 
1839   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1840   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1841   MachineOperand OffsetHi =
1842     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1843 
1844   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1845   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1846   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1847 
1848   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1849   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1850   MachineInstr *LoHalf =
1851     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1852       .addReg(CarryReg, RegState::Define)
1853       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1854       .add(OffsetLo)
1855       .addImm(0); // clamp bit
1856   (void)LoHalf;
1857   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1858 
1859   MachineInstr *HiHalf =
1860   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1861     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1862     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1863     .add(OffsetHi)
1864     .addReg(CarryReg, RegState::Kill)
1865     .addImm(0); // clamp bit
1866   (void)HiHalf;
1867   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1868 
1869   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1870   MachineInstr *FullBase =
1871     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1872       .addReg(DestSub0)
1873       .addImm(AMDGPU::sub0)
1874       .addReg(DestSub1)
1875       .addImm(AMDGPU::sub1);
1876   (void)FullBase;
1877   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1878 
1879   return FullDestReg;
1880 }
1881 
// Update the base register and offset of MI to NewBase and NewOffset.
1883 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1884                                                Register NewBase,
1885                                                int32_t NewOffset) const {
1886   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1887   Base->setReg(NewBase);
1888   Base->setIsKill(false);
1889   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1890 }
1891 
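// Return the constant behind Op: either an immediate operand, or a register
// uniquely defined by an S_MOV_B32 of an immediate. Otherwise return None.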
1892 Optional<int32_t>
1893 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1894   if (Op.isImm())
1895     return Op.getImm();
1896 
1897   if (!Op.isReg())
1898     return None;
1899 
1900   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1901   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1902       !Def->getOperand(1).isImm())
1903     return None;
1904 
1905   return Def->getOperand(1).getImm();
1906 }
1907 
// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - 64-bit constant offset
// Expecting base computation of the form:
1912 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1913 //   %LO:vgpr_32, %c:sreg_64_xexec =
1914 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1915 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1916 //   %Base:vreg_64 =
1917 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1918 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1919                                                       MemAddress &Addr) const {
1920   if (!Base.isReg())
1921     return;
1922 
1923   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1924   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1925       || Def->getNumOperands() != 5)
1926     return;
1927 
1928   MachineOperand BaseLo = Def->getOperand(1);
1929   MachineOperand BaseHi = Def->getOperand(3);
1930   if (!BaseLo.isReg() || !BaseHi.isReg())
1931     return;
1932 
1933   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1934   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1935 
1936   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1937       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1938     return;
1939 
1940   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1941   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1942 
1943   auto Offset0P = extractConstOffset(*Src0);
1944   if (Offset0P)
1945     BaseLo = *Src1;
1946   else {
1947     if (!(Offset0P = extractConstOffset(*Src1)))
1948       return;
1949     BaseLo = *Src0;
1950   }
1951 
1952   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1953   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1954 
1955   if (Src0->isImm())
1956     std::swap(Src0, Src1);
1957 
1958   if (!Src1->isImm())
1959     return;
1960 
1961   uint64_t Offset1 = Src1->getImm();
1962   BaseHi = *Src0;
1963 
1964   Addr.Base.LoReg = BaseLo.getReg();
1965   Addr.Base.HiReg = BaseHi.getReg();
1966   Addr.Base.LoSubReg = BaseLo.getSubReg();
1967   Addr.Base.HiSubReg = BaseHi.getSubReg();
1968   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1969 }
1970 
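// Results of processBaseWithConstOffset are cached in Visited so that each
// address is analyzed at most once per block. Instructions selected as
// anchors are recorded in AnchorList and skipped when they are later visited
// themselves. Returns true if any instruction was rewritten.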
1971 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1972     MachineInstr &MI,
1973     MemInfoMap &Visited,
1974     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1975 
1976   if (!(MI.mayLoad() ^ MI.mayStore()))
1977     return false;
1978 
1979   // TODO: Support flat and scratch.
1980   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1981     return false;
1982 
1983   if (MI.mayLoad() &&
1984       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1985     return false;
1986 
1987   if (AnchorList.count(&MI))
1988     return false;
1989 
1990   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1991 
1992   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1993     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1994     return false;
1995   }
1996 
1997   // Step1: Find the base-registers and a 64bit constant offset.
1998   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1999   MemAddress MAddr;
2000   if (Visited.find(&MI) == Visited.end()) {
2001     processBaseWithConstOffset(Base, MAddr);
2002     Visited[&MI] = MAddr;
2003   } else
2004     MAddr = Visited[&MI];
2005 
2006   if (MAddr.Offset == 0) {
2007     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2008                          " constant offsets that can be promoted.\n";);
2009     return false;
2010   }
2011 
2012   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2013              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2014 
  // Step2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset is the farthest
  // from MI's offset while still within a legal 13-bit distance.
2017   // E.g. (64bit loads)
2018   // bb:
2019   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2020   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2021   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2022   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2023   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2024   //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
  // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
  // &a + 8192 as the new base (anchor) because it is at the maximum distance,
  // which presumably lets it accommodate more intermediate addresses.
2030   //
2031   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2032   // (&a + 8192) for load1, load2, load4.
2033   //   addr = &a + 8192
2034   //   load1 = load(addr,       -4096)
2035   //   load2 = load(addr,       -2048)
2036   //   load3 = load(addr,       0)
2037   //   load4 = load(addr,       2048)
2038   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2039   //
2040   MachineInstr *AnchorInst = nullptr;
2041   MemAddress AnchorAddr;
2042   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2043   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2044 
2045   MachineBasicBlock *MBB = MI.getParent();
2046   MachineBasicBlock::iterator E = MBB->end();
2047   MachineBasicBlock::iterator MBBI = MI.getIterator();
2048   ++MBBI;
2049   const SITargetLowering *TLI =
2050     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2051 
2052   for ( ; MBBI != E; ++MBBI) {
2053     MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
2056     if (MINext.getOpcode() != MI.getOpcode() ||
2057         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2058       continue;
2059 
2060     const MachineOperand &BaseNext =
2061       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2062     MemAddress MAddrNext;
2063     if (Visited.find(&MINext) == Visited.end()) {
2064       processBaseWithConstOffset(BaseNext, MAddrNext);
2065       Visited[&MINext] = MAddrNext;
2066     } else
2067       MAddrNext = Visited[&MINext];
2068 
2069     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2070         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2071         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2072         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2073       continue;
2074 
2075     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
2076 
2077     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2078     TargetLoweringBase::AddrMode AM;
2079     AM.HasBaseReg = true;
2080     AM.BaseOffs = Dist;
2081     if (TLI->isLegalGlobalAddressingMode(AM) &&
2082         (uint32_t)std::abs(Dist) > MaxDist) {
2083       MaxDist = std::abs(Dist);
2084 
2085       AnchorAddr = MAddrNext;
2086       AnchorInst = &MINext;
2087     }
2088   }
2089 
2090   if (AnchorInst) {
2091     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2092                AnchorInst->dump());
2093     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2094                <<  AnchorAddr.Offset << "\n\n");
2095 
2096     // Instead of moving up, just re-compute anchor-instruction's base address.
2097     Register Base = computeBase(MI, AnchorAddr);
2098 
2099     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2100     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2101 
2102     for (auto P : InstsWCommonBase) {
2103       TargetLoweringBase::AddrMode AM;
2104       AM.HasBaseReg = true;
2105       AM.BaseOffs = P.second - AnchorAddr.Offset;
2106 
2107       if (TLI->isLegalGlobalAddressingMode(AM)) {
2108         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
2109                    dbgs() << ")"; P.first->dump());
2110         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2111         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
2112       }
2113     }
2114     AnchorList.insert(AnchorInst);
2115     return true;
2116   }
2117 
2118   return false;
2119 }
2120 
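// Append CI to the existing list whose entries share its instruction class,
// AGPR-ness and base address; otherwise start a new single-element list.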
2121 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2122                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2123   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2124     if (AddrList.front().InstClass == CI.InstClass &&
2125         AddrList.front().IsAGPR == CI.IsAGPR &&
2126         AddrList.front().hasSameBaseAddress(*CI.I)) {
2127       AddrList.emplace_back(CI);
2128       return;
2129     }
2130   }
2131 
2132   // Base address not found, so add a new list.
2133   MergeableInsts.emplace_back(1, CI);
2134 }
2135 
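// Walk the instructions in [Begin, End), promoting constant offsets and
// grouping mergeable candidates by base address. Returns the iterator at
// which scanning stopped (just past a memory barrier, or End) together with
// whether any instruction was modified by offset promotion.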
2136 std::pair<MachineBasicBlock::iterator, bool>
2137 SILoadStoreOptimizer::collectMergeableInsts(
2138     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2139     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2140     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2141   bool Modified = false;
2142 
  // Sort potentially mergeable instructions into lists, one per base address.
2144   unsigned Order = 0;
2145   MachineBasicBlock::iterator BlockI = Begin;
2146   for (; BlockI != End; ++BlockI) {
2147     MachineInstr &MI = *BlockI;
2148 
    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
2151     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2152       Modified = true;
2153 
    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can still look for separate merges after this barrier.
2156     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2157       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2158 
2159       // Search will resume after this instruction in a separate merge list.
2160       ++BlockI;
2161       break;
2162     }
2163 
2164     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2165     if (InstClass == UNKNOWN)
2166       continue;
2167 
2168     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2169     int Swizzled =
2170         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2171     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2172       continue;
2173 
2174     CombineInfo CI;
2175     CI.setMI(MI, *this);
2176     CI.Order = Order++;
2177 
2178     if (!CI.hasMergeableAddress(*MRI))
2179       continue;
2180 
2181     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However, we report that ds_write2 shall have only
      //        VGPR data so that machine copy propagation does not create an
      //        illegal instruction with VGPR and AGPR sources. Consequently,
      //        if we create such an instruction, the verifier will complain.
2188       continue;
2189     }
2190 
2191     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2192 
2193     addInstToMergeableList(CI, MergeableInsts);
2194   }
2195 
2196   // At this point we have lists of Mergeable instructions.
2197   //
  // Part 2: Sort each list by offset and drop lists that contain only a
  // single instruction, since at least two instructions are needed for a
  // merge. Sorting places mergeable candidates next to each other; the actual
  // pairing happens later, in optimizeInstsWithSameBaseAddr.
2202 
2203   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2204                                                    E = MergeableInsts.end(); I != E;) {
2205 
2206     std::list<CombineInfo> &MergeList = *I;
2207     if (MergeList.size() <= 1) {
2208       // This means we have found only one instruction with a given address
2209       // that can be merged, and we need at least 2 instructions to do a merge,
2210       // so this list can be discarded.
2211       I = MergeableInsts.erase(I);
2212       continue;
2213     }
2214 
    // Sort the list by offset; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
2218     MergeList.sort(
2219         [] (const CombineInfo &A, const CombineInfo &B) {
2220           return A.Offset < B.Offset;
2221         });
2222     ++I;
2223   }
2224 
2225   return std::make_pair(BlockI, Modified);
2226 }
2227 
2228 // Scan through looking for adjacent LDS operations with constant offsets from
2229 // the same base register. We rely on the scheduler to do the hard work of
2230 // clustering nearby loads, and assume these are all adjacent.
2231 bool SILoadStoreOptimizer::optimizeBlock(
2232                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2233   bool Modified = false;
2234 
2235   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2236                                                    E = MergeableInsts.end(); I != E;) {
2237     std::list<CombineInfo> &MergeList = *I;
2238 
2239     bool OptimizeListAgain = false;
2240     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2241       // We weren't able to make any changes, so delete the list so we don't
2242       // process the same instructions the next time we try to optimize this
2243       // block.
2244       I = MergeableInsts.erase(I);
2245       continue;
2246     }
2247 
2248     Modified = true;
2249 
    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
2252     if (!OptimizeListAgain) {
2253       I = MergeableInsts.erase(I);
2254       continue;
2255     }
2256     OptimizeAgain = true;
2257   }
2258   return Modified;
2259 }
2260 
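// Merge adjacent pairs within a single offset-sorted merge list.
// OptimizeListAgain is set when a merged result is still narrow enough that
// another round of merging could combine it further.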
2261 bool
2262 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2263                                           std::list<CombineInfo> &MergeList,
2264                                           bool &OptimizeListAgain) {
2265   if (MergeList.empty())
2266     return false;
2267 
2268   bool Modified = false;
2269 
2270   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2271        Next = std::next(I)) {
2272 
2273     auto First = I;
2274     auto Second = Next;
2275 
2276     if ((*First).Order > (*Second).Order)
2277       std::swap(First, Second);
2278     CombineInfo &CI = *First;
2279     CombineInfo &Paired = *Second;
2280 
2281     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2282     if (!Where) {
2283       ++I;
2284       continue;
2285     }
2286 
2287     Modified = true;
2288 
2289     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2290 
2291     MachineBasicBlock::iterator NewMI;
2292     switch (CI.InstClass) {
2293     default:
2294       llvm_unreachable("unknown InstClass");
2295       break;
2296     case DS_READ:
2297       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2298       break;
2299     case DS_WRITE:
2300       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2301       break;
2302     case S_BUFFER_LOAD_IMM:
2303       NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
2304       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2305       break;
2306     case BUFFER_LOAD:
2307       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2308       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2309       break;
2310     case BUFFER_STORE:
2311       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2312       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2313       break;
2314     case MIMG:
2315       NewMI = mergeImagePair(CI, Paired, Where->I);
2316       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2317       break;
2318     case TBUFFER_LOAD:
2319       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2320       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2321       break;
2322     case TBUFFER_STORE:
2323       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2324       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2325       break;
2326     case FLAT_LOAD:
2327     case GLOBAL_LOAD:
2328     case GLOBAL_LOAD_SADDR:
2329       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2330       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2331       break;
2332     case FLAT_STORE:
2333     case GLOBAL_STORE:
2334     case GLOBAL_STORE_SADDR:
2335       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2336       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2337       break;
2338     }
2339     CI.setMI(NewMI, *this);
2340     CI.Order = Where->Order;
2341     if (I == Second)
2342       I = Next;
2343 
2344     MergeList.erase(Second);
2345   }
2346 
2347   return Modified;
2348 }
2349 
2350 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2351   if (skipFunction(MF.getFunction()))
2352     return false;
2353 
2354   STM = &MF.getSubtarget<GCNSubtarget>();
2355   if (!STM->loadStoreOptEnabled())
2356     return false;
2357 
2358   TII = STM->getInstrInfo();
2359   TRI = &TII->getRegisterInfo();
2360 
2361   MRI = &MF.getRegInfo();
2362   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2363 
2364   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2365 
2366   bool Modified = false;
2367 
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
2370   SmallPtrSet<MachineInstr *, 4> AnchorList;
2371   MemInfoMap Visited;
2372 
2373   for (MachineBasicBlock &MBB : MF) {
2374     MachineBasicBlock::iterator SectionEnd;
2375     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2376          I = SectionEnd) {
2377       bool CollectModified;
2378       std::list<std::list<CombineInfo>> MergeableInsts;
2379 
      // First pass: Collect a list of all instructions we know how to merge
      // in a subset of the block.
2382       std::tie(SectionEnd, CollectModified) =
2383           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2384 
2385       Modified |= CollectModified;
2386 
2387       do {
2388         OptimizeAgain = false;
2389         Modified |= optimizeBlock(MergeableInsts);
2390       } while (OptimizeAgain);
2391     }
2392 
2393     Visited.clear();
2394     AnchorList.clear();
2395   }
2396 
2397   return Modified;
2398 }
2399