//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// If a load follows a store and reloads data that the store has written to
// memory, Intel microarchitectures can in many cases forward the data
// directly from the store to the load. This "store forwarding" saves cycles
// by enabling the load to obtain the data directly instead of accessing it
// from cache or memory.
// A "store forward block" occurs when a store cannot be forwarded to the
// load. The most typical case on the Intel Core microarchitecture is that a
// small store cannot be forwarded to a larger load.
// The estimated penalty for a store forward block is ~13 cycles.
//
// This pass tries to recognize and handle cases where a "store forward
// block" is created by the compiler when lowering memcpy calls to a sequence
// of a load and a store.
//
// The pass currently only handles cases where the memcpy is lowered to
// XMM/YMM registers; it tries to break the memcpy into smaller copies.
// Breaking the memcpy should be possible since there is no atomicity
// guarantee for loads and stores to XMM/YMM.
//
// It could be better for performance to solve the problem by loading
// to XMM/YMM, then inserting the partial store before storing back from
// XMM/YMM to memory, but this would result in a more conservative
// optimization since it requires us to prove that all memory accesses
// between the blocking store and the load either must alias or don't alias
// before we can move the store, whereas the transformation done here is
// correct regardless of other memory accesses.
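//
// For illustration, a simplified sketch of the transformation (hypothetical
// registers and displacements; the exact opcodes depend on the subtarget):
//   movl    %ecx, 4(%rdi)      # 4-byte store into bytes [4, 8)
//   movups  (%rdi), %xmm0      # 16-byte load of [0, 16) is blocked
//   movups  %xmm0, (%rsi)      # 16-byte store
// becomes a sequence of smaller copies that avoids the blocked forwarding:
//   movl    %ecx, 4(%rdi)
//   movl    (%rdi), %eax       # copy [0, 4)
//   movl    %eax, (%rsi)
//   movl    4(%rdi), %eax      # copy the blocked [4, 8) at the store's size
//   movl    %eax, 4(%rsi)
//   movq    8(%rdi), %rax      # copy the remaining [8, 16)
//   movq    %rax, 8(%rsi)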
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <map>

using namespace llvm;

#define DEBUG_TYPE "x86-avoid-SFB"

static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
    "x86-disable-avoid-SFB", cl::Hidden,
    cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));

static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);

namespace {

using DisplacementSizeMap = std::map<int64_t, unsigned>;

class X86AvoidSFBPass : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AAResultsWrapperPass>();
  }

private:
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const X86RegisterInfo *TRI = nullptr;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  SmallVector<MachineInstr *, 2> ForRemoval;
  AliasAnalysis *AA = nullptr;

  /// Find pairs of loads followed by stores to memory which look
  /// like a memcpy, and record them in BlockedLoadsStoresPairs.
  void findPotentiallyBlockedCopies(MachineFunction &MF);
  /// Break the memcpy's load and store into smaller copies
  /// such that each memory load that was blocked by a smaller store
  /// would now be copied separately.
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  /// Break a copy of size Size to smaller copies.
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);

  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
                 MachineInstr *StoreInst, unsigned NStoreOpcode,
                 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
                 int64_t SMMOffset);

  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;

  unsigned getRegSizeInBytes(MachineInstr *Inst);
};

} // end anonymous namespace

char X86AvoidSFBPass::ID = 0;

INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
                      "X86 Avoid Store Forwarding Blocks", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
                    "X86 Avoid Store Forwarding Blocks", false, false)

FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}

static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}

static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}

static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}

static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}

static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}

static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;

static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}

static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Instruction Opcode");
  }
  return 0;
}

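// X86 memory references are encoded as five consecutive operands (base,
// scale, index, displacement, segment). getAddrOffset returns the index of
// the first of them, accounting for any operands that precede the memory
// reference, such as a load's destination register.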
static int getAddrOffset(const MachineInstr *MI) {
  const MCInstrDesc &Desc = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Desc);
  return AddrOffset;
}

static MachineOperand &getBaseOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrBaseReg);
}

static MachineOperand &getDispOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrDisp);
}

// Relevant addressing modes contain only base register and immediate
// displacement or frameindex and immediate displacement.
// TODO: Consider expanding to other addressing modes in the future
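// For example (hypothetical operands), `movq 8(%rdi), %rax` qualifies, while
// a scaled-index form such as `movq (%rdi,%rcx,4), %rax` does not.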
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  const MachineOperand &Base = getBaseOperand(MI);
  const MachineOperand &Disp = getDispOperand(MI);
  const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  const MachineOperand &Segment =
      MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}

// Collect potentially blocking stores.
// Limit the number of instructions we inspect backwards, since the effect of
// a store forwarding block won't be visible if the store and load
// instructions have enough instructions in between to keep the core busy.
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If we didn't reach the instruction limit, try the predecessor blocks.
  // Ideally we should traverse the predecessor blocks in depth with some
  // coloring algorithm, but for now let's just look at the first order
  // predecessors.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
                                          PE = MBB->pred_end();
         PB != PE; ++PB) {
      MachineBasicBlock *PMBB = *PB;
      int PredCount = 0;
      for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
                                               PME = PMBB->rend();
           PBInst != PME; ++PBInst) {
        if (PBInst->isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst->getDesc().isCall())
          break;
        PotentialBlockers.push_back(&*PBInst);
      }
    }
  }
  return PotentialBlockers;
}

void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();

  Register Reg1 = MRI->createVirtualRegister(
      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  LLVM_DEBUG(NewLoad->dump());
  // If the load and store are consecutive, use the LoadInst location to
  // reduce register pressure.
  MachineInstr *StInst = StoreInst;
  auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                                MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
  NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
  LLVM_DEBUG(NewStore->dump());
}

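// For illustration (hypothetical size): a remaining Size of 13 bytes with an
// XMM source decomposes greedily into an 8-byte, a 4-byte and a 1-byte copy.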
void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
  while (Size > 0) {
    if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
      Size = Size - MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
      LdDisp += MOV128SZ;
      StDisp += MOV128SZ;
      LMMOffset += MOV128SZ;
      SMMOffset += MOV128SZ;
      continue;
    }
    if (Size - MOV64SZ >= 0) {
      Size = Size - MOV64SZ;
      buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
                MOV64SZ, LMMOffset, SMMOffset);
      LdDisp += MOV64SZ;
      StDisp += MOV64SZ;
      LMMOffset += MOV64SZ;
      SMMOffset += MOV64SZ;
      continue;
    }
    if (Size - MOV32SZ >= 0) {
      Size = Size - MOV32SZ;
      buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
                MOV32SZ, LMMOffset, SMMOffset);
      LdDisp += MOV32SZ;
      StDisp += MOV32SZ;
      LMMOffset += MOV32SZ;
      SMMOffset += MOV32SZ;
      continue;
    }
    if (Size - MOV16SZ >= 0) {
      Size = Size - MOV16SZ;
      buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
                MOV16SZ, LMMOffset, SMMOffset);
      LdDisp += MOV16SZ;
      StDisp += MOV16SZ;
      LMMOffset += MOV16SZ;
      SMMOffset += MOV16SZ;
      continue;
    }
    if (Size - MOV8SZ >= 0) {
      Size = Size - MOV8SZ;
      buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
                MOV8SZ, LMMOffset, SMMOffset);
      LdDisp += MOV8SZ;
      StDisp += MOV8SZ;
      LMMOffset += MOV8SZ;
      SMMOffset += MOV8SZ;
      continue;
    }
  }
  assert(Size == 0 && "Wrong size division");
}

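// The original load and store are removed once the partial copies are built,
// so kill flags on their base registers must migrate to the last of the new
// instructions that use those registers.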
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto *StorePrevNonDbgInstr =
      prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                 LoadInst->getParent()->instr_begin())
          .getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store to xmm/ymm were consecutive
    // then the partial copies were also created in
    // a consecutive order to reduce register pressure,
    // and the location of the last load is before the last store.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}

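// Conservatively assume aliasing when either operand lacks an IR value.
// Otherwise, query AA with lengths measured from the smaller of the two
// offsets, so each location spans all bytes from that common start to its
// own end.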
bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
                            const MachineMemOperand &Op2) const {
  if (!Op1.getValue() || !Op2.getValue())
    return true;

  int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
  int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
  int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;

  AliasResult AAResult =
      AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
                MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
  return AAResult != NoAlias;
}

void X86AvoidSFBPass::findPotentiallyBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      Register DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
           UI != UE;) {
        MachineOperand &StoreMO = *UI++;
        MachineInstr &StoreMI = *StoreMO.getParent();
        // Skip cases where the memcpy may overlap.
        if (StoreMI.getParent() == MI.getParent() &&
            isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
            isRelevantAddressingMode(&MI) &&
            isRelevantAddressingMode(&StoreMI) &&
            MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
          if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
            BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
        }
      }
    }
}

unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
  const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
                                     *LoadInst->getParent()->getParent());
  return TRI->getRegSizeInBits(*TRC) / 8;
}

void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;

  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Build copies up to the current blocking store's displacement.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    // Build a copy for the current blocking store.
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
                SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              SMMOffset);
}

static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  const MachineOperand &LoadBase = getBaseOperand(LoadInst);
  const MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}

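// A store blocks the load when the stored bytes lie entirely within the
// loaded range; e.g. (hypothetical values) a 4-byte store at displacement 20
// blocks a 16-byte load at displacement 16, since [20, 24) is inside [16, 32).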
static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}

// Keep track of all stores blocking a load.
static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
                                int64_t DispImm, unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Choose the smallest blocking store starting at this displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;
  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}

// Remove blocking stores contained in each other.
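// For illustration (hypothetical displacement:size pairs), {0: 8, 4: 2}
// collapses to {4: 2}: the entry ending at offset 6 does not extend past the
// previous entry's end at 8, so the previous entry is popped off the stack.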
static void
removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;

  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (!DispSizeStack.empty()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}

bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
      !MF.getSubtarget<X86Subtarget>().is64Bit())
    return false;

  MRI = &MF.getRegInfo();
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n");
  // Look for a load then a store to XMM/YMM which look like a memcpy.
  findPotentiallyBlockedCopies(MF);

  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;

    SmallVector<MachineInstr *, 2> PotentialBlockers =
        findPotentialBlockers(LoadInst);
    for (auto *PBInst : PotentialBlockers) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
      // This check doesn't cover all cases, but it will suffice for now.
      // TODO: take branch probability into consideration; if the blocking
      // store is in an unreached block, breaking the memcpy could lose
      // performance.
      if (hasSameBaseOpValue(LoadInst, PBInst) &&
          isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }

    if (BlockingStoresDispSizeMap.empty())
      continue;

    // We found a store forward block; break the memcpy's load and store
    // into smaller copies such that each smaller store that was causing
    // the block is now copied separately.
    MachineInstr *StoreInst = LoadStoreInstPair.second;
    LLVM_DEBUG(dbgs() << "Blocked load and store instructions:\n");
    LLVM_DEBUG(LoadInst->dump());
    LLVM_DEBUG(StoreInst->dump());
    LLVM_DEBUG(dbgs() << "Replaced with:\n");
    removeRedundantBlockingStores(BlockingStoresDispSizeMap);
    breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
    updateKillStatus(LoadInst, StoreInst);
    ForRemoval.push_back(LoadInst);
    ForRemoval.push_back(StoreInst);
    Changed = true;
  }
  for (auto *RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  }
  ForRemoval.clear();
  BlockedLoadsStoresPairs.clear();
  LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n");

  return Changed;
}