15ffd83dbSDimitry Andric //===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // If a load follows a store and reloads data that the store has written to
100b57cec5SDimitry Andric // memory, Intel microarchitectures can in many cases forward the data directly
// from the store to the load. This "store forwarding" saves cycles by enabling
120b57cec5SDimitry Andric // the load to directly obtain the data instead of accessing the data from
130b57cec5SDimitry Andric // cache or memory.
// A "store forward block" occurs in cases where a store cannot be forwarded
// to the load. The most typical case of a store forward block on Intel Core
// microarchitecture is that a small store cannot be forwarded to a large
// load. The estimated penalty for a store forward block is ~13 cycles.
180b57cec5SDimitry Andric //
190b57cec5SDimitry Andric // This pass tries to recognize and handle cases where "store forward block"
200b57cec5SDimitry Andric // is created by the compiler when lowering memcpy calls to a sequence
210b57cec5SDimitry Andric // of a load and a store.
220b57cec5SDimitry Andric //
230b57cec5SDimitry Andric // The pass currently only handles cases where memcpy is lowered to
240b57cec5SDimitry Andric // XMM/YMM registers, it tries to break the memcpy into smaller copies.
// Breaking the memcpy should be possible since there is no atomicity
260b57cec5SDimitry Andric // guarantee for loads and stores to XMM/YMM.
270b57cec5SDimitry Andric //
280b57cec5SDimitry Andric // It could be better for performance to solve the problem by loading
290b57cec5SDimitry Andric // to XMM/YMM then inserting the partial store before storing back from XMM/YMM
300b57cec5SDimitry Andric // to memory, but this will result in a more conservative optimization since it
310b57cec5SDimitry Andric // requires we prove that all memory accesses between the blocking store and the
320b57cec5SDimitry Andric // load must alias/don't alias before we can move the store, whereas the
// transformation done here is correct regardless of other memory accesses.
340b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
350b57cec5SDimitry Andric 
365ffd83dbSDimitry Andric #include "X86.h"
370b57cec5SDimitry Andric #include "X86InstrInfo.h"
380b57cec5SDimitry Andric #include "X86Subtarget.h"
398bcb0991SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
400b57cec5SDimitry Andric #include "llvm/CodeGen/MachineBasicBlock.h"
410b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunction.h"
420b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
430b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstr.h"
440b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstrBuilder.h"
450b57cec5SDimitry Andric #include "llvm/CodeGen/MachineOperand.h"
460b57cec5SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h"
470b57cec5SDimitry Andric #include "llvm/IR/DebugInfoMetadata.h"
480b57cec5SDimitry Andric #include "llvm/IR/DebugLoc.h"
490b57cec5SDimitry Andric #include "llvm/IR/Function.h"
50480093f4SDimitry Andric #include "llvm/InitializePasses.h"
510b57cec5SDimitry Andric #include "llvm/MC/MCInstrDesc.h"
520b57cec5SDimitry Andric 
using namespace llvm;

#define DEBUG_TYPE "x86-avoid-SFB"

// Command-line escape hatch: disables the whole fixup without rebuilding.
static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
    "x86-disable-avoid-SFB", cl::Hidden,
    cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));

// Budget for the backward scan that looks for blocking stores. If the store
// and load are far enough apart, the stall is hidden by other work, so
// scanning further is not worthwhile (see findPotentialBlockers).
static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);
660b57cec5SDimitry Andric 
namespace {

// Maps a blocking store's displacement (relative to the shared base) to the
// number of bytes that store writes. Describes all smaller stores that
// overlap the wide load of a candidate memcpy pair.
using DisplacementSizeMap = std::map<int64_t, unsigned>;

class X86AvoidSFBPass : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBPass() : MachineFunctionPass(ID) { }

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
    // Alias analysis is used to prove which stores overlap the wide load.
    AU.addRequired<AAResultsWrapperPass>();
  }

private:
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const X86RegisterInfo *TRI = nullptr;
  // (load, store) pairs that look like a lowered memcpy whose load may be
  // blocked by a preceding smaller store; filled per function and then
  // processed in order.
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  // Original wide instructions to erase once their split replacements exist.
  SmallVector<MachineInstr *, 2> ForRemoval;
  AliasAnalysis *AA = nullptr;

  /// Returns couples of Load then Store to memory which look
  ///  like a memcpy.
  void findPotentiallylBlockedCopies(MachineFunction &MF);
  /// Break the memcpy's load and store into smaller copies
  /// such that each memory load that was blocked by a smaller store
  /// would now be copied separately.
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  /// Break a copy of size Size to smaller copies.
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);

  /// Emit one narrow load/store pair of Size bytes at the given
  /// displacements, cloning and offsetting the original memory operands.
  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
                 MachineInstr *StoreInst, unsigned NStoreOpcode,
                 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
                 int64_t SMMOffset);

  /// Returns true when the two memory operands may reference the same memory,
  /// per alias analysis.
  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;

  /// Size in bytes of the register loaded/stored by Inst (16 for XMM,
  /// 32 for YMM).
  unsigned getRegSizeInBytes(MachineInstr *Inst);
};

} // end anonymous namespace
1200b57cec5SDimitry Andric 
1210b57cec5SDimitry Andric char X86AvoidSFBPass::ID = 0;
1220b57cec5SDimitry Andric 
1230b57cec5SDimitry Andric INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE, "Machine code sinking",
1240b57cec5SDimitry Andric                       false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)1250b57cec5SDimitry Andric INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
1260b57cec5SDimitry Andric INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE, "Machine code sinking", false,
1270b57cec5SDimitry Andric                     false)
1280b57cec5SDimitry Andric 
1290b57cec5SDimitry Andric FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
1300b57cec5SDimitry Andric   return new X86AvoidSFBPass();
1310b57cec5SDimitry Andric }
1320b57cec5SDimitry Andric 
isXMMLoadOpcode(unsigned Opcode)1330b57cec5SDimitry Andric static bool isXMMLoadOpcode(unsigned Opcode) {
1340b57cec5SDimitry Andric   return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
1350b57cec5SDimitry Andric          Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
1360b57cec5SDimitry Andric          Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
1370b57cec5SDimitry Andric          Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
1380b57cec5SDimitry Andric          Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
1390b57cec5SDimitry Andric          Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
1400b57cec5SDimitry Andric          Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
1410b57cec5SDimitry Andric          Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
1420b57cec5SDimitry Andric }
isYMMLoadOpcode(unsigned Opcode)1430b57cec5SDimitry Andric static bool isYMMLoadOpcode(unsigned Opcode) {
1440b57cec5SDimitry Andric   return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
1450b57cec5SDimitry Andric          Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
1460b57cec5SDimitry Andric          Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
1470b57cec5SDimitry Andric          Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
1480b57cec5SDimitry Andric          Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
1490b57cec5SDimitry Andric          Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
1500b57cec5SDimitry Andric          Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
1510b57cec5SDimitry Andric }
1520b57cec5SDimitry Andric 
isPotentialBlockedMemCpyLd(unsigned Opcode)1530b57cec5SDimitry Andric static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
1540b57cec5SDimitry Andric   return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
1550b57cec5SDimitry Andric }
1560b57cec5SDimitry Andric 
// Returns true when (LdOpcode, StOpcode) form a matching load/store pair of
// the kind the compiler emits when lowering memcpy: same element type and
// width, with either the aligned or unaligned store variant accepted for
// each load.
static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}
2080b57cec5SDimitry Andric 
isPotentialBlockingStoreInst(unsigned Opcode,unsigned LoadOpcode)209e8d8bef9SDimitry Andric static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
2100b57cec5SDimitry Andric   bool PBlock = false;
2110b57cec5SDimitry Andric   PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
2120b57cec5SDimitry Andric             Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
2130b57cec5SDimitry Andric             Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
2140b57cec5SDimitry Andric             Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
2150b57cec5SDimitry Andric   if (isYMMLoadOpcode(LoadOpcode))
2160b57cec5SDimitry Andric     PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
2170b57cec5SDimitry Andric               Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
2180b57cec5SDimitry Andric               Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
2190b57cec5SDimitry Andric               Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
2200b57cec5SDimitry Andric               Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
2210b57cec5SDimitry Andric               Opcode == X86::VMOVDQU64Z128mr ||
2220b57cec5SDimitry Andric               Opcode == X86::VMOVDQA64Z128mr ||
2230b57cec5SDimitry Andric               Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
2240b57cec5SDimitry Andric   return PBlock;
2250b57cec5SDimitry Andric }
2260b57cec5SDimitry Andric 
// Chunk sizes, in bytes, of the narrow copies emitted when a blocked wide
// copy is broken up (XMM, GPR64/32/16/8).
static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;
2320b57cec5SDimitry Andric 
// Maps a 256-bit (YMM) load opcode to the 128-bit (XMM) load used for its
// halves. The result is always the *unaligned* variant, since a split copy
// may start at a displacement that breaks the original alignment.
static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}
2610b57cec5SDimitry Andric 
getYMMtoXMMStoreOpcode(unsigned StoreOpcode)2620b57cec5SDimitry Andric static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
2630b57cec5SDimitry Andric   switch (StoreOpcode) {
2640b57cec5SDimitry Andric   case X86::VMOVUPSYmr:
2650b57cec5SDimitry Andric   case X86::VMOVAPSYmr:
2660b57cec5SDimitry Andric     return X86::VMOVUPSmr;
2670b57cec5SDimitry Andric   case X86::VMOVUPDYmr:
2680b57cec5SDimitry Andric   case X86::VMOVAPDYmr:
2690b57cec5SDimitry Andric     return X86::VMOVUPDmr;
2700b57cec5SDimitry Andric   case X86::VMOVDQUYmr:
2710b57cec5SDimitry Andric   case X86::VMOVDQAYmr:
2720b57cec5SDimitry Andric     return X86::VMOVDQUmr;
2730b57cec5SDimitry Andric   case X86::VMOVUPSZ256mr:
2740b57cec5SDimitry Andric   case X86::VMOVAPSZ256mr:
2750b57cec5SDimitry Andric     return X86::VMOVUPSZ128mr;
2760b57cec5SDimitry Andric   case X86::VMOVUPDZ256mr:
2770b57cec5SDimitry Andric   case X86::VMOVAPDZ256mr:
2780b57cec5SDimitry Andric     return X86::VMOVUPDZ128mr;
2790b57cec5SDimitry Andric   case X86::VMOVDQU64Z256mr:
2800b57cec5SDimitry Andric   case X86::VMOVDQA64Z256mr:
2810b57cec5SDimitry Andric     return X86::VMOVDQU64Z128mr;
2820b57cec5SDimitry Andric   case X86::VMOVDQU32Z256mr:
2830b57cec5SDimitry Andric   case X86::VMOVDQA32Z256mr:
2840b57cec5SDimitry Andric     return X86::VMOVDQU32Z128mr;
2850b57cec5SDimitry Andric   default:
2860b57cec5SDimitry Andric     llvm_unreachable("Unexpected Load Instruction Opcode");
2870b57cec5SDimitry Andric   }
2880b57cec5SDimitry Andric   return 0;
2890b57cec5SDimitry Andric }
2900b57cec5SDimitry Andric 
getAddrOffset(const MachineInstr * MI)2915ffd83dbSDimitry Andric static int getAddrOffset(const MachineInstr *MI) {
2920b57cec5SDimitry Andric   const MCInstrDesc &Descl = MI->getDesc();
2930b57cec5SDimitry Andric   int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
2940b57cec5SDimitry Andric   assert(AddrOffset != -1 && "Expected Memory Operand");
2950b57cec5SDimitry Andric   AddrOffset += X86II::getOperandBias(Descl);
2960b57cec5SDimitry Andric   return AddrOffset;
2970b57cec5SDimitry Andric }
2980b57cec5SDimitry Andric 
getBaseOperand(MachineInstr * MI)2990b57cec5SDimitry Andric static MachineOperand &getBaseOperand(MachineInstr *MI) {
3000b57cec5SDimitry Andric   int AddrOffset = getAddrOffset(MI);
3010b57cec5SDimitry Andric   return MI->getOperand(AddrOffset + X86::AddrBaseReg);
3020b57cec5SDimitry Andric }
3030b57cec5SDimitry Andric 
getDispOperand(MachineInstr * MI)3040b57cec5SDimitry Andric static MachineOperand &getDispOperand(MachineInstr *MI) {
3050b57cec5SDimitry Andric   int AddrOffset = getAddrOffset(MI);
3060b57cec5SDimitry Andric   return MI->getOperand(AddrOffset + X86::AddrDisp);
3070b57cec5SDimitry Andric }
3080b57cec5SDimitry Andric 
3090b57cec5SDimitry Andric // Relevant addressing modes contain only base register and immediate
3100b57cec5SDimitry Andric // displacement or frameindex and immediate displacement.
3110b57cec5SDimitry Andric // TODO: Consider expanding to other addressing modes in the future
isRelevantAddressingMode(MachineInstr * MI)3120b57cec5SDimitry Andric static bool isRelevantAddressingMode(MachineInstr *MI) {
3130b57cec5SDimitry Andric   int AddrOffset = getAddrOffset(MI);
3145ffd83dbSDimitry Andric   const MachineOperand &Base = getBaseOperand(MI);
3155ffd83dbSDimitry Andric   const MachineOperand &Disp = getDispOperand(MI);
3165ffd83dbSDimitry Andric   const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
3175ffd83dbSDimitry Andric   const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
3185ffd83dbSDimitry Andric   const MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
3190b57cec5SDimitry Andric 
3200b57cec5SDimitry Andric   if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
3210b57cec5SDimitry Andric     return false;
3220b57cec5SDimitry Andric   if (!Disp.isImm())
3230b57cec5SDimitry Andric     return false;
3240b57cec5SDimitry Andric   if (Scale.getImm() != 1)
3250b57cec5SDimitry Andric     return false;
3260b57cec5SDimitry Andric   if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
3270b57cec5SDimitry Andric     return false;
3280b57cec5SDimitry Andric   if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
3290b57cec5SDimitry Andric     return false;
3300b57cec5SDimitry Andric   return true;
3310b57cec5SDimitry Andric }
3320b57cec5SDimitry Andric 
// Collect potentially blocking stores.
// Limit the number of instructions backwards we want to inspect
// since the effect of store block won't be visible if the store
// and load instructions have enough instructions in between to
// keep the core busy.
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  // Walk backwards from the instruction just before the load, within its
  // own basic block.
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    // Meta instructions (debug values etc.) don't execute, so they don't
    // consume inspection budget.
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    // Stop at calls: anything before a call is too far away to matter.
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If we didn't get to the instructions limit try predecessing blocks.
  // Ideally we should traverse the predecessor blocks in depth with some
  // coloring algorithm, but for now let's just look at the first order
  // predecessors.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock *PMBB : MBB->predecessors()) {
      int PredCount = 0;
      // Each predecessor gets the remaining budget independently; scan
      // from its end (the instructions closest to the load).
      for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
        if (PBInst.isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst.getDesc().isCall())
          break;
        PotentialBlockers.push_back(&PBInst);
      }
    }
  }
  return PotentialBlockers;
}
3790b57cec5SDimitry Andric 
// Emit one narrow load/store pair of Size bytes: a load of NLoadOpcode from
// [LoadBase + LoadDisp] into a fresh virtual register, then a store of
// NStoreOpcode to [StoreBase + StoreDisp]. The original wide memory operands
// are cloned with the given offsets so alias info is preserved.
void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();

  Register Reg1 = MRI->createVirtualRegister(
      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
  // Address operands follow the x86 five-operand form:
  // base, scale=1, index=none, disp, segment=none.
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  // The base register is reused by every narrow copy (and by the original
  // instructions until they're removed), so it must not be marked killed.
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  LLVM_DEBUG(NewLoad->dump());
  // If the load and store are consecutive, use the loadInst location to
  // reduce register pressure.
  MachineInstr *StInst = StoreInst;
  auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                                MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  // Propagate the kill flag of the original stored value onto the new
  // store's source register operand.
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
  NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
  LLVM_DEBUG(NewStore->dump());
}
4300b57cec5SDimitry Andric 
buildCopies(int Size,MachineInstr * LoadInst,int64_t LdDispImm,MachineInstr * StoreInst,int64_t StDispImm,int64_t LMMOffset,int64_t SMMOffset)4310b57cec5SDimitry Andric void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
4320b57cec5SDimitry Andric                                   int64_t LdDispImm, MachineInstr *StoreInst,
4330b57cec5SDimitry Andric                                   int64_t StDispImm, int64_t LMMOffset,
4340b57cec5SDimitry Andric                                   int64_t SMMOffset) {
4350b57cec5SDimitry Andric   int LdDisp = LdDispImm;
4360b57cec5SDimitry Andric   int StDisp = StDispImm;
4370b57cec5SDimitry Andric   while (Size > 0) {
4380b57cec5SDimitry Andric     if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
4390b57cec5SDimitry Andric       Size = Size - MOV128SZ;
4400b57cec5SDimitry Andric       buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
4410b57cec5SDimitry Andric                 StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
4420b57cec5SDimitry Andric                 StDisp, MOV128SZ, LMMOffset, SMMOffset);
4430b57cec5SDimitry Andric       LdDisp += MOV128SZ;
4440b57cec5SDimitry Andric       StDisp += MOV128SZ;
4450b57cec5SDimitry Andric       LMMOffset += MOV128SZ;
4460b57cec5SDimitry Andric       SMMOffset += MOV128SZ;
4470b57cec5SDimitry Andric       continue;
4480b57cec5SDimitry Andric     }
4490b57cec5SDimitry Andric     if (Size - MOV64SZ >= 0) {
4500b57cec5SDimitry Andric       Size = Size - MOV64SZ;
4510b57cec5SDimitry Andric       buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
4520b57cec5SDimitry Andric                 MOV64SZ, LMMOffset, SMMOffset);
4530b57cec5SDimitry Andric       LdDisp += MOV64SZ;
4540b57cec5SDimitry Andric       StDisp += MOV64SZ;
4550b57cec5SDimitry Andric       LMMOffset += MOV64SZ;
4560b57cec5SDimitry Andric       SMMOffset += MOV64SZ;
4570b57cec5SDimitry Andric       continue;
4580b57cec5SDimitry Andric     }
4590b57cec5SDimitry Andric     if (Size - MOV32SZ >= 0) {
4600b57cec5SDimitry Andric       Size = Size - MOV32SZ;
4610b57cec5SDimitry Andric       buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
4620b57cec5SDimitry Andric                 MOV32SZ, LMMOffset, SMMOffset);
4630b57cec5SDimitry Andric       LdDisp += MOV32SZ;
4640b57cec5SDimitry Andric       StDisp += MOV32SZ;
4650b57cec5SDimitry Andric       LMMOffset += MOV32SZ;
4660b57cec5SDimitry Andric       SMMOffset += MOV32SZ;
4670b57cec5SDimitry Andric       continue;
4680b57cec5SDimitry Andric     }
4690b57cec5SDimitry Andric     if (Size - MOV16SZ >= 0) {
4700b57cec5SDimitry Andric       Size = Size - MOV16SZ;
4710b57cec5SDimitry Andric       buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
4720b57cec5SDimitry Andric                 MOV16SZ, LMMOffset, SMMOffset);
4730b57cec5SDimitry Andric       LdDisp += MOV16SZ;
4740b57cec5SDimitry Andric       StDisp += MOV16SZ;
4750b57cec5SDimitry Andric       LMMOffset += MOV16SZ;
4760b57cec5SDimitry Andric       SMMOffset += MOV16SZ;
4770b57cec5SDimitry Andric       continue;
4780b57cec5SDimitry Andric     }
4790b57cec5SDimitry Andric     if (Size - MOV8SZ >= 0) {
4800b57cec5SDimitry Andric       Size = Size - MOV8SZ;
4810b57cec5SDimitry Andric       buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
4820b57cec5SDimitry Andric                 MOV8SZ, LMMOffset, SMMOffset);
4830b57cec5SDimitry Andric       LdDisp += MOV8SZ;
4840b57cec5SDimitry Andric       StDisp += MOV8SZ;
4850b57cec5SDimitry Andric       LMMOffset += MOV8SZ;
4860b57cec5SDimitry Andric       SMMOffset += MOV8SZ;
4870b57cec5SDimitry Andric       continue;
4880b57cec5SDimitry Andric     }
4890b57cec5SDimitry Andric   }
4900b57cec5SDimitry Andric   assert(Size == 0 && "Wrong size division");
4910b57cec5SDimitry Andric }
4920b57cec5SDimitry Andric 
// After the wide load/store pair has been broken into a sequence of narrow
// copies, transfer the kill flags of the original base-address registers
// onto the last emitted narrow instruction that still uses each register,
// so liveness information stays correct once the originals are erased.
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  // Last non-debug instruction preceding the original store; comparing it
  // against LoadInst tells us whether the load/store pair is adjacent
  // (ignoring debug instructions) in the block.
  auto *StorePrevNonDbgInstr =
      prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                 LoadInst->getParent()->instr_begin())
          .getNodePtr();
  if (LoadBase.isReg()) {
    // NOTE(review): assumes the narrow copies were inserted immediately
    // before LoadInst, so the last narrow load is LoadInst's predecessor.
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store to xmm/ymm were consecutive
    // then the partial copies were also created in
    // a consecutive order to reduce register pressure,
    // and the location of the last load is before the last store.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    // The last narrow store sits just before the original store — or just
    // before the original load when the pair is adjacent (interleaved
    // load/store ordering places the final store there).
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}
5170b57cec5SDimitry Andric 
alias(const MachineMemOperand & Op1,const MachineMemOperand & Op2) const5180b57cec5SDimitry Andric bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
5190b57cec5SDimitry Andric                             const MachineMemOperand &Op2) const {
5200b57cec5SDimitry Andric   if (!Op1.getValue() || !Op2.getValue())
5210b57cec5SDimitry Andric     return true;
5220b57cec5SDimitry Andric 
5230b57cec5SDimitry Andric   int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
5240b57cec5SDimitry Andric   int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
5250b57cec5SDimitry Andric   int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
5260b57cec5SDimitry Andric 
527fe6060f1SDimitry Andric   return !AA->isNoAlias(
528fe6060f1SDimitry Andric       MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
5290b57cec5SDimitry Andric       MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
5300b57cec5SDimitry Andric }
5310b57cec5SDimitry Andric 
findPotentiallylBlockedCopies(MachineFunction & MF)5320b57cec5SDimitry Andric void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
5330b57cec5SDimitry Andric   for (auto &MBB : MF)
5340b57cec5SDimitry Andric     for (auto &MI : MBB) {
5350b57cec5SDimitry Andric       if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
5360b57cec5SDimitry Andric         continue;
5370b57cec5SDimitry Andric       int DefVR = MI.getOperand(0).getReg();
5380b57cec5SDimitry Andric       if (!MRI->hasOneNonDBGUse(DefVR))
5390b57cec5SDimitry Andric         continue;
540349cc55cSDimitry Andric       for (MachineOperand &StoreMO :
541349cc55cSDimitry Andric            llvm::make_early_inc_range(MRI->use_nodbg_operands(DefVR))) {
5420b57cec5SDimitry Andric         MachineInstr &StoreMI = *StoreMO.getParent();
5430b57cec5SDimitry Andric         // Skip cases where the memcpy may overlap.
5440b57cec5SDimitry Andric         if (StoreMI.getParent() == MI.getParent() &&
5450b57cec5SDimitry Andric             isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
5460b57cec5SDimitry Andric             isRelevantAddressingMode(&MI) &&
5475ffd83dbSDimitry Andric             isRelevantAddressingMode(&StoreMI) &&
5485ffd83dbSDimitry Andric             MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
5490b57cec5SDimitry Andric           if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
5500b57cec5SDimitry Andric             BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
5510b57cec5SDimitry Andric         }
5520b57cec5SDimitry Andric       }
5530b57cec5SDimitry Andric     }
5540b57cec5SDimitry Andric }
5550b57cec5SDimitry Andric 
getRegSizeInBytes(MachineInstr * LoadInst)5560b57cec5SDimitry Andric unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
5575ffd83dbSDimitry Andric   const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
5580b57cec5SDimitry Andric                               *LoadInst->getParent()->getParent());
5590b57cec5SDimitry Andric   return TRI->getRegSizeInBits(*TRC) / 8;
5600b57cec5SDimitry Andric }
5610b57cec5SDimitry Andric 
breakBlockedCopies(MachineInstr * LoadInst,MachineInstr * StoreInst,const DisplacementSizeMap & BlockingStoresDispSizeMap)5620b57cec5SDimitry Andric void X86AvoidSFBPass::breakBlockedCopies(
5630b57cec5SDimitry Andric     MachineInstr *LoadInst, MachineInstr *StoreInst,
5640b57cec5SDimitry Andric     const DisplacementSizeMap &BlockingStoresDispSizeMap) {
5650b57cec5SDimitry Andric   int64_t LdDispImm = getDispOperand(LoadInst).getImm();
5660b57cec5SDimitry Andric   int64_t StDispImm = getDispOperand(StoreInst).getImm();
5670b57cec5SDimitry Andric   int64_t LMMOffset = 0;
5680b57cec5SDimitry Andric   int64_t SMMOffset = 0;
5690b57cec5SDimitry Andric 
5700b57cec5SDimitry Andric   int64_t LdDisp1 = LdDispImm;
5710b57cec5SDimitry Andric   int64_t LdDisp2 = 0;
5720b57cec5SDimitry Andric   int64_t StDisp1 = StDispImm;
5730b57cec5SDimitry Andric   int64_t StDisp2 = 0;
5740b57cec5SDimitry Andric   unsigned Size1 = 0;
5750b57cec5SDimitry Andric   unsigned Size2 = 0;
5760b57cec5SDimitry Andric   int64_t LdStDelta = StDispImm - LdDispImm;
5770b57cec5SDimitry Andric 
5780b57cec5SDimitry Andric   for (auto DispSizePair : BlockingStoresDispSizeMap) {
5790b57cec5SDimitry Andric     LdDisp2 = DispSizePair.first;
5800b57cec5SDimitry Andric     StDisp2 = DispSizePair.first + LdStDelta;
5810b57cec5SDimitry Andric     Size2 = DispSizePair.second;
5820b57cec5SDimitry Andric     // Avoid copying overlapping areas.
5830b57cec5SDimitry Andric     if (LdDisp2 < LdDisp1) {
5840b57cec5SDimitry Andric       int OverlapDelta = LdDisp1 - LdDisp2;
5850b57cec5SDimitry Andric       LdDisp2 += OverlapDelta;
5860b57cec5SDimitry Andric       StDisp2 += OverlapDelta;
5870b57cec5SDimitry Andric       Size2 -= OverlapDelta;
5880b57cec5SDimitry Andric     }
5890b57cec5SDimitry Andric     Size1 = LdDisp2 - LdDisp1;
5900b57cec5SDimitry Andric 
5910b57cec5SDimitry Andric     // Build a copy for the point until the current blocking store's
5920b57cec5SDimitry Andric     // displacement.
5930b57cec5SDimitry Andric     buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
5940b57cec5SDimitry Andric                 SMMOffset);
5950b57cec5SDimitry Andric     // Build a copy for the current blocking store.
5960b57cec5SDimitry Andric     buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
5970b57cec5SDimitry Andric                 SMMOffset + Size1);
5980b57cec5SDimitry Andric     LdDisp1 = LdDisp2 + Size2;
5990b57cec5SDimitry Andric     StDisp1 = StDisp2 + Size2;
6000b57cec5SDimitry Andric     LMMOffset += Size1 + Size2;
6010b57cec5SDimitry Andric     SMMOffset += Size1 + Size2;
6020b57cec5SDimitry Andric   }
6030b57cec5SDimitry Andric   unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
6040b57cec5SDimitry Andric   buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
6050b57cec5SDimitry Andric               LMMOffset);
6060b57cec5SDimitry Andric }
6070b57cec5SDimitry Andric 
hasSameBaseOpValue(MachineInstr * LoadInst,MachineInstr * StoreInst)6080b57cec5SDimitry Andric static bool hasSameBaseOpValue(MachineInstr *LoadInst,
6090b57cec5SDimitry Andric                                MachineInstr *StoreInst) {
6105ffd83dbSDimitry Andric   const MachineOperand &LoadBase = getBaseOperand(LoadInst);
6115ffd83dbSDimitry Andric   const MachineOperand &StoreBase = getBaseOperand(StoreInst);
6120b57cec5SDimitry Andric   if (LoadBase.isReg() != StoreBase.isReg())
6130b57cec5SDimitry Andric     return false;
6140b57cec5SDimitry Andric   if (LoadBase.isReg())
6150b57cec5SDimitry Andric     return LoadBase.getReg() == StoreBase.getReg();
6160b57cec5SDimitry Andric   return LoadBase.getIndex() == StoreBase.getIndex();
6170b57cec5SDimitry Andric }
6180b57cec5SDimitry Andric 
// A store blocks forwarding to the load when it writes entirely inside the
// load's footprint: it starts no earlier than the load, and its last byte
// falls within the loaded range (so its latest valid start displacement is
// LoadDispImm + LoadSize - StoreSize).
static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  if (StoreDispImm < LoadDispImm)
    return false;
  return StoreDispImm <= LoadDispImm + (LoadSize - StoreSize);
}
6240b57cec5SDimitry Andric 
6250b57cec5SDimitry Andric // Keep track of all stores blocking a load
6260b57cec5SDimitry Andric static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap & BlockingStoresDispSizeMap,int64_t DispImm,unsigned Size)6270b57cec5SDimitry Andric updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
6280b57cec5SDimitry Andric                                 int64_t DispImm, unsigned Size) {
6290b57cec5SDimitry Andric   if (BlockingStoresDispSizeMap.count(DispImm)) {
6300b57cec5SDimitry Andric     // Choose the smallest blocking store starting at this displacement.
6310b57cec5SDimitry Andric     if (BlockingStoresDispSizeMap[DispImm] > Size)
6320b57cec5SDimitry Andric       BlockingStoresDispSizeMap[DispImm] = Size;
6330b57cec5SDimitry Andric 
6340b57cec5SDimitry Andric   } else
6350b57cec5SDimitry Andric     BlockingStoresDispSizeMap[DispImm] = Size;
6360b57cec5SDimitry Andric }
6370b57cec5SDimitry Andric 
6380b57cec5SDimitry Andric // Remove blocking stores contained in each other.
6390b57cec5SDimitry Andric static void
removeRedundantBlockingStores(DisplacementSizeMap & BlockingStoresDispSizeMap)6400b57cec5SDimitry Andric removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
6410b57cec5SDimitry Andric   if (BlockingStoresDispSizeMap.size() <= 1)
6420b57cec5SDimitry Andric     return;
6430b57cec5SDimitry Andric 
6440b57cec5SDimitry Andric   SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
6450b57cec5SDimitry Andric   for (auto DispSizePair : BlockingStoresDispSizeMap) {
6460b57cec5SDimitry Andric     int64_t CurrDisp = DispSizePair.first;
6470b57cec5SDimitry Andric     unsigned CurrSize = DispSizePair.second;
6480b57cec5SDimitry Andric     while (DispSizeStack.size()) {
6490b57cec5SDimitry Andric       int64_t PrevDisp = DispSizeStack.back().first;
6500b57cec5SDimitry Andric       unsigned PrevSize = DispSizeStack.back().second;
6510b57cec5SDimitry Andric       if (CurrDisp + CurrSize > PrevDisp + PrevSize)
6520b57cec5SDimitry Andric         break;
6530b57cec5SDimitry Andric       DispSizeStack.pop_back();
6540b57cec5SDimitry Andric     }
6550b57cec5SDimitry Andric     DispSizeStack.push_back(DispSizePair);
6560b57cec5SDimitry Andric   }
6570b57cec5SDimitry Andric   BlockingStoresDispSizeMap.clear();
6580b57cec5SDimitry Andric   for (auto Disp : DispSizeStack)
6590b57cec5SDimitry Andric     BlockingStoresDispSizeMap.insert(Disp);
6600b57cec5SDimitry Andric }
6610b57cec5SDimitry Andric 
runOnMachineFunction(MachineFunction & MF)6620b57cec5SDimitry Andric bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
6630b57cec5SDimitry Andric   bool Changed = false;
6640b57cec5SDimitry Andric 
6650b57cec5SDimitry Andric   if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
6660b57cec5SDimitry Andric       !MF.getSubtarget<X86Subtarget>().is64Bit())
6670b57cec5SDimitry Andric     return false;
6680b57cec5SDimitry Andric 
6690b57cec5SDimitry Andric   MRI = &MF.getRegInfo();
6700b57cec5SDimitry Andric   assert(MRI->isSSA() && "Expected MIR to be in SSA form");
6710b57cec5SDimitry Andric   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
6720b57cec5SDimitry Andric   TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
6730b57cec5SDimitry Andric   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
6740b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
6750b57cec5SDimitry Andric   // Look for a load then a store to XMM/YMM which look like a memcpy
6760b57cec5SDimitry Andric   findPotentiallylBlockedCopies(MF);
6770b57cec5SDimitry Andric 
6780b57cec5SDimitry Andric   for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
6790b57cec5SDimitry Andric     MachineInstr *LoadInst = LoadStoreInstPair.first;
6800b57cec5SDimitry Andric     int64_t LdDispImm = getDispOperand(LoadInst).getImm();
6810b57cec5SDimitry Andric     DisplacementSizeMap BlockingStoresDispSizeMap;
6820b57cec5SDimitry Andric 
6830b57cec5SDimitry Andric     SmallVector<MachineInstr *, 2> PotentialBlockers =
6840b57cec5SDimitry Andric         findPotentialBlockers(LoadInst);
6855ffd83dbSDimitry Andric     for (auto *PBInst : PotentialBlockers) {
6860b57cec5SDimitry Andric       if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
6870b57cec5SDimitry Andric                                         LoadInst->getOpcode()) ||
6885ffd83dbSDimitry Andric           !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
6890b57cec5SDimitry Andric         continue;
6900b57cec5SDimitry Andric       int64_t PBstDispImm = getDispOperand(PBInst).getImm();
6910b57cec5SDimitry Andric       unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
6920b57cec5SDimitry Andric       // This check doesn't cover all cases, but it will suffice for now.
6930b57cec5SDimitry Andric       // TODO: take branch probability into consideration, if the blocking
6940b57cec5SDimitry Andric       // store is in an unreached block, breaking the memcopy could lose
6950b57cec5SDimitry Andric       // performance.
6960b57cec5SDimitry Andric       if (hasSameBaseOpValue(LoadInst, PBInst) &&
6970b57cec5SDimitry Andric           isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
6980b57cec5SDimitry Andric                           PBstSize))
6990b57cec5SDimitry Andric         updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
7000b57cec5SDimitry Andric                                         PBstSize);
7010b57cec5SDimitry Andric     }
7020b57cec5SDimitry Andric 
7030b57cec5SDimitry Andric     if (BlockingStoresDispSizeMap.empty())
7040b57cec5SDimitry Andric       continue;
7050b57cec5SDimitry Andric 
7060b57cec5SDimitry Andric     // We found a store forward block, break the memcpy's load and store
7070b57cec5SDimitry Andric     // into smaller copies such that each smaller store that was causing
7080b57cec5SDimitry Andric     // a store block would now be copied separately.
7090b57cec5SDimitry Andric     MachineInstr *StoreInst = LoadStoreInstPair.second;
7100b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
7110b57cec5SDimitry Andric     LLVM_DEBUG(LoadInst->dump());
7120b57cec5SDimitry Andric     LLVM_DEBUG(StoreInst->dump());
7130b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "Replaced with:\n");
7140b57cec5SDimitry Andric     removeRedundantBlockingStores(BlockingStoresDispSizeMap);
7150b57cec5SDimitry Andric     breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
7160b57cec5SDimitry Andric     updateKillStatus(LoadInst, StoreInst);
7170b57cec5SDimitry Andric     ForRemoval.push_back(LoadInst);
7180b57cec5SDimitry Andric     ForRemoval.push_back(StoreInst);
7190b57cec5SDimitry Andric   }
7205ffd83dbSDimitry Andric   for (auto *RemovedInst : ForRemoval) {
7210b57cec5SDimitry Andric     RemovedInst->eraseFromParent();
7220b57cec5SDimitry Andric   }
7230b57cec5SDimitry Andric   ForRemoval.clear();
7240b57cec5SDimitry Andric   BlockedLoadsStoresPairs.clear();
7250b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);
7260b57cec5SDimitry Andric 
7270b57cec5SDimitry Andric   return Changed;
7280b57cec5SDimitry Andric }
729