//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// If a load follows a store and reloads data that the store has written to
// memory, Intel microarchitectures can in many cases forward the data directly
// from the store to the load. This "store forwarding" saves cycles by enabling
// the load to obtain the data directly instead of accessing it through the
// cache or memory.
// A "store forward block" occurs when a store cannot be forwarded to the
// load. The most typical case on the Intel Core microarchitecture is a small
// store that cannot be forwarded to a larger load.
// The estimated penalty for a store forward block is ~13 cycles.
//
// This pass tries to recognize and handle cases where a "store forward block"
// is created by the compiler when lowering memcpy calls to a sequence
// of a load and a store.
//
// The pass currently only handles cases where the memcpy is lowered to
// XMM/YMM registers; it tries to break the memcpy into smaller copies.
// Breaking the memcpy should be possible since there is no atomicity
// guarantee for loads and stores to XMM/YMM.
//
// It could be better for performance to solve the problem by loading
// to XMM/YMM, inserting the partial store, and only then storing back from
// XMM/YMM to memory, but that would result in a more conservative optimization
// since it requires proving that all memory accesses between the blocking
// store and the load either must or must not alias before the store can be
// moved, whereas the transformation done here is correct regardless of the
// other memory accesses.
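//
// As a rough illustration (the registers and offsets below are made up, not
// taken from a real compilation), a blocked sequence such as
//
//   movl    %ecx, 4(%rdi)        # 4-byte store into the copied range
//   vmovups (%rdi), %xmm0        # 16-byte load: the store cannot be forwarded
//   vmovups %xmm0, (%rsi)
//
// is rewritten into smaller copies whose boundaries line up with the blocking
// store, e.g.:
//
//   movl     (%rdi), %eax        # bytes 0-3
//   movl    %eax,  (%rsi)
//   movl    4(%rdi), %eax        # bytes 4-7, matches the blocking store
//   movl    %eax, 4(%rsi)
//   movq    8(%rdi), %rax        # bytes 8-15
//   movq    %rax, 8(%rsi)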
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include <map>

using namespace llvm;

#define DEBUG_TYPE "x86-avoid-SFB"

static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
    "x86-disable-avoid-SFB", cl::Hidden,
    cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));

static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);

namespace {

using DisplacementSizeMap = std::map<int64_t, unsigned>;

class X86AvoidSFBPass : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AAResultsWrapperPass>();
  }

private:
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const X86RegisterInfo *TRI = nullptr;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  SmallVector<MachineInstr *, 2> ForRemoval;
  AliasAnalysis *AA = nullptr;

  /// Find pairs of loads and stores to memory which together look
  /// like a memcpy and record them in BlockedLoadsStoresPairs.
  void findPotentiallylBlockedCopies(MachineFunction &MF);
  /// Break the memcpy's load and store into smaller copies
  /// such that each region that was blocked by a smaller store
  /// is now copied separately.
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  /// Break a copy of size Size into smaller copies.
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);

  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
                 MachineInstr *StoreInst, unsigned NStoreOpcode,
                 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
                 int64_t SMMOffset);

  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;

  unsigned getRegSizeInBytes(MachineInstr *Inst);
};

} // end anonymous namespace

char X86AvoidSFBPass::ID = 0;

INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
                      "X86 Avoid Store Forwarding Blocks", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
                    "X86 Avoid Store Forwarding Blocks", false, false)

FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}

static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}

static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}

static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}

static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}

static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;

static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}

static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Instruction Opcode");
  }
  return 0;
}

static int getAddrOffset(const MachineInstr *MI) {
  const MCInstrDesc &Descl = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Descl);
  return AddrOffset;
}

static MachineOperand &getBaseOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrBaseReg);
}

static MachineOperand &getDispOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrDisp);
}

// Relevant addressing modes contain only a base register and an immediate
// displacement, or a frame index and an immediate displacement.
// TODO: Consider expanding to other addressing modes in the future
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  const MachineOperand &Base = getBaseOperand(MI);
  const MachineOperand &Disp = getDispOperand(MI);
  const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  const MachineOperand &Segment =
      MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}

// Collect potentially blocking stores.
// Limit the number of instructions we inspect backwards, since the effect
// of a store forwarding block won't be visible if the store and the load
// have enough instructions in between to keep the core busy.
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If we didn't reach the instruction limit, also inspect the predecessor
  // blocks. Ideally we should traverse the predecessor blocks in depth with
  // some coloring algorithm, but for now let's just look at the first-order
  // predecessors.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
                                          PE = MBB->pred_end();
         PB != PE; ++PB) {
      MachineBasicBlock *PMBB = *PB;
      int PredCount = 0;
      for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
                                               PME = PMBB->rend();
           PBInst != PME; ++PBInst) {
        if (PBInst->isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst->getDesc().isCall())
          break;
        PotentialBlockers.push_back(&*PBInst);
      }
    }
  }
  return PotentialBlockers;
}

void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();

  Register Reg1 = MRI->createVirtualRegister(
      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  LLVM_DEBUG(NewLoad->dump());
  // If the load and store are consecutive, use the loadInst location to
  // reduce register pressure.
  MachineInstr *StInst = StoreInst;
  auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                                MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
  NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
  LLVM_DEBUG(NewStore->dump());
}

void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
  while (Size > 0) {
    if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
      Size = Size - MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
      LdDisp += MOV128SZ;
      StDisp += MOV128SZ;
      LMMOffset += MOV128SZ;
      SMMOffset += MOV128SZ;
      continue;
    }
    if (Size - MOV64SZ >= 0) {
      Size = Size - MOV64SZ;
      buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
                MOV64SZ, LMMOffset, SMMOffset);
      LdDisp += MOV64SZ;
      StDisp += MOV64SZ;
      LMMOffset += MOV64SZ;
      SMMOffset += MOV64SZ;
      continue;
    }
    if (Size - MOV32SZ >= 0) {
      Size = Size - MOV32SZ;
      buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
                MOV32SZ, LMMOffset, SMMOffset);
      LdDisp += MOV32SZ;
      StDisp += MOV32SZ;
      LMMOffset += MOV32SZ;
      SMMOffset += MOV32SZ;
      continue;
    }
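    // Any remaining tail is copied with progressively narrower scalar moves
    // (16-bit, then 8-bit) until Size reaches zero.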
    if (Size - MOV16SZ >= 0) {
      Size = Size - MOV16SZ;
      buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
                MOV16SZ, LMMOffset, SMMOffset);
      LdDisp += MOV16SZ;
      StDisp += MOV16SZ;
      LMMOffset += MOV16SZ;
      SMMOffset += MOV16SZ;
      continue;
    }
    if (Size - MOV8SZ >= 0) {
      Size = Size - MOV8SZ;
      buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
                MOV8SZ, LMMOffset, SMMOffset);
      LdDisp += MOV8SZ;
      StDisp += MOV8SZ;
      LMMOffset += MOV8SZ;
      SMMOffset += MOV8SZ;
      continue;
    }
  }
  assert(Size == 0 && "Wrong size division");
}

static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto *StorePrevNonDbgInstr =
      prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                 LoadInst->getParent()->instr_begin())
          .getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store to xmm/ymm were consecutive
    // then the partial copies were also created in
    // a consecutive order to reduce register pressure,
    // and the location of the last load is before the last store.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}

bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
                            const MachineMemOperand &Op2) const {
  if (!Op1.getValue() || !Op2.getValue())
    return true;

  int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
  int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
  int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;

  AliasResult AAResult =
      AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
                MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
  return AAResult != NoAlias;
}

void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      int DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
           UI != UE;) {
        MachineOperand &StoreMO = *UI++;
        MachineInstr &StoreMI = *StoreMO.getParent();
        // Skip cases where the memcpy may overlap.
        if (StoreMI.getParent() == MI.getParent() &&
            isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
            isRelevantAddressingMode(&MI) &&
            isRelevantAddressingMode(&StoreMI) &&
            MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
          if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
            BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
        }
      }
    }
}

unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
  const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
                                     *LoadInst->getParent()->getParent());
  return TRI->getRegSizeInBits(*TRC) / 8;
}

void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;

  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Build a copy of the region up to the current blocking store's
    // displacement.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    // Build a copy for the current blocking store.
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
                SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              SMMOffset);
}

static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  const MachineOperand &LoadBase = getBaseOperand(LoadInst);
  const MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}

static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}

// Keep track of all stores blocking a load.
static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
                                int64_t DispImm, unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Choose the smallest blocking store starting at this displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;
  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}

// Remove blocking stores contained in each other.
static void
removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;

  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}

bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
      !MF.getSubtarget<X86Subtarget>().is64Bit())
    return false;

  MRI = &MF.getRegInfo();
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
  // Look for a load followed by a store to XMM/YMM that looks like a memcpy.
  findPotentiallylBlockedCopies(MF);

  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;

    SmallVector<MachineInstr *, 2> PotentialBlockers =
        findPotentialBlockers(LoadInst);
    for (auto *PBInst : PotentialBlockers) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
      // This check doesn't cover all cases, but it will suffice for now.
      // TODO: Take branch probability into consideration; if the blocking
      // store is in a block that is never reached, breaking the memcpy could
      // lose performance.
      if (hasSameBaseOpValue(LoadInst, PBInst) &&
          isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }

    if (BlockingStoresDispSizeMap.empty())
      continue;

    // We found a store forward block; break the memcpy's load and store
    // into smaller copies such that each smaller store that was causing
    // the block is now copied separately.
    MachineInstr *StoreInst = LoadStoreInstPair.second;
    LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
    LLVM_DEBUG(LoadInst->dump());
    LLVM_DEBUG(StoreInst->dump());
    LLVM_DEBUG(dbgs() << "Replaced with:\n");
    removeRedundantBlockingStores(BlockingStoresDispSizeMap);
    breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
    updateKillStatus(LoadInst, StoreInst);
    ForRemoval.push_back(LoadInst);
    ForRemoval.push_back(StoreInst);
    Changed = true;
  }
  for (auto *RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  }
  ForRemoval.clear();
  BlockedLoadsStoresPairs.clear();
  LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);

  return Changed;
}