//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the X86SelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//

#include "X86SelectionDAGInfo.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "x86-selectiondag-info"

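// Hidden flag, off by default. When set and the subtarget reports FSRM
// ("fast short REP MOV"), memcpy is lowered to a single REP MOVSB of any
// size; see EmitTargetCodeForMemcpy below.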
static cl::opt<bool>
    UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                     cl::desc("Use fast short rep mov in memcpy lowering"));

bool X86SelectionDAGInfo::isBaseRegConflictPossible(
    SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
  // We cannot use TRI->hasBasePointer() until *after* we select all basic
  // blocks.  Legalization may introduce new stack temporaries with large
  // alignment requirements.  Fall back to generic code if there are any
  // dynamic stack adjustments (hopefully rare) and the base pointer would
  // conflict if we had to use it.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
    return false;

  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
      DAG.getSubtarget().getRegisterInfo());
  Register BaseReg = TRI->getBaseRegister();
  for (unsigned R : ClobberSet)
    if (BaseReg == R)
      return true;
  return false;
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo) const {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();

#ifndef NDEBUG
  // If the base register might conflict with our physical registers, bail out.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
                                  X86::ECX, X86::EAX, X86::EDI};
  assert(!isBaseRegConflictPossible(DAG, ClobberSet));
#endif

  // If the destination is in a segment-relative address space (>= 256, i.e.
  // GS/FS/SS on x86), use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
  if (Alignment < Align(4) || !ConstantSize ||
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
    // Check to see if there is a specialized entry-point for memory zeroing.
    ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);

    if (const char *bzeroName = (ValC && ValC->isNullValue())
        ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
        : nullptr) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
      Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Dst;
      Entry.Ty = IntPtrTy;
      Args.push_back(Entry);
      Entry.Node = Size;
      Args.push_back(Entry);

      TargetLowering::CallLoweringInfo CLI(DAG);
      CLI.setDebugLoc(dl)
          .setChain(Chain)
          .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
                        DAG.getExternalSymbol(bzeroName, IntPtr),
                        std::move(Args))
          .setDiscardResult();

      std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
      return CallResult.second;
    }

    // Otherwise have the target-independent code call memset.
    return SDValue();
  }

  uint64_t SizeVal = ConstantSize->getZExtValue();
  SDValue InFlag;
  EVT AVT;
  SDValue Count;
  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
  unsigned BytesLeft = 0;
  if (ValC) {
    unsigned ValReg;
    uint64_t Val = ValC->getZExtValue() & 255;
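    // Val now holds the single byte to store (e.g. 0xAB); below it is
    // splatted across the chosen element width: 0xABAB for i16,
    // 0xABABABAB for i32, and so on.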

    // If the value is a constant, then we can potentially use larger element
    // sizes for REP STOS.
    if (Alignment > Align(2)) {
      // DWORD aligned
      AVT = MVT::i32;
      ValReg = X86::EAX;
      Val = (Val << 8)  | Val;
      Val = (Val << 16) | Val;
      if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
      }
    } else if (Alignment == Align(2)) {
      // WORD aligned
      AVT = MVT::i16;
      ValReg = X86::AX;
      Val = (Val << 8) | Val;
    } else {
      // Byte aligned
      AVT = MVT::i8;
      ValReg = X86::AL;
      Count = DAG.getIntPtrConstant(SizeVal, dl);
    }

    if (AVT.bitsGT(MVT::i8)) {
      unsigned UBytes = AVT.getSizeInBits() / 8;
      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
      BytesLeft = SizeVal % UBytes;
    }
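    // For example, SizeVal = 30 with AVT = i32 gives Count = 7 and
    // BytesLeft = 2; the leftover bytes are written by the trailing
    // getMemset call below.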

    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
                             InFlag);
    InFlag = Chain.getValue(1);
  } else {
    AVT = MVT::i8;
    Count  = DAG.getIntPtrConstant(SizeVal, dl);
    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InFlag);
    InFlag = Chain.getValue(1);
  }

  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);

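  // Emit the pseudo that expands to REP STOS{B,W,D,Q}. The glued CopyToRegs
  // above pin the value into AL/AX/EAX/RAX, the element count into RCX/ECX,
  // and the destination into RDI/EDI; AVT selects the element width.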
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT AddrVT = Dst.getValueType();
    EVT SizeVT = Size.getValueType();

    Chain =
        DAG.getMemset(Chain, dl,
                      DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                  DAG.getConstant(Offset, dl, AddrVT)),
                      Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
                      isVolatile, false, DstPtrInfo.getWithOffset(Offset));
  }

  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
  return Chain;
}

/// Emit a single REP MOVS{B,W,D,Q} instruction.
static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl, SDValue Chain, SDValue Dst,
                           SDValue Src, SDValue Size, MVT AVT) {
  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
  const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;

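  // REP MOVS implicitly reads RCX/ECX (count), RDI/EDI (destination), and
  // RSI/ESI (source); the glue chain below keeps the three CopyToRegs and
  // the REP_MOVS node together so nothing clobbers them in between.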
  SDValue InFlag;
  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InFlag};
  return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
}

/// Emit a single REP MOVSB instruction for a particular constant size.
static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
                            SDValue Src, uint64_t Size) {
  return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

/// Returns the best type to use with repmovs depending on alignment.
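/// For example, Align = 8 on a 64-bit target selects MVT::i64 (REP MOVSQ),
/// while the same alignment on a 32-bit target falls back to MVT::i32.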
static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
                                 uint64_t Align) {
  assert((Align != 0) && "Align is normalized");
  assert(isPowerOf2_64(Align) && "Align is a power of 2");
  switch (Align) {
  case 1:
    return MVT::i8;
  case 2:
    return MVT::i16;
  case 4:
    return MVT::i32;
  default:
    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
  }
}

/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
/// a constant size memory copy. In some cases where we know REP MOVS is
/// inefficient we return an empty SDValue so the calling code can either
/// generate a load/store sequence or call the runtime memcpy function.
static SDValue emitConstantSizeRepmov(
    SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
    SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
    unsigned Align, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {

  // TODO: Revisit the next check; big copies with ERMSB on march >= haswell
  // are very efficient.
  if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  // If we have enhanced repmovs (ERMSB), use it.
  if (Subtarget.hasERMSB())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
  // We assume the runtime memcpy will do a better job for unaligned copies
  // when ERMSB is not present.
  if (!AlwaysInline && (Align & 3) != 0)
    return SDValue();

  const MVT BlockType = getOptimalRepmovsType(Subtarget, Align);
  const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
  const uint64_t BlockCount = Size / BlockBytes;
  const uint64_t BytesLeft = Size % BlockBytes;
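  // For example, Size = 30 with Align = 4 gives BlockType = i32,
  // BlockCount = 7, and BytesLeft = 2.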
  SDValue RepMovs =
      emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);

  // REP MOVS can process the whole length.
  if (BytesLeft == 0)
    return RepMovs;

  assert(BytesLeft && "We have leftover at this point");

  // When optimizing for size, use REP MOVSB even though it's less efficient,
  // so we can save the loads/stores of the leftover.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  // Handle the last 1 - 7 bytes.
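  // BytesLeft < BlockBytes <= 8 here, and passing AlwaysInline below
  // guarantees the tail memcpy is expanded to loads/stores, not a call.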
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  unsigned Offset = Size - BytesLeft;
  EVT DstVT = Dst.getValueType();
  EVT SrcVT = Src.getValueType();
  Results.push_back(DAG.getMemcpy(
      Chain, dl,
      DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
      DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
      DAG.getConstant(BytesLeft, dl, SizeVT), llvm::Align(Align), isVolatile,
      /*AlwaysInline*/ true, /*isTailCall*/ false,
      DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  // If either pointer is in a segment-relative address space (>= 256, i.e.
  // GS/FS/SS on x86), use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base register might conflict with our physical registers, use the
  // default lowering.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
                                  X86::ECX, X86::ESI, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();

  // If enabled and available, use fast short rep mov.
  if (UseFSRMForMemcpy && Subtarget.hasFSRM())
    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);

  // Handle constant sizes.
  if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
    return emitConstantSizeRepmov(
        DAG, Subtarget, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
        Size.getValueType(), Alignment.value(), isVolatile, AlwaysInline,
        DstPtrInfo, SrcPtrInfo);

  return SDValue();
}