//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the X86SelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//

#include "X86SelectionDAGInfo.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "x86-selectiondag-info"

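// Hidden cl::opt flags like the one below can be set from the clang driver
// via -mllvm -x86-use-fsrm-for-memcpy, or passed directly to llc.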
static cl::opt<bool>
    UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                     cl::desc("Use fast short rep mov in memcpy lowering"));

bool X86SelectionDAGInfo::isBaseRegConflictPossible(
    SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
  // We cannot use TRI->hasBasePointer() until *after* we select all basic
  // blocks. Legalization may introduce new stack temporaries with large
  // alignment requirements. Fall back to generic code if there are any
  // dynamic stack adjustments (hopefully rare) and the base pointer would
  // conflict if we had to use it.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
    return false;

  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
      DAG.getSubtarget().getRegisterInfo());
  Register BaseReg = TRI->getBaseRegister();
  for (unsigned R : ClobberSet)
    if (BaseReg == R)
      return true;
  return false;
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo) const {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();

#ifndef NDEBUG
  // If the base register might conflict with our physical registers, bail out.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
                                  X86::ECX, X86::EAX, X86::EDI};
  assert(!isBaseRegConflictPossible(DAG, ClobberSet));
#endif

  // If the destination is a segment-relative address space (address spaces
  // 256 and up map to x86 segment registers such as GS/FS), use the default
  // lowering.
  if (DstPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If not DWORD aligned or if the size is more than the threshold, call the
  // library. The libc version is likely to be faster for these cases: it can
  // use the address value and run-time information about the CPU.
  if (Alignment < Align(4) || !ConstantSize ||
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
    // Check to see if there is a specialized entry-point for memory zeroing.
    ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);

    if (const char *bzeroName =
            (ValC && ValC->isNullValue())
                ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
                : nullptr) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
      Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Dst;
      Entry.Ty = IntPtrTy;
      Args.push_back(Entry);
      Entry.Node = Size;
      Args.push_back(Entry);

      TargetLowering::CallLoweringInfo CLI(DAG);
      CLI.setDebugLoc(dl)
          .setChain(Chain)
          .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
                        DAG.getExternalSymbol(bzeroName, IntPtr),
                        std::move(Args))
          .setDiscardResult();

      std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
      return CallResult.second;
    }

    // Otherwise have the target-independent code call memset.
    return SDValue();
  }

  uint64_t SizeVal = ConstantSize->getZExtValue();
  SDValue InFlag;
  EVT AVT;
  SDValue Count;
  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
  unsigned BytesLeft = 0;
  if (ValC) {
    unsigned ValReg;
    uint64_t Val = ValC->getZExtValue() & 255;

    // If the value is a constant, then we can potentially use larger sets.
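    // The byte is splatted across the element: e.g. Val = 0xAB becomes 0xABAB
    // as i16, 0xABABABAB as i32, and 0xABABABABABABABAB as i64.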
    if (Alignment > Align(2)) {
      // DWORD aligned
      AVT = MVT::i32;
      ValReg = X86::EAX;
      Val = (Val << 8) | Val;
      Val = (Val << 16) | Val;
      if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
      }
    } else if (Alignment == Align(2)) {
      // WORD aligned
      AVT = MVT::i16;
      ValReg = X86::AX;
      Val = (Val << 8) | Val;
    } else {
      // Byte aligned
      AVT = MVT::i8;
      ValReg = X86::AL;
      Count = DAG.getIntPtrConstant(SizeVal, dl);
    }

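    // With a wide element type, REP STOS handles SizeVal / UBytes full
    // elements, and the remainder of at most UBytes - 1 bytes is finished by
    // the trailing memset below; e.g. SizeVal = 21 with AVT = MVT::i32 gives
    // Count = 5 and BytesLeft = 1.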
    if (AVT.bitsGT(MVT::i8)) {
      unsigned UBytes = AVT.getSizeInBits() / 8;
      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
      BytesLeft = SizeVal % UBytes;
    }

    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
                             InFlag);
    InFlag = Chain.getValue(1);
  } else {
    AVT = MVT::i8;
    Count = DAG.getIntPtrConstant(SizeVal, dl);
    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InFlag);
    InFlag = Chain.getValue(1);
  }

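  // REP STOS implicitly takes its trip count in rCX and the destination
  // pointer in rDI; the value register (AL/AX/EAX/RAX) was loaded above.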
  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);

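  // The glue operand ties the CopyToReg nodes to REP_STOS so the scheduler
  // cannot place anything that clobbers rAX/rCX/rDI in between.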
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InFlag};
  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT AddrVT = Dst.getValueType();
    EVT SizeVT = Size.getValueType();

    Chain =
        DAG.getMemset(Chain, dl,
                      DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                  DAG.getConstant(Offset, dl, AddrVT)),
                      Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
                      isVolatile, false, DstPtrInfo.getWithOffset(Offset));
  }

  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
  return Chain;
}

/// Emit a single REP MOVS{B,W,D,Q} instruction.
static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl, SDValue Chain, SDValue Dst,
                           SDValue Src, SDValue Size, MVT AVT) {
  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
  const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;

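  // REP MOVS implicitly reads its trip count from rCX, the destination
  // pointer from rDI, and the source pointer from rSI.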
  SDValue InFlag;
  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InFlag};
  return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
}

/// Emit a single REP MOVSB instruction for a particular constant size.
static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
                            SDValue Src, uint64_t Size) {
  return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

/// Returns the best type to use with repmovs depending on alignment.
static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
                                 uint64_t Align) {
  assert((Align != 0) && "Align is normalized");
  assert(isPowerOf2_64(Align) && "Align is a power of 2");
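  // e.g. Align == 8 falls through to the default case: i64 (REP MOVSQ) on
  // 64-bit targets, i32 (REP MOVSD) on 32-bit ones.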
  switch (Align) {
  case 1:
    return MVT::i8;
  case 2:
    return MVT::i16;
  case 4:
    return MVT::i32;
  default:
    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
  }
}

/// Returns a REP MOVS instruction, possibly with a few load/stores to
/// implement a constant size memory copy. In some cases where we know REP
/// MOVS is inefficient we return an empty SDValue so the calling code can
/// either generate a load/store sequence or call the runtime memcpy function.
static SDValue emitConstantSizeRepmov(
    SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
    SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
    unsigned Align, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {

  // TODO: Revisit the next check: big copies with ERMSB on march >= haswell
  // are very efficient.
  if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  // If we have enhanced REP MOVSB (ERMSB), use it: byte-wise REP MOVSB is
  // fast on such CPUs.
  if (Subtarget.hasERMSB())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
  // We assume the runtime memcpy will do a better job for unaligned copies
  // when ERMSB is not present.
  if (!AlwaysInline && (Align & 3) != 0)
    return SDValue();

  const MVT BlockType = getOptimalRepmovsType(Subtarget, Align);
  const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
  const uint64_t BlockCount = Size / BlockBytes;
  const uint64_t BytesLeft = Size % BlockBytes;
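  // e.g. a 30-byte copy at Align == 4 uses BlockType i32: REP MOVSD with
  // BlockCount == 7 copies 28 bytes and leaves BytesLeft == 2.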
  SDValue RepMovs =
      emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);

  // REP MOVS can process the whole length; we are done.
  if (BytesLeft == 0)
    return RepMovs;

  assert(BytesLeft && "We have leftover at this point");

  // When optimizing for size, use REP MOVSB even though it is less efficient,
  // to save the loads/stores needed for the leftover bytes.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  // Handle the last 1 - 7 bytes.
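  // e.g. for the 30-byte copy above, this emits an inline 2-byte memcpy at
  // offset 28 and joins it with the REP MOVS chain via a TokenFactor.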
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  unsigned Offset = Size - BytesLeft;
  EVT DstVT = Dst.getValueType();
  EVT SrcVT = Src.getValueType();
  Results.push_back(DAG.getMemcpy(
      Chain, dl,
      DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
      DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
      DAG.getConstant(BytesLeft, dl, SizeVT), llvm::Align(Align), isVolatile,
      /*AlwaysInline*/ true, /*isTailCall*/ false,
      DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  // If either pointer is in a segment-relative address space (256 and up),
  // use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base registers conflict with our physical registers, use the
  // default lowering.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
                                  X86::ECX, X86::ESI, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();

  // If enabled and available, use fast short rep mov.
  if (UseFSRMForMemcpy && Subtarget.hasFSRM())
    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);

  // Handle constant sizes.
  if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
    return emitConstantSizeRepmov(
        DAG, Subtarget, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
        Size.getValueType(), Alignment.value(), isVolatile, AlwaysInline,
        DstPtrInfo, SrcPtrInfo);

  return SDValue();
}