1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the ARMSelectionDAGInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "ARMTargetMachine.h"
14 #include "ARMTargetTransformInfo.h"
15 #include "llvm/CodeGen/SelectionDAG.h"
16 #include "llvm/IR/DerivedTypes.h"
17 #include "llvm/Support/CommandLine.h"
18 using namespace llvm;
19 
20 #define DEBUG_TYPE "arm-selectiondag-info"
21 
// Hidden command-line option "-arm-memtransfer-tploop" selecting whether
// memory transfers may be lowered to tail-predicated (WLSTP) loops. It
// defaults to TPLoop::ForceDisabled. Although the description strings below
// only mention memcpy, shouldGenerateInlineTPLoop() in this file consults the
// same option when lowering memset as well.
cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));
34 
35 // Emit, if possible, a specialized version of the given Libcall. Typically this
36 // means selecting the appropriately aligned version, but we also convert memset
37 // of 0 into memclr.
38 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
39     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
40     SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
41   const ARMSubtarget &Subtarget =
42       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
43   const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
44 
45   // Only use a specialized AEABI function if the default version of this
46   // Libcall is an AEABI function.
47   if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
48     return SDValue();
49 
50   // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
51   // able to translate memset to memclr and use the value to index the function
52   // name array.
53   enum {
54     AEABI_MEMCPY = 0,
55     AEABI_MEMMOVE,
56     AEABI_MEMSET,
57     AEABI_MEMCLR
58   } AEABILibcall;
59   switch (LC) {
60   case RTLIB::MEMCPY:
61     AEABILibcall = AEABI_MEMCPY;
62     break;
63   case RTLIB::MEMMOVE:
64     AEABILibcall = AEABI_MEMMOVE;
65     break;
66   case RTLIB::MEMSET:
67     AEABILibcall = AEABI_MEMSET;
68     if (isNullConstant(Src))
69       AEABILibcall = AEABI_MEMCLR;
70     break;
71   default:
72     return SDValue();
73   }
74 
75   // Choose the most-aligned libcall variant that we can
76   enum {
77     ALIGN1 = 0,
78     ALIGN4,
79     ALIGN8
80   } AlignVariant;
81   if ((Align & 7) == 0)
82     AlignVariant = ALIGN8;
83   else if ((Align & 3) == 0)
84     AlignVariant = ALIGN4;
85   else
86     AlignVariant = ALIGN1;
87 
88   TargetLowering::ArgListTy Args;
89   TargetLowering::ArgListEntry Entry;
90   Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
91   Entry.Node = Dst;
92   Args.push_back(Entry);
93   if (AEABILibcall == AEABI_MEMCLR) {
94     Entry.Node = Size;
95     Args.push_back(Entry);
96   } else if (AEABILibcall == AEABI_MEMSET) {
97     // Adjust parameters for memset, EABI uses format (ptr, size, value),
98     // GNU library uses (ptr, value, size)
99     // See RTABI section 4.3.4
100     Entry.Node = Size;
101     Args.push_back(Entry);
102 
103     // Extend or truncate the argument to be an i32 value for the call.
104     if (Src.getValueType().bitsGT(MVT::i32))
105       Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
106     else if (Src.getValueType().bitsLT(MVT::i32))
107       Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
108 
109     Entry.Node = Src;
110     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
111     Entry.IsSExt = false;
112     Args.push_back(Entry);
113   } else {
114     Entry.Node = Src;
115     Args.push_back(Entry);
116 
117     Entry.Node = Size;
118     Args.push_back(Entry);
119   }
120 
121   char const *FunctionNames[4][3] = {
122     { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
123     { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
124     { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
125     { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
126   };
127   TargetLowering::CallLoweringInfo CLI(DAG);
128   CLI.setDebugLoc(dl)
129       .setChain(Chain)
130       .setLibCallee(
131           TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
132           DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
133                                 TLI->getPointerTy(DAG.getDataLayout())),
134           std::move(Args))
135       .setDiscardResult();
136   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
137 
138   return CallResult.second;
139 }
140 
141 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
142                                        const SelectionDAG &DAG,
143                                        ConstantSDNode *ConstantSize,
144                                        Align Alignment, bool IsMemcpy) {
145   auto &F = DAG.getMachineFunction().getFunction();
146   if (!EnableMemtransferTPLoop)
147     return false;
148   if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
149     return true;
150   // Do not generate inline TP loop if optimizations is disabled,
151   // or if optimization for size (-Os or -Oz) is on.
152   if (F.hasOptNone() || F.hasOptSize())
153     return false;
154   // If cli option is unset, for memset always generate inline TP.
155   // For memcpy, check some conditions
156   if (!IsMemcpy)
157     return true;
158   if (!ConstantSize && Alignment >= Align(4))
159     return true;
160   if (ConstantSize &&
161       ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
162       ConstantSize->getZExtValue() <
163           Subtarget.getMaxMemcpyTPInlineSizeThreshold())
164     return true;
165   return false;
166 }
167 
168 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
169     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
170     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
171     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
172   const ARMSubtarget &Subtarget =
173       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
174   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
175 
176   if (Subtarget.hasMVEIntegerOps() &&
177       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
178     return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
179                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
180 
181   // Do repeated 4-byte loads and stores. To be improved.
182   // This requires 4-byte alignment.
183   if (Alignment < Align(4))
184     return SDValue();
185   // This requires the copy size to be a constant, preferably
186   // within a subtarget-specific limit.
187   if (!ConstantSize)
188     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
189                                   Alignment.value(), RTLIB::MEMCPY);
190   uint64_t SizeVal = ConstantSize->getZExtValue();
191   if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
192     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
193                                   Alignment.value(), RTLIB::MEMCPY);
194 
195   unsigned BytesLeft = SizeVal & 3;
196   unsigned NumMemOps = SizeVal >> 2;
197   unsigned EmittedNumMemOps = 0;
198   EVT VT = MVT::i32;
199   unsigned VTSize = 4;
200   unsigned i = 0;
201   // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
202   const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
203   SDValue TFOps[6];
204   SDValue Loads[6];
205   uint64_t SrcOff = 0, DstOff = 0;
206 
207   // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
208   // VLDM/VSTM and make this code emit it when appropriate. This would reduce
209   // pressure on the general purpose registers. However this seems harder to map
210   // onto the register allocator's view of the world.
211 
212   // The number of MEMCPY pseudo-instructions to emit. We use up to
213   // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
214   // later on. This is a lower bound on the number of MEMCPY operations we must
215   // emit.
216   unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
217 
218   // Code size optimisation: do not inline memcpy if expansion results in
219   // more instructions than the libary call.
220   if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
221     return SDValue();
222   }
223 
224   SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
225 
226   for (unsigned I = 0; I != NumMEMCPYs; ++I) {
227     // Evenly distribute registers among MEMCPY operations to reduce register
228     // pressure.
229     unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
230     unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
231 
232     Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
233                       DAG.getConstant(NumRegs, dl, MVT::i32));
234     Src = Dst.getValue(1);
235     Chain = Dst.getValue(2);
236 
237     DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
238     SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
239 
240     EmittedNumMemOps = NextEmittedNumMemOps;
241   }
242 
243   if (BytesLeft == 0)
244     return Chain;
245 
246   // Issue loads / stores for the trailing (1 - 3) bytes.
247   auto getRemainingValueType = [](unsigned BytesLeft) {
248     return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
249   };
250   auto getRemainingSize = [](unsigned BytesLeft) {
251     return (BytesLeft >= 2) ? 2 : 1;
252   };
253 
254   unsigned BytesLeftSave = BytesLeft;
255   i = 0;
256   while (BytesLeft) {
257     VT = getRemainingValueType(BytesLeft);
258     VTSize = getRemainingSize(BytesLeft);
259     Loads[i] = DAG.getLoad(VT, dl, Chain,
260                            DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
261                                        DAG.getConstant(SrcOff, dl, MVT::i32)),
262                            SrcPtrInfo.getWithOffset(SrcOff));
263     TFOps[i] = Loads[i].getValue(1);
264     ++i;
265     SrcOff += VTSize;
266     BytesLeft -= VTSize;
267   }
268   Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
269 
270   i = 0;
271   BytesLeft = BytesLeftSave;
272   while (BytesLeft) {
273     VT = getRemainingValueType(BytesLeft);
274     VTSize = getRemainingSize(BytesLeft);
275     TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
276                             DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
277                                         DAG.getConstant(DstOff, dl, MVT::i32)),
278                             DstPtrInfo.getWithOffset(DstOff));
279     ++i;
280     DstOff += VTSize;
281     BytesLeft -= VTSize;
282   }
283   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
284 }
285 
286 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
287     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
288     SDValue Size, Align Alignment, bool isVolatile,
289     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
290   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
291                                 Alignment.value(), RTLIB::MEMMOVE);
292 }
293 
294 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
295     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
296     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
297     MachinePointerInfo DstPtrInfo) const {
298 
299   const ARMSubtarget &Subtarget =
300       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
301 
302   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
303 
304   // Generate TP loop for llvm.memset
305   if (Subtarget.hasMVEIntegerOps() &&
306       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
307                                  false)) {
308     Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
309                                   DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
310     return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
311                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
312   }
313 
314   if (!AlwaysInline)
315     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
316                                   Alignment.value(), RTLIB::MEMSET);
317 
318   return SDValue();
319 }
320