1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the ARMSelectionDAGInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "ARMTargetMachine.h"
14 #include "ARMTargetTransformInfo.h"
15 #include "llvm/CodeGen/SelectionDAG.h"
16 #include "llvm/IR/DerivedTypes.h"
17 #include "llvm/Support/CommandLine.h"
18 using namespace llvm;
19 
20 #define DEBUG_TYPE "arm-selectiondag-info"
21 
22 cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
23     "arm-memtransfer-tploop", cl::Hidden,
24     cl::desc("Control conversion of memcpy to "
25              "Tail predicated loops (WLSTP)"),
26     cl::init(TPLoop::ForceDisabled),
27     cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
28                           "Don't convert memcpy to TP loop."),
29                clEnumValN(TPLoop::ForceEnabled, "force-enabled",
30                           "Always convert memcpy to TP loop."),
31                clEnumValN(TPLoop::Allow, "allow",
32                           "Allow (may be subject to certain conditions) "
33                           "conversion of memcpy to TP loop.")));
34 
35 // Emit, if possible, a specialized version of the given Libcall. Typically this
36 // means selecting the appropriately aligned version, but we also convert memset
37 // of 0 into memclr.
38 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
39     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
40     SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
41   const ARMSubtarget &Subtarget =
42       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
43   const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
44 
45   // Only use a specialized AEABI function if the default version of this
46   // Libcall is an AEABI function.
47   if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
48     return SDValue();
49 
50   // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
51   // able to translate memset to memclr and use the value to index the function
52   // name array.
53   enum {
54     AEABI_MEMCPY = 0,
55     AEABI_MEMMOVE,
56     AEABI_MEMSET,
57     AEABI_MEMCLR
58   } AEABILibcall;
59   switch (LC) {
60   case RTLIB::MEMCPY:
61     AEABILibcall = AEABI_MEMCPY;
62     break;
63   case RTLIB::MEMMOVE:
64     AEABILibcall = AEABI_MEMMOVE;
65     break;
66   case RTLIB::MEMSET:
67     AEABILibcall = AEABI_MEMSET;
68     if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
69       if (ConstantSrc->getZExtValue() == 0)
70         AEABILibcall = AEABI_MEMCLR;
71     break;
72   default:
73     return SDValue();
74   }
75 
76   // Choose the most-aligned libcall variant that we can
77   enum {
78     ALIGN1 = 0,
79     ALIGN4,
80     ALIGN8
81   } AlignVariant;
82   if ((Align & 7) == 0)
83     AlignVariant = ALIGN8;
84   else if ((Align & 3) == 0)
85     AlignVariant = ALIGN4;
86   else
87     AlignVariant = ALIGN1;
88 
89   TargetLowering::ArgListTy Args;
90   TargetLowering::ArgListEntry Entry;
91   Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
92   Entry.Node = Dst;
93   Args.push_back(Entry);
94   if (AEABILibcall == AEABI_MEMCLR) {
95     Entry.Node = Size;
96     Args.push_back(Entry);
97   } else if (AEABILibcall == AEABI_MEMSET) {
98     // Adjust parameters for memset, EABI uses format (ptr, size, value),
99     // GNU library uses (ptr, value, size)
100     // See RTABI section 4.3.4
101     Entry.Node = Size;
102     Args.push_back(Entry);
103 
104     // Extend or truncate the argument to be an i32 value for the call.
105     if (Src.getValueType().bitsGT(MVT::i32))
106       Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
107     else if (Src.getValueType().bitsLT(MVT::i32))
108       Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
109 
110     Entry.Node = Src;
111     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
112     Entry.IsSExt = false;
113     Args.push_back(Entry);
114   } else {
115     Entry.Node = Src;
116     Args.push_back(Entry);
117 
118     Entry.Node = Size;
119     Args.push_back(Entry);
120   }
121 
122   char const *FunctionNames[4][3] = {
123     { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
124     { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
125     { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
126     { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
127   };
128   TargetLowering::CallLoweringInfo CLI(DAG);
129   CLI.setDebugLoc(dl)
130       .setChain(Chain)
131       .setLibCallee(
132           TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
133           DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
134                                 TLI->getPointerTy(DAG.getDataLayout())),
135           std::move(Args))
136       .setDiscardResult();
137   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
138 
139   return CallResult.second;
140 }
141 
142 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
143                                        const SelectionDAG &DAG,
144                                        ConstantSDNode *ConstantSize,
145                                        Align Alignment, bool IsMemcpy) {
146   auto &F = DAG.getMachineFunction().getFunction();
147   if (!EnableMemtransferTPLoop)
148     return false;
149   if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
150     return true;
151   // Do not generate inline TP loop if optimizations is disabled,
152   // or if optimization for size (-Os or -Oz) is on.
153   if (F.hasOptNone() || F.hasOptSize())
154     return false;
155   // If cli option is unset, for memset always generate inline TP.
156   // For memcpy, check some conditions
157   if (!IsMemcpy)
158     return true;
159   if (!ConstantSize && Alignment >= Align(4))
160     return true;
161   if (ConstantSize &&
162       ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
163       ConstantSize->getZExtValue() <
164           Subtarget.getMaxMemcpyTPInlineSizeThreshold())
165     return true;
166   return false;
167 }
168 
169 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
170     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
171     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
172     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
173   const ARMSubtarget &Subtarget =
174       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
175   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
176 
177   if (Subtarget.hasMVEIntegerOps() &&
178       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
179     return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
180                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
181 
182   // Do repeated 4-byte loads and stores. To be improved.
183   // This requires 4-byte alignment.
184   if (Alignment < Align(4))
185     return SDValue();
186   // This requires the copy size to be a constant, preferably
187   // within a subtarget-specific limit.
188   if (!ConstantSize)
189     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
190                                   Alignment.value(), RTLIB::MEMCPY);
191   uint64_t SizeVal = ConstantSize->getZExtValue();
192   if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
193     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
194                                   Alignment.value(), RTLIB::MEMCPY);
195 
196   unsigned BytesLeft = SizeVal & 3;
197   unsigned NumMemOps = SizeVal >> 2;
198   unsigned EmittedNumMemOps = 0;
199   EVT VT = MVT::i32;
200   unsigned VTSize = 4;
201   unsigned i = 0;
202   // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
203   const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
204   SDValue TFOps[6];
205   SDValue Loads[6];
206   uint64_t SrcOff = 0, DstOff = 0;
207 
208   // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
209   // VLDM/VSTM and make this code emit it when appropriate. This would reduce
210   // pressure on the general purpose registers. However this seems harder to map
211   // onto the register allocator's view of the world.
212 
213   // The number of MEMCPY pseudo-instructions to emit. We use up to
214   // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
215   // later on. This is a lower bound on the number of MEMCPY operations we must
216   // emit.
217   unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
218 
219   // Code size optimisation: do not inline memcpy if expansion results in
220   // more instructions than the libary call.
221   if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
222     return SDValue();
223   }
224 
225   SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
226 
227   for (unsigned I = 0; I != NumMEMCPYs; ++I) {
228     // Evenly distribute registers among MEMCPY operations to reduce register
229     // pressure.
230     unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
231     unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
232 
233     Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
234                       DAG.getConstant(NumRegs, dl, MVT::i32));
235     Src = Dst.getValue(1);
236     Chain = Dst.getValue(2);
237 
238     DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
239     SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
240 
241     EmittedNumMemOps = NextEmittedNumMemOps;
242   }
243 
244   if (BytesLeft == 0)
245     return Chain;
246 
247   // Issue loads / stores for the trailing (1 - 3) bytes.
248   auto getRemainingValueType = [](unsigned BytesLeft) {
249     return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
250   };
251   auto getRemainingSize = [](unsigned BytesLeft) {
252     return (BytesLeft >= 2) ? 2 : 1;
253   };
254 
255   unsigned BytesLeftSave = BytesLeft;
256   i = 0;
257   while (BytesLeft) {
258     VT = getRemainingValueType(BytesLeft);
259     VTSize = getRemainingSize(BytesLeft);
260     Loads[i] = DAG.getLoad(VT, dl, Chain,
261                            DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
262                                        DAG.getConstant(SrcOff, dl, MVT::i32)),
263                            SrcPtrInfo.getWithOffset(SrcOff));
264     TFOps[i] = Loads[i].getValue(1);
265     ++i;
266     SrcOff += VTSize;
267     BytesLeft -= VTSize;
268   }
269   Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
270 
271   i = 0;
272   BytesLeft = BytesLeftSave;
273   while (BytesLeft) {
274     VT = getRemainingValueType(BytesLeft);
275     VTSize = getRemainingSize(BytesLeft);
276     TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
277                             DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
278                                         DAG.getConstant(DstOff, dl, MVT::i32)),
279                             DstPtrInfo.getWithOffset(DstOff));
280     ++i;
281     DstOff += VTSize;
282     BytesLeft -= VTSize;
283   }
284   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
285 }
286 
287 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
288     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
289     SDValue Size, Align Alignment, bool isVolatile,
290     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
291   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
292                                 Alignment.value(), RTLIB::MEMMOVE);
293 }
294 
295 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
296     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
297     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
298     MachinePointerInfo DstPtrInfo) const {
299 
300   const ARMSubtarget &Subtarget =
301       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
302 
303   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
304 
305   // Generate TP loop for llvm.memset
306   if (Subtarget.hasMVEIntegerOps() &&
307       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
308                                  false)) {
309     Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
310                                   DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
311     return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
312                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
313   }
314 
315   if (!AlwaysInline)
316     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
317                                   Alignment.value(), RTLIB::MEMSET);
318 
319   return SDValue();
320 }
321