1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64 specific subclass of TargetSubtarget.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64Subtarget.h"
14 
15 #include "AArch64.h"
16 #include "AArch64InstrInfo.h"
17 #include "AArch64PBQPRegAlloc.h"
18 #include "AArch64TargetMachine.h"
19 #include "GISel/AArch64CallLowering.h"
20 #include "GISel/AArch64LegalizerInfo.h"
21 #include "GISel/AArch64RegisterBankInfo.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/IR/GlobalValue.h"
26 #include "llvm/Support/TargetParser.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "aarch64-subtarget"
31 
32 #define GET_SUBTARGETINFO_CTOR
33 #define GET_SUBTARGETINFO_TARGET_DESC
34 #include "AArch64GenSubtargetInfo.inc"
35 
// Escape hatch to disable the early if-conversion pass; enabled by default
// (see also enableEarlyIfConversion() below).
static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it. Off by default; gated
// further by an OS/version check in supportsAddressTopByteIgnored().
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

// When enabled, calls to functions carrying the nonlazybind attribute are
// routed through the GOT (see classifyGlobalFunctionReference()).
static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

// Controls whether codegen is allowed to query alias analysis; on by default
// (queried via useAA() below).
static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));
52 
53 AArch64Subtarget &
54 AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
55                                                   StringRef CPUString) {
56   // Determine default and user-specified characteristics
57 
58   if (CPUString.empty())
59     CPUString = "generic";
60 
61   ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
62   initializeProperties();
63 
64   return *this;
65 }
66 
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  // NOTE: the *LogAlignment fields are log2 values (so 4 means 16-byte
  // alignment, per the field names); the cache-line/prefetch fields are
  // presumably in bytes — confirm against the field declarations in the
  // header.
  switch (ARMProcFamily) {
  case Others:
    // Unknown/generic CPU: keep all defaults.
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
  case NeoverseN2:
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
  // Deliberately no default case: a new ARMProcFamily enumerator should
  // trigger a -Wswitch warning here so its properties get considered.
}
198 
// Construct the subtarget: parse CPU/feature strings, then build the
// codegen (frame lowering, instruction info, target lowering) and GlobalISel
// (call lowering, legalizer, register banks, instruction selector) objects.
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      // Initializing InstrInfo through initializeSubtargetDependencies()
      // guarantees the feature/CPU parsing has run before TLInfo (which takes
      // *this) is constructed below.
      FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)),
      TSInfo(), TLInfo(TM, *this) {
  // Reserve X18 when the target platform requires it (queried per-triple).
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  // Set up the GlobalISel pipeline objects for this subtarget.
  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  // RegBankInfo takes ownership of the RBI allocated above.
  RegBankInfo.reset(RBI);
}
229 
// GlobalISel call-lowering hook; the object is created in the constructor.
const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}
233 
// GlobalISel inline-asm lowering hook; the object is created in the
// constructor.
const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}
237 
// GlobalISel instruction selector; the object is created in the constructor.
InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}
241 
// GlobalISel legalizer rules; the object is created in the constructor.
const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}
245 
// GlobalISel register-bank information; ownership is taken in the
// constructor via RegBankInfo.reset().
const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}
249 
250 /// Find the target operand flags that describe how a global value should be
251 /// referenced for the current subtarget.
252 unsigned
253 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
254                                           const TargetMachine &TM) const {
255   // MachO large model always goes via a GOT, simply to get a single 8-byte
256   // absolute relocation on all global addresses.
257   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
258     return AArch64II::MO_GOT;
259 
260   if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
261     if (GV->hasDLLImportStorageClass())
262       return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
263     if (getTargetTriple().isOSWindows())
264       return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
265     return AArch64II::MO_GOT;
266   }
267 
268   // The small code model's direct accesses use ADRP, which cannot
269   // necessarily produce the value 0 (if the code is above 4GB).
270   // Same for the tiny code model, where we have a pc relative LDR.
271   if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
272       GV->hasExternalWeakLinkage())
273     return AArch64II::MO_GOT;
274 
275   // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
276   // that their nominal addresses are tagged and outside of the code model. In
277   // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
278   // tag if necessary based on MO_TAGGED.
279   if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
280     return AArch64II::MO_NC | AArch64II::MO_TAGGED;
281 
282   return AArch64II::MO_NO_FLAG;
283 }
284 
285 unsigned AArch64Subtarget::classifyGlobalFunctionReference(
286     const GlobalValue *GV, const TargetMachine &TM) const {
287   // MachO large model always goes via a GOT, because we don't have the
288   // relocations available to do anything else..
289   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
290       !GV->hasInternalLinkage())
291     return AArch64II::MO_GOT;
292 
293   // NonLazyBind goes via GOT unless we know it's available locally.
294   auto *F = dyn_cast<Function>(GV);
295   if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
296       !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
297     return AArch64II::MO_GOT;
298 
299   // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
300   if (getTargetTriple().isOSWindows())
301     return ClassifyGlobalReference(GV, TM);
302 
303   return AArch64II::MO_NO_FLAG;
304 }
305 
306 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
307                                            unsigned NumRegionInstrs) const {
308   // LNT run (at least on Cyclone) showed reasonably significant gains for
309   // bi-directional scheduling. 253.perlbmk.
310   Policy.OnlyTopDown = false;
311   Policy.OnlyBottomUp = false;
312   // Enabling or Disabling the latency heuristic is a close call: It seems to
313   // help nearly no benchmark on out-of-order architectures, on the other hand
314   // it regresses register pressure on a few benchmarking.
315   Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
316 }
317 
// Early if-conversion is on unless disabled via -aarch64-early-ifcvt=false.
bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}
321 
322 bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
323   if (!UseAddressTopByteIgnored)
324     return false;
325 
326   if (TargetTriple.isiOS()) {
327     unsigned Major, Minor, Micro;
328     TargetTriple.getiOSVersion(Major, Minor, Micro);
329     return Major >= 8;
330   }
331 
332   return false;
333 }
334 
335 std::unique_ptr<PBQPRAConstraint>
336 AArch64Subtarget::getCustomPBQPConstraints() const {
337   return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
338 }
339 
340 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
341   // We usually compute max call frame size after ISel. Do the computation now
342   // if the .mir file didn't specify it. Note that this will probably give you
343   // bogus values after PEI has eliminated the callframe setup/destroy pseudo
344   // instructions, specify explicitly if you need it to be correct.
345   MachineFrameInfo &MFI = MF.getFrameInfo();
346   if (!MFI.isMaxCallFrameSizeComputed())
347     MFI.computeMaxCallFrameSize(MF);
348 }
349 
350 bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
351   // Prefer NEON unless larger SVE registers are available.
352   return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
353 }
354 
// Whether codegen may use alias analysis; controlled by -aarch64-use-aa
// (defaults to true).
bool AArch64Subtarget::useAA() const { return UseAA; }
356