//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);

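// Prefer the command-line override for the vector insert/extract base cost
// when one was given; otherwise fall back to the CPU-specific default set up
// in initializeProperties().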
unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
  if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
    return OverrideVectorInsertExtractBaseCost;
  return VectorInsertExtractBaseCost;
}

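// Resolve the CPU, tuning CPU and feature string (defaulting to "generic"
// when no CPU is given) before the rest of the subtarget is constructed.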
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case CortexA510:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 6;
    PrefLoopLogAlignment = 6;
    MaxInterleaveFactor = 4;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

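/// Find the target operand flags for a call to the given global, taking the
/// code model and the NonLazyBind attribute into account.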
unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures, while on the
  // other hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

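// Only assume top-byte-ignore when explicitly requested and the target OS is
// known to guarantee it (DriverKit, or iOS 8 and later).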
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

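// Attach the Cortex-A57 FP chaining constraint to the PBQP register allocator
// only when the subtarget asks for balanced FP operations (balanceFPOps()).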
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }