//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64-specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);
// Reserve a list of X# registers, so they are unavailable to the register
// allocator but can still be used as required by the ABI, e.g. for passing
// arguments to a function call.
static cl::list<std::string>
ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
                  "registers, so they can't be used by register allocator. "
                  "Should only be used for testing register allocator."),
                  cl::CommaSeparated, cl::Hidden);

static cl::opt<bool> ForceStreamingCompatibleSVE(
    "force-streaming-compatible-sve",
    cl::desc(
        "Force the use of streaming-compatible SVE code for all functions"),
    cl::Hidden);

unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
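  // A value given explicitly via -aarch64-insert-extract-base-cost takes
  // precedence over the CPU-specific default set in initializeProperties().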
  if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
    return OverrideVectorInsertExtractBaseCost;
  return VectorInsertExtractBaseCost;
}

AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
  case CortexA53:
  case CortexA55:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA65:
    PrefFunctionAlignment = Align(8);
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case CortexA510:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA710:
  case CortexA715:
  case CortexX2:
  case CortexX3:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
  case AppleA15:
  case AppleA16:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    switch (ARMProcFamily) {
    case AppleA14:
    case AppleA15:
    case AppleA16:
      MaxInterleaveFactor = 4;
      break;
    default:
      break;
    }
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionAlignment = Align(32);
    PrefLoopAlignment = Align(16);
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionAlignment = Align(8);
    break;
  case NeoverseN1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
  case NeoverseV2:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    DefaultSVETFOpts = TailFoldingOpts::Simple;
    break;
  case Neoverse512TVB:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
  case Ampere1A:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(64);
    PrefLoopAlignment = Align(64);
    MaxInterleaveFactor = 4;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
                                   StringRef TuneCPU, StringRef FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride,
                                   bool StreamingSVEMode,
                                   bool StreamingCompatibleSVEMode)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      StreamingSVEMode(StreamingSVEMode),
      StreamingCompatibleSVEMode(StreamingCompatibleSVEMode),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

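  // Set up the GlobalISel components for this subtarget: call lowering,
  // inline-asm lowering, legalization, register bank info and instruction
  // selection.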
  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);

  auto TRI = getRegisterInfo();
  StringSet<> ReservedRegNames;
  ReservedRegNames.insert(ReservedRegsForRA.begin(), ReservedRegsForRA.end());
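  // X0-X28 can be matched by name via TRI->getName; X29 and X30 are handled
  // separately below because TRI names them FP and LR.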
  for (unsigned i = 0; i < 29; ++i) {
    if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i)))
      ReserveXRegisterForRA.set(i);
  }
  // X30 is named LR, so we can't use TRI->getName to check X30.
  if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR"))
    ReserveXRegisterForRA.set(30);
  // X29 is named FP, so we can't use TRI->getName to check X29.
  if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
    ReserveXRegisterForRA.set(29);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  // All globals dynamically protected by MTE must have their address tags
  // synthesized. This is done by having the loader stash the tag in the GOT
  // entry. Force all tagged globals (even ones with internal linkage) through
  // the GOT.
  if (GV->isTagged())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass()) {
      if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy())
        return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORTAUX;
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  if (getTargetTriple().isOSWindows()) {
    if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy() &&
        GV->hasDLLImportStorageClass()) {
      // On Arm64EC, if we're calling a function directly, use MO_DLLIMPORT,
      // not MO_DLLIMPORTAUX.
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }

    // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
    return ClassifyGlobalReference(GV, TM);
  }

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling. 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures, while on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

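  // Only assume TBI on targets where the OS is known to ignore the top
  // address byte: DriverKit, and iOS 8.0 or later.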
  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }

bool AArch64Subtarget::isNeonAvailable() const {
  if (!hasNEON())
    return false;

  // The 'force-streaming-compatible-sve' flag overrides the streaming
  // function attributes.
  if (ForceStreamingCompatibleSVE.getNumOccurrences() > 0)
    return !ForceStreamingCompatibleSVE;

  return !isStreaming() && !isStreamingCompatible();
}