//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);
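// Note: even with this flag set, supportsAddressTopByteIgnored() below only
// returns true on operating systems that are known to enable TBI.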

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);

// Reserve a list of X# registers, so they are unavailable to the register
// allocator but can still be used where the ABI requires them, such as for
// passing arguments to a function call.
static cl::list<std::string>
ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
                  "registers, so they can't be used by register allocator. "
                  "Should only be used for testing register allocator."),
                  cl::CommaSeparated, cl::Hidden);
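// Illustrative usage (register names chosen arbitrarily):
//   -reserve-regs-for-regalloc=X9,X15,LR
// removes X9, X15 and the link register from the allocator's pool; the names
// are matched against the register names in the AArch64Subtarget constructor
// below.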

static cl::opt<bool> ForceStreamingCompatibleSVE(
    "force-streaming-compatible-sve",
    cl::desc(
        "Force the use of streaming-compatible SVE code for all functions"),
    cl::Hidden);

static cl::opt<AArch64PAuth::AuthCheckMethod>
    AuthenticatedLRCheckMethod("aarch64-authenticated-lr-check-method",
                               cl::Hidden,
                               cl::desc("Override the variant of check applied "
                                        "to authenticated LR during tail call"),
                               cl::values(AUTH_CHECK_METHOD_CL_VALUES_LR));

static cl::opt<unsigned> AArch64MinimumJumpTableEntries(
    "aarch64-min-jump-table-entries", cl::init(13), cl::Hidden,
    cl::desc("Set minimum number of entries to use a jump table on AArch64"));

unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
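  // A value explicitly given via -aarch64-insert-extract-base-cost (i.e. the
  // option occurred on the command line) wins over the per-CPU tuned default.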
  if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
    return OverrideVectorInsertExtractBaseCost;
  return VectorInsertExtractBaseCost;
}

AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString,
    bool HasMinSize) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties(HasMinSize);

  return *this;
}

void AArch64Subtarget::initializeProperties(bool HasMinSize) {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
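  // Properties not assigned below keep their in-class defaults (see
  // AArch64Subtarget.h).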
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
  case CortexA53:
  case CortexA55:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA65:
    PrefFunctionAlignment = Align(8);
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case CortexA510:
  case CortexA520:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA710:
  case CortexA715:
  case CortexA720:
  case CortexX2:
  case CortexX3:
  case CortexX4:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
  case AppleA15:
  case AppleA16:
  case AppleA17:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    switch (ARMProcFamily) {
    case AppleA14:
    case AppleA15:
    case AppleA16:
    case AppleA17:
      MaxInterleaveFactor = 4;
      break;
    default:
      break;
    }
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionAlignment = Align(32);
    PrefLoopAlignment = Align(16);
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionAlignment = Align(8);
    break;
  case NeoverseN1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
  case NeoverseV2:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    DefaultSVETFOpts = TailFoldingOpts::Simple;
    break;
  case Neoverse512TVB:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
  case Ampere1A:
  case Ampere1B:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(64);
    PrefLoopAlignment = Align(64);
    MaxInterleaveFactor = 4;
    break;
  }

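  // For minsize functions, keep the pre-existing MinimumJumpTableEntries value
  // unless the threshold was explicitly overridden on the command line.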
  if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize)
    MinimumJumpTableEntries = AArch64MinimumJumpTableEntries;
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
                                   StringRef TuneCPU, StringRef FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride,
                                   bool StreamingSVEMode,
                                   bool StreamingCompatibleSVEMode,
                                   bool HasMinSize)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian), StreamingSVEMode(StreamingSVEMode),
      StreamingCompatibleSVEMode(StreamingCompatibleSVEMode),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)),
      TLInfo(TM, *this) {
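  // X18 is the platform register on some targets (e.g. Darwin and Windows)
  // and is therefore reserved there by default.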
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);

  auto TRI = getRegisterInfo();
  StringSet<> ReservedRegNames;
  ReservedRegNames.insert(ReservedRegsForRA.begin(), ReservedRegsForRA.end());
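  // Mark each register named on the command line as unavailable to the
  // register allocator: X0..X28 are matched by their canonical names here,
  // while X29/FP and X30/LR are handled explicitly below.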
  for (unsigned i = 0; i < 29; ++i) {
    if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i)))
      ReserveXRegisterForRA.set(i);
  }
  // X30 is named LR, so we can't use TRI->getName to check X30.
  if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR"))
    ReserveXRegisterForRA.set(30);
  // X29 is named FP, so we can't use TRI->getName to check X29.
  if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
    ReserveXRegisterForRA.set(29);

  AddressCheckPSV.reset(new AddressCheckPseudoSourceValue(TM));
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  // All globals dynamically protected by MTE must have their address tags
  // synthesized. This is done by having the loader stash the tag in the GOT
  // entry. Force all tagged globals (even ones with internal linkage) through
  // the GOT.
  if (GV->isTagged())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass()) {
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  if (getTargetTriple().isOSWindows()) {
    if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy()) {
      if (GV->hasDLLImportStorageClass()) {
        // On Arm64EC, if we're calling a symbol from the import table
        // directly, use MO_ARM64EC_CALLMANGLE.
        return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT |
               AArch64II::MO_ARM64EC_CALLMANGLE;
      }
      if (GV->hasExternalLinkage()) {
        // If we're calling a symbol directly, use the mangled form in the
        // call instruction.
        return AArch64II::MO_ARM64EC_CALLMANGLE;
      }
    }

    // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
    return ClassifyGlobalReference(GV, TM);
  }

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains from
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures, while it regresses
  // register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

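  // Hardware TBI ignores the top byte (bits [63:56]) of an address; only
  // report it as usable on OSes that are known to enable it.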
  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute the max call frame size after ISel. Do the computation
  // now if the .mir file didn't specify it. Note that this will probably give
  // you bogus values after PEI has eliminated the callframe setup/destroy
  // pseudo instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }

bool AArch64Subtarget::isStreamingCompatible() const {
  return StreamingCompatibleSVEMode || ForceStreamingCompatibleSVE;
}

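// NEON and SVE are reported as available only if the core implements
// FEAT_SME_FA64 (which allows the full A64 instruction set while in streaming
// SVE mode) or the code is known not to run in streaming or
// streaming-compatible mode.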
bool AArch64Subtarget::isNeonAvailable() const {
  return hasNEON() &&
         (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}

bool AArch64Subtarget::isSVEAvailable() const {
  return hasSVE() &&
         (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}

// If return address signing is enabled, tail calls are emitted as follows:
//
// ```
//   <authenticate LR>
//   <check LR>
//   TCRETURN          ; the callee may sign and spill the LR in its prologue
// ```
//
// LR may require explicit checking because, if FEAT_FPAC is not implemented
// and LR was tampered with, then `<authenticate LR>` will not generate an
// exception on its own. Later, if the callee spills the signed LR value and
// neither FEAT_PAuth2 nor FEAT_EPAC are implemented, the valid PAC replaces
// the higher bits of LR, thus hiding the authentication failure.
AArch64PAuth::AuthCheckMethod
AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
  if (AuthenticatedLRCheckMethod.getNumOccurrences())
    return AuthenticatedLRCheckMethod;

  // For now, use None by default because checks may introduce an unexpected
  // performance regression or an incompatibility with execute-only mappings.
  return AArch64PAuth::AuthCheckMethod::None;
}