1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "AArch64ExpandImm.h"
10 #include "AArch64TargetTransformInfo.h"
11 #include "MCTargetDesc/AArch64AddressingModes.h"
12 #include "llvm/Analysis/LoopInfo.h"
13 #include "llvm/Analysis/TargetTransformInfo.h"
14 #include "llvm/CodeGen/BasicTTIImpl.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/TargetLowering.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsAArch64.h"
19 #include "llvm/Support/Debug.h"
20 #include <algorithm>
21 using namespace llvm;
22
23 #define DEBUG_TYPE "aarch64tti"
24
25 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
26 cl::init(true), cl::Hidden);
27
28 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
29 const Function *Callee) const {
30 const TargetMachine &TM = getTLI()->getTargetMachine();
31
32 const FeatureBitset &CallerBits =
33 TM.getSubtargetImpl(*Caller)->getFeatureBits();
34 const FeatureBitset &CalleeBits =
35 TM.getSubtargetImpl(*Callee)->getFeatureBits();
36
37 // Inline a callee if its target-features are a subset of the caller's
38 // target-features.
39 return (CallerBits & CalleeBits) == CalleeBits;
40 }
41
42 /// Calculate the cost of materializing a 64-bit value. This helper
43 /// method might only calculate a fraction of a larger immediate. Therefore it
44 /// is valid to return a cost of ZERO.
45 int AArch64TTIImpl::getIntImmCost(int64_t Val) {
46 // Check if the immediate can be encoded within an instruction.
47 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
48 return 0;
49
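// A negative value is typically materialized with a MOVN-based sequence,
// which encodes the complemented value, so compute the cost on the complement.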
50 if (Val < 0)
51 Val = ~Val;
52
53 // Calculate how many moves we will need to materialize this constant.
54 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
55 AArch64_IMM::expandMOVImm(Val, 64, Insn);
56 return Insn.size();
57 }
58
59 /// Calculate the cost of materializing the given constant.
60 int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
61 TTI::TargetCostKind CostKind) {
62 assert(Ty->isIntegerTy());
63
64 unsigned BitSize = Ty->getPrimitiveSizeInBits();
65 if (BitSize == 0)
66 return ~0U;
67
68 // Sign-extend all constants to a multiple of 64 bits.
69 APInt ImmVal = Imm;
70 if (BitSize & 0x3f)
71 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
72
73 // Split the constant into 64-bit chunks and calculate the cost for each
74 // chunk.
75 int Cost = 0;
76 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
77 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
78 int64_t Val = Tmp.getSExtValue();
79 Cost += getIntImmCost(Val);
80 }
81 // We need at least one instruction to materialize the constant.
82 return std::max(1, Cost);
83 }
84
85 int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
86 const APInt &Imm, Type *Ty,
87 TTI::TargetCostKind CostKind) {
88 assert(Ty->isIntegerTy());
89
90 unsigned BitSize = Ty->getPrimitiveSizeInBits();
91 // There is no cost model for constants with a bit size of 0. Return TCC_Free
92 // here, so that constant hoisting will ignore this constant.
93 if (BitSize == 0)
94 return TTI::TCC_Free;
95
96 unsigned ImmIdx = ~0U;
97 switch (Opcode) {
98 default:
99 return TTI::TCC_Free;
100 case Instruction::GetElementPtr:
101 // Always hoist the base address of a GetElementPtr.
102 if (Idx == 0)
103 return 2 * TTI::TCC_Basic;
104 return TTI::TCC_Free;
105 case Instruction::Store:
106 ImmIdx = 0;
107 break;
108 case Instruction::Add:
109 case Instruction::Sub:
110 case Instruction::Mul:
111 case Instruction::UDiv:
112 case Instruction::SDiv:
113 case Instruction::URem:
114 case Instruction::SRem:
115 case Instruction::And:
116 case Instruction::Or:
117 case Instruction::Xor:
118 case Instruction::ICmp:
119 ImmIdx = 1;
120 break;
121 // Always return TCC_Free for the shift amount of a shift instruction.
122 case Instruction::Shl:
123 case Instruction::LShr:
124 case Instruction::AShr:
125 if (Idx == 1)
126 return TTI::TCC_Free;
127 break;
128 case Instruction::Trunc:
129 case Instruction::ZExt:
130 case Instruction::SExt:
131 case Instruction::IntToPtr:
132 case Instruction::PtrToInt:
133 case Instruction::BitCast:
134 case Instruction::PHI:
135 case Instruction::Call:
136 case Instruction::Select:
137 case Instruction::Ret:
138 case Instruction::Load:
139 break;
140 }
141
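// For operand positions that can accept an immediate, report the constant as
// free when materializing it costs no more than one instruction per 64-bit
// chunk, so that constant hoisting leaves it in place.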
142 if (Idx == ImmIdx) {
143 int NumConstants = (BitSize + 63) / 64;
144 int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
145 return (Cost <= NumConstants * TTI::TCC_Basic)
146 ? static_cast<int>(TTI::TCC_Free)
147 : Cost;
148 }
149 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
150 }
151
152 int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
153 const APInt &Imm, Type *Ty,
154 TTI::TargetCostKind CostKind) {
155 assert(Ty->isIntegerTy());
156
157 unsigned BitSize = Ty->getPrimitiveSizeInBits();
158 // There is no cost model for constants with a bit size of 0. Return TCC_Free
159 // here, so that constant hoisting will ignore this constant.
160 if (BitSize == 0)
161 return TTI::TCC_Free;
162
163 // Most (all?) AArch64 intrinsics do not support folding immediates into the
164 // selected instruction, so we compute the materialization cost for the
165 // immediate directly.
166 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
167 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
168
169 switch (IID) {
170 default:
171 return TTI::TCC_Free;
172 case Intrinsic::sadd_with_overflow:
173 case Intrinsic::uadd_with_overflow:
174 case Intrinsic::ssub_with_overflow:
175 case Intrinsic::usub_with_overflow:
176 case Intrinsic::smul_with_overflow:
177 case Intrinsic::umul_with_overflow:
178 if (Idx == 1) {
179 int NumConstants = (BitSize + 63) / 64;
180 int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
181 return (Cost <= NumConstants * TTI::TCC_Basic)
182 ? static_cast<int>(TTI::TCC_Free)
183 : Cost;
184 }
185 break;
186 case Intrinsic::experimental_stackmap:
187 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
188 return TTI::TCC_Free;
189 break;
190 case Intrinsic::experimental_patchpoint_void:
191 case Intrinsic::experimental_patchpoint_i64:
192 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
193 return TTI::TCC_Free;
194 break;
195 }
196 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
197 }
198
199 TargetTransformInfo::PopcntSupportKind
200 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
201 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
202 if (TyWidth == 32 || TyWidth == 64)
203 return TTI::PSK_FastHardware;
204 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
205 return TTI::PSK_Software;
206 }
207
208 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
209 ArrayRef<const Value *> Args) {
210
211 // A helper that returns a vector type from the given type. The number of
212 // elements in DstTy determines the vector width.
213 auto toVectorTy = [&](Type *ArgTy) {
214 return FixedVectorType::get(ArgTy->getScalarType(),
215 cast<FixedVectorType>(DstTy)->getNumElements());
216 };
217
218 // Exit early if DstTy is not a vector type whose elements are at least
219 // 16-bits wide.
220 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
221 return false;
222
223 // Determine if the operation has a widening variant. We consider both the
224 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
225 // instructions.
226 //
227 // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
228 // verify that their extending operands are eliminated during code
229 // generation.
230 switch (Opcode) {
231 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
232 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
233 break;
234 default:
235 return false;
236 }
237
238 // To be a widening instruction (either the "wide" or "long" version), the
239 // second operand must be a sign- or zero-extend with a single user. We only
240 // consider extends with a single user because they may otherwise not be
241 // eliminated.
242 if (Args.size() != 2 ||
243 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
244 !Args[1]->hasOneUse())
245 return false;
246 auto *Extend = cast<CastInst>(Args[1]);
247
248 // Legalize the destination type and ensure it can be used in a widening
249 // operation.
250 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
251 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
252 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
253 return false;
254
255 // Legalize the source type and ensure it can be used in a widening
256 // operation.
257 auto *SrcTy = toVectorTy(Extend->getSrcTy());
258 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
259 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
260 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
261 return false;
262
263 // Get the total number of vector elements in the legalized types.
264 unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
265 unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
266
267 // Return true if the legalized types have the same number of vector elements
268 // and the destination element type size is twice that of the source type.
269 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
270 }
271
272 int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
273 TTI::TargetCostKind CostKind,
274 const Instruction *I) {
275 int ISD = TLI->InstructionOpcodeToISD(Opcode);
276 assert(ISD && "Invalid opcode");
277
278 // If the cast is observable, and it is used by a widening instruction (e.g.,
279 // uaddl, saddw, etc.), it may be free.
280 if (I && I->hasOneUse()) {
281 auto *SingleUser = cast<Instruction>(*I->user_begin());
282 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
283 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
284 // If the cast is the second operand, it is free. We will generate either
285 // a "wide" or "long" version of the widening instruction.
286 if (I == SingleUser->getOperand(1))
287 return 0;
288 // If the cast is not the second operand, it will be free if it looks the
289 // same as the second operand. In this case, we will generate a "long"
290 // version of the widening instruction.
291 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
292 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
293 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
294 return 0;
295 }
296 }
297
298 // TODO: Allow non-throughput costs that aren't binary.
299 auto AdjustCost = [&CostKind](int Cost) {
300 if (CostKind != TTI::TCK_RecipThroughput)
301 return Cost == 0 ? 0 : 1;
302 return Cost;
303 };
304
305 EVT SrcTy = TLI->getValueType(DL, Src);
306 EVT DstTy = TLI->getValueType(DL, Dst);
307
308 if (!SrcTy.isSimple() || !DstTy.isSimple())
309 return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
310
311 static const TypeConversionCostTblEntry
312 ConversionTbl[] = {
313 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
314 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
315 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
316 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
317
318 // The number of shll instructions for the extension.
319 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
320 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
321 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
322 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
323 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
324 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
325 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
326 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
327 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
328 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
329 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
330 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
331 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
332 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
333 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
334 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
335
336 // LowerVectorINT_TO_FP:
337 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
338 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
339 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
340 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
341 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
342 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
343
344 // Complex: to v2f32
345 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
346 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
347 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
348 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
349 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
350 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
351
352 // Complex: to v4f32
353 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
354 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
355 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
356 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
357
358 // Complex: to v8f32
359 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
360 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
361 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
362 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
363
364 // Complex: to v16f32
365 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
366 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
367
368 // Complex: to v2f64
369 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
370 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
371 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
372 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
373 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
374 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
375
376
377 // LowerVectorFP_TO_INT
378 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
379 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
380 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
381 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
382 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
383 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
384
385 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
386 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
387 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
388 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
389 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
390 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
391 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
392
393 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
394 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
395 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
396 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
397 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
398
399 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
400 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
401 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
402 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
403 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
404 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
405 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
406 };
407
408 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
409 DstTy.getSimpleVT(),
410 SrcTy.getSimpleVT()))
411 return AdjustCost(Entry->Cost);
412
413 return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
414 }
415
416 int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
417 VectorType *VecTy,
418 unsigned Index) {
419
420 // Make sure we were given a valid extend opcode.
421 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
422 "Invalid opcode");
423
424 // We are extending an element we extract from a vector, so the source type
425 // of the extend is the element type of the vector.
426 auto *Src = VecTy->getElementType();
427
428 // Sign- and zero-extends are for integer types only.
429 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
430
431 // Get the cost for the extract. We compute the cost (if any) for the extend
432 // below.
433 auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
434
435 // Legalize the types.
436 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
437 auto DstVT = TLI->getValueType(DL, Dst);
438 auto SrcVT = TLI->getValueType(DL, Src);
439 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
440
441 // If the resulting type is still a vector and the destination type is legal,
442 // we may get the extension for free. If not, get the default cost for the
443 // extend.
444 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
445 return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
446
447 // The destination type should be larger than the element type. If not, get
448 // the default cost for the extend.
449 if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
450 return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
451
452 switch (Opcode) {
453 default:
454 llvm_unreachable("Opcode should be either SExt or ZExt");
455
456 // For sign-extends, we only need a smov, which performs the extension
457 // automatically.
458 case Instruction::SExt:
459 return Cost;
460
461 // For zero-extends, the extend is performed automatically by a umov unless
462 // the destination type is i64 and the element type is i8 or i16.
463 case Instruction::ZExt:
464 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
465 return Cost;
466 }
467
468 // If we are unable to perform the extend for free, get the default cost.
469 return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
470 }
471
472 unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
473 TTI::TargetCostKind CostKind) {
474 if (CostKind != TTI::TCK_RecipThroughput)
475 return Opcode == Instruction::PHI ? 0 : 1;
476 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
477 // Branches are assumed to be predicted.
478 return 0;
479 }
480
481 int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
482 unsigned Index) {
483 assert(Val->isVectorTy() && "This must be a vector type");
484
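// An Index of -1U means the element index is not known at compile time; such
// operations fall through to the base insert/extract cost below.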
485 if (Index != -1U) {
486 // Legalize the type.
487 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
488
489 // This type is legalized to a scalar type.
490 if (!LT.second.isVector())
491 return 0;
492
493 // The type may be split. Normalize the index to the new type.
494 unsigned Width = LT.second.getVectorNumElements();
495 Index = Index % Width;
496
497 // The element at index zero is already inside the vector.
498 if (Index == 0)
499 return 0;
500 }
501
502 // All other insert/extracts cost this much.
503 return ST->getVectorInsertExtractBaseCost();
504 }
505
506 int AArch64TTIImpl::getArithmeticInstrCost(
507 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
508 TTI::OperandValueKind Opd1Info,
509 TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
510 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
511 const Instruction *CxtI) {
512 // TODO: Handle more cost kinds.
513 if (CostKind != TTI::TCK_RecipThroughput)
514 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
515 Opd2Info, Opd1PropInfo,
516 Opd2PropInfo, Args, CxtI);
517
518 // Legalize the type.
519 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
520
521 // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
522 // add in the widening overhead specified by the sub-target. Since the
523 // extends feeding widening instructions are performed automatically, they
524 // aren't present in the generated code and have a zero cost. By adding a
525 // widening overhead here, we attach the total cost of the combined operation
526 // to the widening instruction.
527 int Cost = 0;
528 if (isWideningInstruction(Ty, Opcode, Args))
529 Cost += ST->getWideningBaseCost();
530
531 int ISD = TLI->InstructionOpcodeToISD(Opcode);
532
533 switch (ISD) {
534 default:
535 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
536 Opd2Info,
537 Opd1PropInfo, Opd2PropInfo);
538 case ISD::SDIV:
539 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
540 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
541 // On AArch64, scalar signed division by a power-of-two constant is
542 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
543 // The OperandValue properties may not be the same as those of the
544 // previous operation; conservatively assume OP_None.
545 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
546 Opd1Info, Opd2Info,
547 TargetTransformInfo::OP_None,
548 TargetTransformInfo::OP_None);
549 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
550 Opd1Info, Opd2Info,
551 TargetTransformInfo::OP_None,
552 TargetTransformInfo::OP_None);
553 Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
554 Opd1Info, Opd2Info,
555 TargetTransformInfo::OP_None,
556 TargetTransformInfo::OP_None);
557 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
558 Opd1Info, Opd2Info,
559 TargetTransformInfo::OP_None,
560 TargetTransformInfo::OP_None);
561 return Cost;
562 }
563 LLVM_FALLTHROUGH;
564 case ISD::UDIV:
565 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
566 auto VT = TLI->getValueType(DL, Ty);
567 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
568 // Vector signed division by a constant is expanded to the sequence
569 // MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division to
570 // MULHU + SUB + SRL + ADD + SRL.
571 int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
572 Opd1Info, Opd2Info,
573 TargetTransformInfo::OP_None,
574 TargetTransformInfo::OP_None);
575 int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
576 Opd1Info, Opd2Info,
577 TargetTransformInfo::OP_None,
578 TargetTransformInfo::OP_None);
579 int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
580 Opd1Info, Opd2Info,
581 TargetTransformInfo::OP_None,
582 TargetTransformInfo::OP_None);
583 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
584 }
585 }
586
587 Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
588 Opd2Info,
589 Opd1PropInfo, Opd2PropInfo);
590 if (Ty->isVectorTy()) {
591 // On AArch64, vector divisions are not supported natively and are
592 // expanded into scalar divisions of each pair of elements.
593 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
594 Opd1Info, Opd2Info, Opd1PropInfo,
595 Opd2PropInfo);
596 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
597 Opd1Info, Opd2Info, Opd1PropInfo,
598 Opd2PropInfo);
599 // TODO: if one of the arguments is scalar, then it's not necessary to
600 // double the cost of handling the vector elements.
601 Cost += Cost;
602 }
603 return Cost;
604
605 case ISD::ADD:
606 case ISD::MUL:
607 case ISD::XOR:
608 case ISD::OR:
609 case ISD::AND:
610 // These nodes are marked as 'custom' for combining purposes only.
611 // We know that they are legal. See LowerAdd in ISelLowering.
612 return (Cost + 1) * LT.first;
613
614 case ISD::FADD:
615 // These nodes are marked as 'custom' just to lower them to SVE.
616 // We know said lowering will incur no additional cost.
617 if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty())
618 return (Cost + 2) * LT.first;
619
620 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
621 Opd2Info,
622 Opd1PropInfo, Opd2PropInfo);
623 }
624 }
625
626 int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
627 const SCEV *Ptr) {
628 // Address computations in vectorized code with non-consecutive addresses will
629 // likely result in more instructions compared to scalar code where the
630 // computation can more often be merged into the index mode. The resulting
631 // extra micro-ops can significantly decrease throughput.
632 unsigned NumVectorInstToHideOverhead = 10;
633 int MaxMergeDistance = 64;
634
635 if (Ty->isVectorTy() && SE &&
636 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
637 return NumVectorInstToHideOverhead;
638
639 // In many cases the address computation is not merged into the instruction
640 // addressing mode.
641 return 1;
642 }
643
644 int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
645 Type *CondTy,
646 TTI::TargetCostKind CostKind,
647 const Instruction *I) {
648 // TODO: Handle other cost kinds.
649 if (CostKind != TTI::TCK_RecipThroughput)
650 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
651
652 int ISD = TLI->InstructionOpcodeToISD(Opcode);
653 // Vector selects that are wider than the register width are not lowered
654 // well.
655 if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
656 // We would need this many instructions to hide the scalarization happening.
657 const int AmortizationCost = 20;
658 static const TypeConversionCostTblEntry
659 VectorSelectTbl[] = {
660 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
661 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
662 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
663 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
664 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
665 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
666 };
667
668 EVT SelCondTy = TLI->getValueType(DL, CondTy);
669 EVT SelValTy = TLI->getValueType(DL, ValTy);
670 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
671 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
672 SelCondTy.getSimpleVT(),
673 SelValTy.getSimpleVT()))
674 return Entry->Cost;
675 }
676 }
677 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
678 }
679
680 AArch64TTIImpl::TTI::MemCmpExpansionOptions
681 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
682 TTI::MemCmpExpansionOptions Options;
683 if (ST->requiresStrictAlign()) {
684 // TODO: Add cost modeling for strict align. Misaligned loads expand to
685 // a bunch of instructions when strict align is enabled.
686 return Options;
687 }
688 Options.AllowOverlappingLoads = true;
689 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
690 Options.NumLoadsPerBlock = Options.MaxNumLoads;
691 // TODO: Though vector loads usually perform well on AArch64, in some targets
692 // they may wake up the FP unit, which raises the power consumption. Perhaps
693 // they could be used with no holds barred (-O3).
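// The expansion tries load sizes in the order listed, preferring wider loads
// first.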
694 Options.LoadSizes = {8, 4, 2, 1};
695 return Options;
696 }
697
698 int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
699 MaybeAlign Alignment, unsigned AddressSpace,
700 TTI::TargetCostKind CostKind,
701 const Instruction *I) {
702 // TODO: Handle other cost kinds.
703 if (CostKind != TTI::TCK_RecipThroughput)
704 return 1;
705
706 // Type legalization can't handle structs
707 if (TLI->getValueType(DL, Ty, true) == MVT::Other)
708 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
709 CostKind);
710
711 auto LT = TLI->getTypeLegalizationCost(DL, Ty);
712
713 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
714 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
715 // Unaligned stores are extremely inefficient. We don't split all
716 // unaligned 128-bit stores because of the negative impact that has been
717 // shown in practice on inlined block copy code.
718 // We make such stores expensive so that we will only vectorize if there
719 // are 6 other instructions getting vectorized.
720 const int AmortizationCost = 6;
721
722 return LT.first * 2 * AmortizationCost;
723 }
724
725 if (Ty->isVectorTy() &&
726 cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
727 unsigned ProfitableNumElements;
728 if (Opcode == Instruction::Store)
729 // We use a custom trunc store lowering so v.4b should be profitable.
730 ProfitableNumElements = 4;
731 else
732 // We scalarize the loads because there is no v.4b register and we
733 // have to promote the elements to v.2.
734 ProfitableNumElements = 8;
735
736 if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) {
737 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
738 unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
739 // We generate 2 instructions per vector element.
740 return NumVectorizableInstsToAmortize * NumVecElts * 2;
741 }
742 }
743
744 return LT.first;
745 }
746
747 int AArch64TTIImpl::getInterleavedMemoryOpCost(
748 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
749 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
750 bool UseMaskForCond, bool UseMaskForGaps) {
751 assert(Factor >= 2 && "Invalid interleave factor");
752 auto *VecVTy = cast<FixedVectorType>(VecTy);
753
754 if (!UseMaskForCond && !UseMaskForGaps &&
755 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
756 unsigned NumElts = VecVTy->getNumElements();
757 auto *SubVecTy =
758 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
759
760 // ldN/stN only support legal vector types of size 64 or 128 in bits.
761 // Accesses having vector types that are a multiple of 128 bits can be
762 // matched to more than one ldN/stN instruction.
763 if (NumElts % Factor == 0 &&
764 TLI->isLegalInterleavedAccessType(SubVecTy, DL))
765 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
766 }
767
768 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
769 Alignment, AddressSpace, CostKind,
770 UseMaskForCond, UseMaskForGaps);
771 }
772
773 int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
774 int Cost = 0;
775 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
776 for (auto *I : Tys) {
777 if (!I->isVectorTy())
778 continue;
779 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
780 128)
781 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
782 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
783 }
784 return Cost;
785 }
786
787 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
788 return ST->getMaxInterleaveFactor();
789 }
790
791 // For Falkor, we want to avoid having too many strided loads in a loop since
792 // that can exhaust the HW prefetcher resources. We adjust the unroller
793 // MaxCount preference below to attempt to ensure unrolling doesn't create too
794 // many strided loads.
795 static void
796 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
797 TargetTransformInfo::UnrollingPreferences &UP) {
798 enum { MaxStridedLoads = 7 };
799 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
800 int StridedLoads = 0;
801 // FIXME? We could make this more precise by looking at the CFG and
802 // e.g. not counting loads in each side of an if-then-else diamond.
803 for (const auto BB : L->blocks()) {
804 for (auto &I : *BB) {
805 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
806 if (!LMemI)
807 continue;
808
809 Value *PtrValue = LMemI->getPointerOperand();
810 if (L->isLoopInvariant(PtrValue))
811 continue;
812
813 const SCEV *LSCEV = SE.getSCEV(PtrValue);
814 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
815 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
816 continue;
817
818 // FIXME? We could take pairing of unrolled load copies into account
819 // by looking at the AddRec, but we would probably have to limit this
820 // to loops with no stores or other memory optimization barriers.
821 ++StridedLoads;
822 // We've seen enough strided loads that seeing more won't make a
823 // difference.
824 if (StridedLoads > MaxStridedLoads / 2)
825 return StridedLoads;
826 }
827 }
828 return StridedLoads;
829 };
830
831 int StridedLoads = countStridedLoads(L, SE);
832 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
833 << " strided loads\n");
834 // Pick the largest power of 2 unroll count that won't result in too many
835 // strided loads.
836 if (StridedLoads) {
837 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
838 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
839 << UP.MaxCount << '\n');
840 }
841 }
842
843 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
844 TTI::UnrollingPreferences &UP) {
845 // Enable partial unrolling and runtime unrolling.
846 BaseT::getUnrollingPreferences(L, SE, UP);
847
848 // An inner loop is more likely to be hot, and the runtime check can be
849 // hoisted out by the LICM pass, so the overhead is lower; try a larger
850 // threshold to unroll more loops.
851 if (L->getLoopDepth() > 1)
852 UP.PartialThreshold *= 2;
853
854 // Disable partial & runtime unrolling on -Os.
855 UP.PartialOptSizeThreshold = 0;
856
857 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
858 EnableFalkorHWPFUnrollFix)
859 getFalkorUnrollingPreferences(L, SE, UP);
860 }
861
862 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
863 TTI::PeelingPreferences &PP) {
864 BaseT::getPeelingPreferences(L, SE, PP);
865 }
866
867 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
868 Type *ExpectedType) {
869 switch (Inst->getIntrinsicID()) {
870 default:
871 return nullptr;
872 case Intrinsic::aarch64_neon_st2:
873 case Intrinsic::aarch64_neon_st3:
874 case Intrinsic::aarch64_neon_st4: {
875 // Create a struct type
876 StructType *ST = dyn_cast<StructType>(ExpectedType);
877 if (!ST)
878 return nullptr;
879 unsigned NumElts = Inst->getNumArgOperands() - 1;
880 if (ST->getNumElements() != NumElts)
881 return nullptr;
882 for (unsigned i = 0, e = NumElts; i != e; ++i) {
883 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
884 return nullptr;
885 }
886 Value *Res = UndefValue::get(ExpectedType);
887 IRBuilder<> Builder(Inst);
888 for (unsigned i = 0, e = NumElts; i != e; ++i) {
889 Value *L = Inst->getArgOperand(i);
890 Res = Builder.CreateInsertValue(Res, L, i);
891 }
892 return Res;
893 }
894 case Intrinsic::aarch64_neon_ld2:
895 case Intrinsic::aarch64_neon_ld3:
896 case Intrinsic::aarch64_neon_ld4:
897 if (Inst->getType() == ExpectedType)
898 return Inst;
899 return nullptr;
900 }
901 }
902
903 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
904 MemIntrinsicInfo &Info) {
905 switch (Inst->getIntrinsicID()) {
906 default:
907 break;
908 case Intrinsic::aarch64_neon_ld2:
909 case Intrinsic::aarch64_neon_ld3:
910 case Intrinsic::aarch64_neon_ld4:
911 Info.ReadMem = true;
912 Info.WriteMem = false;
913 Info.PtrVal = Inst->getArgOperand(0);
914 break;
915 case Intrinsic::aarch64_neon_st2:
916 case Intrinsic::aarch64_neon_st3:
917 case Intrinsic::aarch64_neon_st4:
918 Info.ReadMem = false;
919 Info.WriteMem = true;
920 Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
921 break;
922 }
923
924 switch (Inst->getIntrinsicID()) {
925 default:
926 return false;
927 case Intrinsic::aarch64_neon_ld2:
928 case Intrinsic::aarch64_neon_st2:
929 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
930 break;
931 case Intrinsic::aarch64_neon_ld3:
932 case Intrinsic::aarch64_neon_st3:
933 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
934 break;
935 case Intrinsic::aarch64_neon_ld4:
936 case Intrinsic::aarch64_neon_st4:
937 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
938 break;
939 }
940 return true;
941 }
942
943 /// See if \p I should be considered for address type promotion. We check if
944 /// \p I is a sext with the right type that is used in memory accesses. If it
945 /// is used in a "complex" getelementptr, we allow it to be promoted without
946 /// finding other sext instructions that sign-extended the same initial value.
947 /// A getelementptr is considered "complex" if it has more than 2 operands.
948 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
949 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
950 bool Considerable = false;
951 AllowPromotionWithoutCommonHeader = false;
952 if (!isa<SExtInst>(&I))
953 return false;
954 Type *ConsideredSExtType =
955 Type::getInt64Ty(I.getParent()->getParent()->getContext());
956 if (I.getType() != ConsideredSExtType)
957 return false;
958 // See if the sext is the one with the right type and used in at least one
959 // GetElementPtrInst.
960 for (const User *U : I.users()) {
961 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
962 Considerable = true;
963 // A getelementptr is considered "complex" if it has more than 2
964 // operands. We will promote a SExt used in such a complex GEP, as we
965 // expect some computation to be merged if it is done on 64 bits.
966 if (GEPInst->getNumOperands() > 2) {
967 AllowPromotionWithoutCommonHeader = true;
968 break;
969 }
970 }
971 }
972 return Considerable;
973 }
974
975 bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
976 TTI::ReductionFlags Flags) const {
977 auto *VTy = cast<VectorType>(Ty);
978 unsigned ScalarBits = Ty->getScalarSizeInBits();
979 switch (Opcode) {
980 case Instruction::FAdd:
981 case Instruction::FMul:
982 case Instruction::And:
983 case Instruction::Or:
984 case Instruction::Xor:
985 case Instruction::Mul:
986 return false;
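// Add reductions generally lower to ADDV; only prefer the intrinsic form when
// the vector covers at least a full 128-bit register.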
987 case Instruction::Add:
988 return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
989 case Instruction::ICmp:
990 return (ScalarBits < 64) &&
991 (ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128);
992 case Instruction::FCmp:
993 return Flags.NoNaN;
994 default:
995 llvm_unreachable("Unhandled reduction opcode");
996 }
997 return false;
998 }
999
1000 int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode,
1001 VectorType *ValTy,
1002 bool IsPairwiseForm,
1003 TTI::TargetCostKind CostKind) {
1004
1005 if (IsPairwiseForm)
1006 return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
1007 CostKind);
1008
1009 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1010 MVT MTy = LT.second;
1011 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1012 assert(ISD && "Invalid opcode");
1013
1014 // Horizontal adds can use the 'addv' instruction. We model the cost of these
1015 // instructions as normal vector adds. This is the only arithmetic vector
1016 // reduction operation for which we have an instruction.
1017 static const CostTblEntry CostTblNoPairwise[]{
1018 {ISD::ADD, MVT::v8i8, 1},
1019 {ISD::ADD, MVT::v16i8, 1},
1020 {ISD::ADD, MVT::v4i16, 1},
1021 {ISD::ADD, MVT::v8i16, 1},
1022 {ISD::ADD, MVT::v4i32, 1},
1023 };
1024
1025 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
1026 return LT.first * Entry->Cost;
1027
1028 return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
1029 CostKind);
1030 }
1031
1032 int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
1033 int Index, VectorType *SubTp) {
1034 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
1035 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
1036 static const CostTblEntry ShuffleTbl[] = {
1037 // Broadcast shuffle kinds can be performed with 'dup'.
1038 { TTI::SK_Broadcast, MVT::v8i8, 1 },
1039 { TTI::SK_Broadcast, MVT::v16i8, 1 },
1040 { TTI::SK_Broadcast, MVT::v4i16, 1 },
1041 { TTI::SK_Broadcast, MVT::v8i16, 1 },
1042 { TTI::SK_Broadcast, MVT::v2i32, 1 },
1043 { TTI::SK_Broadcast, MVT::v4i32, 1 },
1044 { TTI::SK_Broadcast, MVT::v2i64, 1 },
1045 { TTI::SK_Broadcast, MVT::v2f32, 1 },
1046 { TTI::SK_Broadcast, MVT::v4f32, 1 },
1047 { TTI::SK_Broadcast, MVT::v2f64, 1 },
1048 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
1049 // 'zip1/zip2' instructions.
1050 { TTI::SK_Transpose, MVT::v8i8, 1 },
1051 { TTI::SK_Transpose, MVT::v16i8, 1 },
1052 { TTI::SK_Transpose, MVT::v4i16, 1 },
1053 { TTI::SK_Transpose, MVT::v8i16, 1 },
1054 { TTI::SK_Transpose, MVT::v2i32, 1 },
1055 { TTI::SK_Transpose, MVT::v4i32, 1 },
1056 { TTI::SK_Transpose, MVT::v2i64, 1 },
1057 { TTI::SK_Transpose, MVT::v2f32, 1 },
1058 { TTI::SK_Transpose, MVT::v4f32, 1 },
1059 { TTI::SK_Transpose, MVT::v2f64, 1 },
1060 // Select shuffle kinds.
1061 // TODO: handle vXi8/vXi16.
1062 { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
1063 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
1064 { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
1065 { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
1066 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
1067 { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
1068 // PermuteSingleSrc shuffle kinds.
1069 // TODO: handle vXi8/vXi16.
1070 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
1071 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
1072 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
1073 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
1074 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
1075 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
1076 };
1077 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1078 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
1079 return LT.first * Entry->Cost;
1080 }
1081
1082 return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1083 }
1084