//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than to any concrete CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we use Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem, as that was the first CPU to support that feature level
/// and thus most likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency):
///                   divss     sqrtss     rsqrtss
///   AMD K7          11-16     19         3
///   Piledriver      9-24      13-15      5
///   Jaguar          14        16         2
///   Pentium II,III  18        30         2
///   Nehalem         7-14      7-18       3
///   Haswell         10-13     11         5
/// TODO: Develop and implement the target-dependent cost model, and
/// specialize cost numbers for different cost model targets such as
/// throughput, code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle it instead, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // vXi8 multiplications are always promoted to vXi16.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
                                  Opd1PropInfo, Opd2PropInfo);
  }
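
  // For illustration: with the promotion above, a v16i8 multiply is costed as
  //   cost(zext v16i8 -> v16i16) + cost(mul v16i16) + cost(trunc -> v16i8),
  // mirroring how the DAG legalizer widens the operation.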

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

    // If both are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 then we can treat this as
    // PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext16 = isa<SExtInst>(Args[0]) && Op1MinSize == 15;
      bool Op2Sext16 = isa<SExtInst>(Args[1]) && Op2MinSize == 15;

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext16 = Op1Sext16 || Op2Sext16;
      if (IsConstant || IsZeroExtended || IsSext16)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
  }

  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
       ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
      // On X86, vector signed division by a power-of-two constant is
      // normally expanded to the sequence SRA + SRL + ADD + SRA.
      // The OperandValue properties may not be the same as that of the
      // previous operation; conservatively assume OP_None.
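      // For illustration, each i32 lane of X sdiv 8 (k == 3) expands to:
      //   T0 = X sra 31; T1 = T0 srl (32 - 3); T2 = X add T1; Res = T2 sra 3
      // hence the 2 * ashr + lshr + add costing below.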
      InstructionCost Cost =
          2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);

      if (ISD == ISD::SREM) {
        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                       Op2Info);
        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                       Op2Info);
      }

      return Cost;
    }

    // Vector unsigned division/remainder will be simplified to shifts/masks.
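    // For illustration: X udiv 16 is costed as a single lshr by 4, and
    // X urem 16 as a single 'and' with 15.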
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                    Op2Info, TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);
  }

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV,  MVT::f32,   18 }, // divss
    { ISD::FDIV,  MVT::v4f32, 35 }, // divps
    { ISD::FDIV,  MVT::f64,   33 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,   MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,   MVT::v8i16, 2  }, // pmullw
    { ISD::FMUL,  MVT::f64,   2  }, // mulsd
    { ISD::FMUL,  MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL,  MVT::v4f32, 2  }, // mulps
    { ISD::FDIV,  MVT::f32,   17 }, // divss
    { ISD::FDIV,  MVT::v4f32, 39 }, // divps
    { ISD::FDIV,  MVT::f64,   32 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 69 }, // divpd
    { ISD::FADD,  MVT::v2f64, 2  }, // addpd
    { ISD::FSUB,  MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,   MVT::v2i64, 17 },
    // slm addq/subq throughput is 4
    { ISD::ADD,   MVT::v2i64, 4  },
    { ISD::SUB,   MVT::v2i64, 4  },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      // TODO: Merge this into the generic vXi32 MUL patterns above.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::SHL,  MVT::v64i8,   4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32,  6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32,  8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32,  5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32,  7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32,   6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32,   8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,   5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32,   7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32,  12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,     6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,     8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,     5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,     7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL,   MVT::v16i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,   MVT::v16i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,   MVT::v16i8,   4 }, // extend/vpsravw/pack sequence.
    { ISD::SHL,   MVT::v32i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,   MVT::v32i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,   MVT::v32i8,   6 }, // extend/vpsravw/pack sequence.
    { ISD::SHL,   MVT::v64i8,   6 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,   MVT::v64i8,   7 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,   MVT::v64i8,  15 }, // extend/vpsravw/pack sequence.

    { ISD::SHL,   MVT::v8i16,   1 }, // vpsllvw
    { ISD::SRL,   MVT::v8i16,   1 }, // vpsrlvw
    { ISD::SRA,   MVT::v8i16,   1 }, // vpsravw
    { ISD::SHL,   MVT::v16i16,  1 }, // vpsllvw
    { ISD::SRL,   MVT::v16i16,  1 }, // vpsrlvw
    { ISD::SRA,   MVT::v16i16,  1 }, // vpsravw
    { ISD::SHL,   MVT::v32i16,  1 }, // vpsllvw
    { ISD::SRL,   MVT::v32i16,  1 }, // vpsrlvw
    { ISD::SRA,   MVT::v32i16,  1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 1 }, // psllw.
    { ISD::SRL,  MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA,  MVT::v16i16, 1 }, // psraw.
    { ISD::SHL,  MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL,  MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA,  MVT::v32i16, 2 }, // 2*psraw.

    { ISD::SHL,  MVT::v8i32,  1 }, // pslld
    { ISD::SRL,  MVT::v8i32,  1 }, // psrld
    { ISD::SRA,  MVT::v8i32,  1 }, // psrad
    { ISD::SHL,  MVT::v4i64,  1 }, // psllq
    { ISD::SRL,  MVT::v4i64,  1 }, // psrlq
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v8i16, 1 }, // psllw.
    { ISD::SHL,  MVT::v4i32, 1 }, // pslld
    { ISD::SHL,  MVT::v2i64, 1 }, // psllq.

    { ISD::SRL,  MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32, 1 }, // psrld.
    { ISD::SRL,  MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA,  MVT::v8i16, 1 }, // psraw.
    { ISD::SRA,  MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, 2 }, // pmullq
    { ISD::MUL,  MVT::v4i64, 2 }, // pmullq
    { ISD::MUL,  MVT::v8i64, 2 }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,   MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRL,   MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRA,   MVT::v64i8,  24 }, // vpblendvb sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,     MVT::v4i32,   1 },
    { ISD::SRL,     MVT::v4i32,   1 },
    { ISD::SRA,     MVT::v4i32,   1 },
    { ISD::SHL,     MVT::v8i32,   1 },
    { ISD::SRL,     MVT::v8i32,   1 },
    { ISD::SRA,     MVT::v8i32,   1 },
    { ISD::SHL,     MVT::v16i32,  1 },
    { ISD::SRL,     MVT::v16i32,  1 },
    { ISD::SRA,     MVT::v16i32,  1 },

    { ISD::SHL,     MVT::v2i64,   1 },
    { ISD::SRL,     MVT::v2i64,   1 },
    { ISD::SHL,     MVT::v4i64,   1 },
    { ISD::SRL,     MVT::v4i64,   1 },
    { ISD::SHL,     MVT::v8i64,   1 },
    { ISD::SRL,     MVT::v8i64,   1 },

    { ISD::SRA,     MVT::v2i64,   1 },
    { ISD::SRA,     MVT::v4i64,   1 },
    { ISD::SRA,     MVT::v8i64,   1 },

    { ISD::MUL,     MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i64,   6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG,    MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::f64,     4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v2f64,   4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f64,   8 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f64,  16 }, // Skylake from http://www.agner.org/

    { ISD::FNEG,    MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::f32,     3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f32,   3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f32,   5 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we mark them as
    // custom, so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL,     MVT::v4i32,    2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,     MVT::v4i32,    2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,     MVT::v4i32,    2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,     MVT::v8i32,    2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,     MVT::v8i32,    2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,     MVT::v8i32,    2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,     MVT::v2i64,    1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,     MVT::v2i64,    1 }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL,     MVT::v4i64,    1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,     MVT::v4i64,    1 }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,     MVT::v16i8,    1 },
    { ISD::SRL,     MVT::v16i8,    2 },
    { ISD::SRA,     MVT::v16i8,    2 },
    { ISD::SHL,     MVT::v8i16,    1 },
    { ISD::SRL,     MVT::v8i16,    2 },
    { ISD::SRA,     MVT::v8i16,    2 },
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    2 },
    { ISD::SRA,     MVT::v4i32,    2 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    2 },
    { ISD::SRA,     MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,     MVT::v32i8,  2+2 },
    { ISD::SRL,     MVT::v32i8,  4+2 },
    { ISD::SRA,     MVT::v32i8,  4+2 },
    { ISD::SHL,     MVT::v16i16, 2+2 },
    { ISD::SRL,     MVT::v16i16, 4+2 },
    { ISD::SRA,     MVT::v16i16, 4+2 },
    { ISD::SHL,     MVT::v8i32,  2+2 },
    { ISD::SRL,     MVT::v8i32,  4+2 },
    { ISD::SRA,     MVT::v8i32,  4+2 },
    { ISD::SHL,     MVT::v4i64,  2+2 },
    { ISD::SRL,     MVT::v4i64,  4+2 },
    { ISD::SRA,     MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
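    // For illustration: shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> is
    // costed as a v4i32 multiply by <2, 4, 8, 16> (e.g. pmulld on SSE4.1).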
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  34 }, // 2*vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,   5 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16,  7 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   2 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   2 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,   6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::f64,     1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16, 4 },
    { ISD::MUL,  MVT::v8i32,  5 }, // BTVER2 from http://www.agner.org/
    { ISD::MUL,  MVT::v4i64, 12 },

    { ISD::SUB,  MVT::v32i8,  4 },
    { ISD::ADD,  MVT::v32i8,  4 },
    { ISD::SUB,  MVT::v16i16, 4 },
    { ISD::ADD,  MVT::v16i16, 4 },
    { ISD::SUB,  MVT::v8i32,  4 },
    { ISD::ADD,  MVT::v8i32,  4 },
    { ISD::SUB,  MVT::v4i64,  4 },
    { ISD::ADD,  MVT::v4i64,  4 },

    { ISD::SHL,  MVT::v32i8, 22 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,  6 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 13 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,  3 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,  9 }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL,  MVT::v2i64,  2 }, // Shift each lane + blend.
    { ISD::SHL,  MVT::v4i64,  6 }, // Shift each lane + blend + split.

    { ISD::SRL,  MVT::v32i8, 23 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,  6 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32, 14 }, // Shift each lane + blend + split.
    { ISD::SRL,  MVT::v2i64,  2 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v4i64,  6 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v32i8, 44 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,  6 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32, 14 }, // Shift each lane + blend + split.
    { ISD::SRA,  MVT::v2i64,  5 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v4i64, 12 }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64,  2 }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,  2 }, // BTVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,  4 }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/

    { ISD::MUL,  MVT::v2i64,  6 }  // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL,  MVT::v16i8, 10 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v8i16, 11 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v4i32,  4 }, // pslld/paddd/cvttps2dq/pmulld

    { ISD::SRL,  MVT::v16i8, 11 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v8i16, 13 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v4i32, 16 }, // Shift each lane + blend.

    { ISD::SRA,  MVT::v16i8, 21 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v8i16, 13 }, // pblendvb sequence.

    { ISD::MUL,  MVT::v4i32,  2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8, 13 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16, 25 }, // cmpgtw sequence.
    { ISD::SHL,  MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
    { ISD::SHL,  MVT::v2i64,  4 }, // splat+shuffle sequence.

    { ISD::SRL,  MVT::v16i8, 14 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRL,  MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,  4 }, // splat+shuffle sequence.

    { ISD::SRA,  MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRA,  MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,  8 }, // srl/xor/sub splat+shuffle sequence.

    { ISD::MUL,  MVT::v8i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,  6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,  8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,   23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::f64,    1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v2f64,  1 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,    2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,    2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,    2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    2 }, // Pentium III from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ADD,  MVT::i64,  1 }, // Core (Merom) from http://www.agner.org/
    { ISD::SUB,  MVT::i64,  1 }, // Core (Merom) from http://www.agner.org/
    { ISD::MUL,  MVT::i64,  2 }, // Nehalem from http://www.agner.org/
  };

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ADD,  MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i32, 1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB,  MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i32, 1 }, // Pentium III from http://www.agner.org/
  };

  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular registers.
  // The overhead of division is going to dominate most kernels anyway, so
  // try hard to prevent vectorization of division - it is generally a bad
  // idea. Assume somewhat arbitrarily that we have to be able to hide "20
  // cycles" for each lane.
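  // For illustration: with this heuristic a v4i32 sdiv (LT.first == 1,
  // 4 lanes) is costed below at 20 * 1 * 4 = 80 times the scalar sdiv cost.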
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    InstructionCost ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *BaseTp,
                                           ArrayRef<int> Mask, int Index,
                                           VectorType *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  Kind = improveShuffleKindFromMask(Kind, Mask);
  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
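  // For illustration: splatting lane 0 of a <16 x float> on AVX produces two
  // v8f32 destination registers, but both take the same broadcast result, so
  // only a single shuffle is costed.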
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
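  // For illustration: extracting <4 x float> at element 0 of an <8 x float>
  // is free, while extracting it at element 4 is an aligned extract costed
  // below at SubLT.first (e.g. a single vextractf128 on AVX).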
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        InstructionCost ExtractCost = getShuffleCost(
            TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);

        // If the original size is 32-bits or more, we can use pshufd. Otherwise
        // if we have SSSE3 we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }

  // Subvector insertions are cheap if the subvectors are aligned.
  // Note that in general, the insertion starting at the beginning of a vector
  // isn't free, because we need to preserve the rest of the wide vector.
  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }

    // If the insertion isn't aligned, treat it like a 2-op shuffle.
    Kind = TTI::SK_PermuteTwoSrc;
  }

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      !ST->hasSSSE3()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
      {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck

      {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
      {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck

      {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck

      {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
    };

    if (ST->hasSSE2())
      if (const auto *Entry =
              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
        return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. We provide an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      InstructionCost NumOfDests = LT.first;
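      // For illustration: a single-source permute of <16 x i32> on AVX2
      // legalizes to two v8i32 halves (NumOfSrcs == NumOfDests == 2), so it
      // is costed below as (2 - 1) * 2 = 2 two-input v8i32 shuffles.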

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                            None, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    InstructionCost NumOfDests = LT.first;
    InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
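    // For illustration: a two-input shuffle of <8 x double> on AVX splits
    // into two v4f64 destinations (LT.first == 2), giving 2 * (2 * 2 - 1) = 6
    // v4f64 shuffles.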
  }

  static const CostTblEntry AVX512FP16ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v8f16, 1},  // vpbroadcastw

      {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v8f16, 1},  // vpshufb

      {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1},  // vpshufb

      {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2}   // vpermt2w
  };

  if (!ST->useSoftFloat() && ST->hasFP16())
    if (const auto *Entry =
            CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
      {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
      {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2

      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16

      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1

      {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
      {TTI::SK_Select, MVT::v64i8, 1},  // vblendmb
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
      {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
      {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
      {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
      {TTI::SK_Reverse, MVT::v64i8, 7},  // per mca

      {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1},  // pshufb

      {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1},  // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1},  // vpermt2d

      // FIXME: This just applies the type legalization cost rules above
      // assuming these completely split.
      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},

      {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
      {TTI::SK_Select, MVT::v64i8, 1},  // vpternlogq
      {TTI::SK_Select, MVT::v8f64, 1},  // vblendmpd
      {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
      {TTI::SK_Select, MVT::v8i64, 1},  // vblendmq
      {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
1353
1354 static const CostTblEntry AVX2ShuffleTbl[] = {
1355 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1356 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1357 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1358 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1359 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1360 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1361
1362 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1363 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1364 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1365 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1366 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1367 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1368
1369 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1370 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1371
1372 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1373 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1374 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1375 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1376 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1377 // + vpblendvb
1378 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1379 // + vpblendvb
1380
1381 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1382 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1383 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1384 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1385 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1386 // + vpblendvb
1387 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1388 // + vpblendvb
1389 };
1390
1391 if (ST->hasAVX2())
1392 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1393 return LT.first * Entry->Cost;
1394
1395 static const CostTblEntry XOPShuffleTbl[] = {
1396 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1397 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1398 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1399 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1400 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1401 // + vinsertf128
1402 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1403 // + vinsertf128
1404
1405 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1406 // + vinsertf128
1407 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1408 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1409 // + vinsertf128
1410 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1411 };
1412
1413 if (ST->hasXOP())
1414 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1415 return LT.first * Entry->Cost;
1416
1417 static const CostTblEntry AVX1ShuffleTbl[] = {
1418 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1419 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1420 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1421 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1422 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1423 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1424
1425 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1426 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1427 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1428 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1429 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1430 // + vinsertf128
1431 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1432 // + vinsertf128
1433
1434 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1435 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1436 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1437 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1438 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1439 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1440
1441 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1442 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1443 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1444 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1445 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1446 // + 2*por + vinsertf128
1447 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1448 // + 2*por + vinsertf128
1449
1450 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1451 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1452 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1453 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1454 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1455 // + 4*por + vinsertf128
1456 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1457 // + 4*por + vinsertf128
1458 };
1459
1460 if (ST->hasAVX())
1461 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1462 return LT.first * Entry->Cost;
1463
1464 static const CostTblEntry SSE41ShuffleTbl[] = {
1465 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1466 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1467 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1468 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1469 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1470 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1471 };
1472
1473 if (ST->hasSSE41())
1474 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1475 return LT.first * Entry->Cost;
1476
1477 static const CostTblEntry SSSE3ShuffleTbl[] = {
1478 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1479 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1480
1481 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1482 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1483
1484 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1485 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1486
1487 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1488 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1489
1490 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1491 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1492 };
1493
1494 if (ST->hasSSSE3())
1495 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1496 return LT.first * Entry->Cost;
1497
1498 static const CostTblEntry SSE2ShuffleTbl[] = {
1499 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1500 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1501 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1502 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1503 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1504
1505 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1506 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1507 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1508 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1509 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1510 // + 2*pshufd + 2*unpck + packus
1511
1512 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1513 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1514 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1515 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1516 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1517
1518 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1519 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1520 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1521 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1522 // + pshufd/unpck
1523       {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw
1524                                                   // + 2*pshufd + 2*unpck + 2*packus
1525 
1526       {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // shufpd
1527       {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // shufpd
1528       {TTI::SK_PermuteTwoSrc, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
1529       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 8}, // blend+permute
1530       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute
1531 };
1532
1533 if (ST->hasSSE2())
1534 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1535 return LT.first * Entry->Cost;
1536
1537 static const CostTblEntry SSE1ShuffleTbl[] = {
1538       {TTI::SK_Broadcast, MVT::v4f32, 1}, // shufps
1539       {TTI::SK_Reverse, MVT::v4f32, 1}, // shufps
1540       {TTI::SK_Select, MVT::v4f32, 2}, // 2*shufps
1541       {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // shufps
1542       {TTI::SK_PermuteTwoSrc, MVT::v4f32, 2}, // 2*shufps
1543 };
1544
1545 if (ST->hasSSE1())
1546 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1547 return LT.first * Entry->Cost;
1548
1549 return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1550 }
1551
1552 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1553 Type *Src,
1554 TTI::CastContextHint CCH,
1555 TTI::TargetCostKind CostKind,
1556 const Instruction *I) {
1557 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1558 assert(ISD && "Invalid opcode");
1559
1560 // TODO: Allow non-throughput costs that aren't binary.
1561 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1562 if (CostKind != TTI::TCK_RecipThroughput)
1563 return Cost == 0 ? 0 : 1;
1564 return Cost;
1565 };
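// Note that for non-throughput cost kinds this currently collapses every
// non-free cast to a unit cost; the tables below only model throughput.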
1566
1567 // The cost tables include both specific, custom (non-legal) src/dst type
1568 // conversions and generic, legalized types. We test for the custom
1569 // entries first, before falling back to legalization.
1570 // FIXME: Need a better design of the cost table to handle non-simple types
1571 // and the potentially massive number of combinations (elem_num x src_type x dst_type).
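// For example, on AVX512F a zero-extend from v16i8 to v16i32 hits the custom
// entry below directly (cost 1, vpmovzxbd); pairs that match no custom entry
// fall through to the legalized-type lookups further down.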
1572   static const TypeConversionCostTblEntry AVX512BWConversionTbl[] = {
1573 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1574 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1575
1576 // Mask sign extend has an instruction.
1577 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1578 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1579 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1580 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1581 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1582 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1583 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1584 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1585 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1586 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
1587 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
1588
1589 // Mask zero extend is a sext + shift.
1590 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1591 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1592 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1593 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1594 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1595 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1596 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1597 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1598 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1599 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
1600 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
1601
1602 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
1603 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1604 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
1605 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
1606 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
1607 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
1608 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
1609 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
1610 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
1611 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
1612 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
1613 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
1614 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
1615 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
1616 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
1617 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
1618 };
1619
1620 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1621 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1622 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1623
1624 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1625 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1626
1627 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
1628 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
1629
1630 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
1631 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
1632 };
1633
1634 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1635 // 256-bit wide vectors.
1636
1637 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1638 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
1639 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
1640 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
1641
1642 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1643 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1644 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1645 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1646 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1647 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1648 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1649 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1650 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1651 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1652 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1653 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1654 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1655 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1656 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1657 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
1658 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
1659 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
1660 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdb
1661 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
1662 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
1663 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
1664 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
1665 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
1666 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1667     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 }, // 2*vpmovqd+concat+vpmovdb
1668
1669 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1670 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
1671
1672 // Sign extend is zmm vpternlogd+vptruncdb.
1673 // Zero extend is zmm broadcast load+vptruncdw.
1674 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
1675 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
1676 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
1677 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
1678 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
1679 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
1680 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
1681 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
1682
1683 // Sign extend is zmm vpternlogd+vptruncdw.
1684 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1685 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
1686 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
1687 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
1688 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
1689 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
1690 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
1691 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
1692 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1693
1694 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1695 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1696 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1697 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1698 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1699 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1700 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1701 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1702 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1703 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1704
1705 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1706 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1707 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1708 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1709
1710 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1711 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1712 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1713 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1714 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1715 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1716 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1717 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1718 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1719 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1720
1721 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1722 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1723
1724 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1725 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1726 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
1727 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
1728 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1729 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
1730 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1731 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1732
1733 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1734 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1735 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
1736 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
1737 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1738 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
1739 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1740 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1741 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
1742 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
1743
1744 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
1745 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
1746     { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, 15 },
1747     { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, 11 },
1748     { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, 31 },
1749 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
1750 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
1751 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
1752     { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, 15 },
1753 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
1754 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
1755
1756 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
1757 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
1758 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
1759 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
1760 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
1761 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
1762 };
1763
1764   static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] = {
1765 // Mask sign extend has an instruction.
1766 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1767 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1768 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1769 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1770 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1771 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1772 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1773 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1774 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1775
1776 // Mask zero extend is a sext + shift.
1777 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1778 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1779 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1780 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1781 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1782 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1783 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1784 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1785 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1786
1787 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
1788 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
1789 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
1790 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
1791 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
1792 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
1793 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
1794 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
1795 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
1796 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
1797 };
1798
1799 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1800 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
1801 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1802 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
1803 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
1804
1805 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
1806 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1807 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
1808 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
1809
1810 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
1811 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
1812 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1813 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
1814
1815 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
1816 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
1817 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1818 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
1819 };
1820
1821 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1822 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1823 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1824 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1825 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1826 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1827 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1828 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1829 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1830 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1831 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1832 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1833 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1834 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1835 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1836 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
1837 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
1838 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
1839
1840     // Sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb.
1841     // Zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb.
1842 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
1843 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
1844 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
1845 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
1846 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
1847 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
1848 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
1849 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
1850
1851     // Sign extend is vpcmpeq+maskedmove+vpmovdw.
1852     // Zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw.
1853 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
1854 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
1855 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
1856 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
1857 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
1858 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
1859 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
1860 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
1861
1862 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1863 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1864 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1865 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1866 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1867 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1868 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1869 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1870 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1871 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1872
1873 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
1874 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
1875 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
1876 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
1877 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1878 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1879 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
1880 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
1881 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1882 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1883 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
1884 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
1885
1886 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
1887 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
1888 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
1889 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
1890
1891 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
1892 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
1893 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
1894 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
1895 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
1896 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
1897 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1898 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1899 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
1900 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
1901 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
1902 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
1903 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
1904
1905 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
1906 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
1907 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
1908
1909 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
1910 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
1911 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1912 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
1913 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
1914 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
1915 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
1916 };
1917
1918 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1919 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1920 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1921 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1922 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1923 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1924 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1925
1926 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
1927 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
1928 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
1929 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
1930 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1931 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1932 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
1933 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
1934 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1935 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1936 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1937 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1938 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1939 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1940
1941 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
1942
1943 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
1944 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
1945 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
1946 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
1947 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
1948 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
1949 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
1950 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
1951 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
1952 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
1953 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
1954 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
1955
1956 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
1957 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
1958
1959 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
1960 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
1961 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
1962 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
1963
1964 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
1965 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
1966 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
1967 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
1968 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
1969 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
1970 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
1971 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
1972
1973 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
1974 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
1975 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
1976 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
1977 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
1978 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
1979 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
1980
1981 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
1982 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
1983 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
1984 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
1985 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
1986 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
1987 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
1988 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
1989 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
1990 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
1991 };
1992
1993 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1994 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
1995 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
1996 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
1997 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
1998 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1999 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2000
2001 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2002 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2003 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2004 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2005 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2006 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2007 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2008 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2009 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2010 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2011 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2012 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2013
2014 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
2015 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
2016 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
2017 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
2018 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
2019
2020 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2021 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
2022 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2023 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
2024 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2025 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
2026 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2027 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
2028
2029 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
2030 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
2031 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
2032 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2033 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2034 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2035 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2036 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2037 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2038 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2039 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
2040 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
2041
2042 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
2043 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
2044 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
2045 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2046 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2047 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2048 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2049 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
2050 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
2051 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2052 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
2053 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
2054 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
2055 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
2056 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
2057 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2058 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
2059
2060 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2061 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
2062 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
2063 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
2064 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
2065 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
2066 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
2067 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
2068 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
2069 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
2070 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
2071
2072 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
2073 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
2074 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
2075 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
2076 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
2077 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
2078 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
2079 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
2080 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2081 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2082 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
2083 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
2084 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
2085
2086 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2087 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2088 };
2089
2090 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2091 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2092 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2093 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2094 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2095 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2096 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2097 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2098 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2099 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2100 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2101 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2102 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2103
2104 // These truncates end up widening elements.
2105     { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ
2106     { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ
2107     { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD
2108
2109 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2110 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2111 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2112
2113 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2114 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2115 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2116 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2117 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2118 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2119 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2120 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2121 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2122 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2123 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2124
2125 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2126 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2127 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2128 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2129 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2130 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2131 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2132 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2133 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2134 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2135 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2136 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2137 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2138 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2139
2140 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2141 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2142 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2143 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2144 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2145 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2146 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2147 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2148 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2149 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2150
2151 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2152 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2153 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2154 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2155 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2156 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2157 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2158 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2159 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2160 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2161 };
2162
2163 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2164     // These are somewhat magic numbers justified by comparing the
2165     // output of llvm-mca for our various supported scheduler models
2166     // and basing them on the worst case scenario.
2167 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
2168 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
2169 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
2170 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
2171 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2172 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2173 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2174 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2175 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2176 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2177 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2178 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2179
2180 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
2181 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
2182 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2183 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2184 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2185 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2186 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2187 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2188 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
2189 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2190 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2191 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2192 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2193
2194 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
2195 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
2196 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
2197 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
2198 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
2199 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
2200 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
2201 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
2202 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
2203 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
2204
2205 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
2206 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2207 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
2208 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2209 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
2210 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
2211 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
2212 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
2213 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2214 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
2215
2216 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2217 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2218 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
2219 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
2220 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2221 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
2222 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
2223 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
2224 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2225 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
2226 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2227 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
2228
2229 // These truncates are really widening elements.
2230 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2231 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2232 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2233 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2234 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2235 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2236
2237 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2238 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2239 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2240 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2241 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2242 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
2243 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2244     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
2245 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2246 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2247 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2248 };
2249
2250 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2251 EVT SrcTy = TLI->getValueType(DL, Src);
2252 EVT DstTy = TLI->getValueType(DL, Dst);
2253
2254   // getSimpleVT() only handles simple value types, so guard with isSimple().
2255 if (SrcTy.isSimple() && DstTy.isSimple()) {
2256 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2257 MVT SimpleDstTy = DstTy.getSimpleVT();
2258
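    // Query the most specific feature tables first (BWI, then DQI, then base
    // AVX512F) so that cheaper specialized entries take precedence.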
2259 if (ST->useAVX512Regs()) {
2260 if (ST->hasBWI())
2261 if (const auto *Entry = ConvertCostTableLookup(
2262 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2263 return AdjustCost(Entry->Cost);
2264
2265 if (ST->hasDQI())
2266 if (const auto *Entry = ConvertCostTableLookup(
2267 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2268 return AdjustCost(Entry->Cost);
2269
2270 if (ST->hasAVX512())
2271 if (const auto *Entry = ConvertCostTableLookup(
2272 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2273 return AdjustCost(Entry->Cost);
2274 }
2275
2276 if (ST->hasBWI())
2277 if (const auto *Entry = ConvertCostTableLookup(
2278 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2279 return AdjustCost(Entry->Cost);
2280
2281 if (ST->hasDQI())
2282 if (const auto *Entry = ConvertCostTableLookup(
2283 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2284 return AdjustCost(Entry->Cost);
2285
2286 if (ST->hasAVX512())
2287 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2288 SimpleDstTy, SimpleSrcTy))
2289 return AdjustCost(Entry->Cost);
2290
2291 if (ST->hasAVX2()) {
2292 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2293 SimpleDstTy, SimpleSrcTy))
2294 return AdjustCost(Entry->Cost);
2295 }
2296
2297 if (ST->hasAVX()) {
2298 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2299 SimpleDstTy, SimpleSrcTy))
2300 return AdjustCost(Entry->Cost);
2301 }
2302
2303 if (ST->hasSSE41()) {
2304 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2305 SimpleDstTy, SimpleSrcTy))
2306 return AdjustCost(Entry->Cost);
2307 }
2308
2309 if (ST->hasSSE2()) {
2310 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2311 SimpleDstTy, SimpleSrcTy))
2312 return AdjustCost(Entry->Cost);
2313 }
2314 }
2315
2316 // Fall back to legalized types.
2317 std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2318 std::pair<InstructionCost, MVT> LTDest =
2319 TLI->getTypeLegalizationCost(DL, Dst);
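  // The lookups below scale the entry cost by the larger of the two split
  // factors, since whichever side splits into more registers dominates the
  // number of conversions required.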
2320
2321 if (ST->useAVX512Regs()) {
2322 if (ST->hasBWI())
2323 if (const auto *Entry = ConvertCostTableLookup(
2324 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2325 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2326
2327 if (ST->hasDQI())
2328 if (const auto *Entry = ConvertCostTableLookup(
2329 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2330 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2331
2332 if (ST->hasAVX512())
2333 if (const auto *Entry = ConvertCostTableLookup(
2334 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2335 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2336 }
2337
2338 if (ST->hasBWI())
2339 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2340 LTDest.second, LTSrc.second))
2341 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2342
2343 if (ST->hasDQI())
2344 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2345 LTDest.second, LTSrc.second))
2346 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2347
2348 if (ST->hasAVX512())
2349 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2350 LTDest.second, LTSrc.second))
2351 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2352
2353 if (ST->hasAVX2())
2354 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2355 LTDest.second, LTSrc.second))
2356 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2357
2358 if (ST->hasAVX())
2359 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2360 LTDest.second, LTSrc.second))
2361 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2362
2363 if (ST->hasSSE41())
2364 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2365 LTDest.second, LTSrc.second))
2366 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2367
2368 if (ST->hasSSE2())
2369 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2370 LTDest.second, LTSrc.second))
2371 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2372
2373   // Fallback: for the i8/i16 sitofp/uitofp cases we need to extend the
2374   // source to i32 first and then convert.
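  // For example, a v4i16 -> v4f32 sitofp is costed as a sext from v4i16 to
  // v4i32 plus a sitofp from v4i32 to v4f32.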
2375 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
2376 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
2377 Type *ExtSrc = Src->getWithNewBitWidth(32);
2378 unsigned ExtOpc =
2379 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
2380
2381 // For scalar loads the extend would be free.
2382 InstructionCost ExtCost = 0;
2383 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
2384 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
2385
2386 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
2387 TTI::CastContextHint::None, CostKind);
2388 }
2389
2390   // Fallback: for the i8/i16 fptosi/fptoui cases we convert via an i32
2391   // fptosi and then truncate the result.
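  // For example, a v4f32 -> v4i8 fptosi is costed as a fptosi from v4f32 to
  // v4i32 plus a trunc from v4i32 to v4i8.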
2392 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
2393 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
2394 Type *TruncDst = Dst->getWithNewBitWidth(32);
2395 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
2396 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
2397 TTI::CastContextHint::None, CostKind);
2398 }
2399
2400 return AdjustCost(
2401 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2402 }
2403
2404 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2405 Type *CondTy,
2406 CmpInst::Predicate VecPred,
2407 TTI::TargetCostKind CostKind,
2408 const Instruction *I) {
2409 // TODO: Handle other cost kinds.
2410 if (CostKind != TTI::TCK_RecipThroughput)
2411 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2412 I);
2413
2414 // Legalize the type.
2415 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2416
2417 MVT MTy = LT.second;
2418
2419 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2420 assert(ISD && "Invalid opcode");
2421
2422 unsigned ExtraCost = 0;
2423 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
2424 // Some vector comparison predicates cost extra instructions.
2425 // TODO: Should we invert this and assume worst case cmp costs
2426 // and reduce for particular predicates?
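      // XOP (for 128-bit vectors, or whenever AVX2 is unavailable), AVX512
      // (for 32/64-bit elements) and AVX512BW all support the full predicate
      // set directly, so only the remaining configurations pay an extra cost.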
2427 if (MTy.isVector() &&
2428 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2429 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2430 ST->hasBWI())) {
2431       // Fall back to I's predicate if a specific one wasn't provided.
2432 CmpInst::Predicate Pred = VecPred;
2433 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
2434 Pred == CmpInst::BAD_FCMP_PREDICATE))
2435 Pred = cast<CmpInst>(I)->getPredicate();
2436
2437 switch (Pred) {
2438 case CmpInst::Predicate::ICMP_NE:
2439 // xor(cmpeq(x,y),-1)
2440 ExtraCost = 1;
2441 break;
2442 case CmpInst::Predicate::ICMP_SGE:
2443 case CmpInst::Predicate::ICMP_SLE:
2444 // xor(cmpgt(x,y),-1)
2445 ExtraCost = 1;
2446 break;
2447 case CmpInst::Predicate::ICMP_ULT:
2448 case CmpInst::Predicate::ICMP_UGT:
2449 // cmpgt(xor(x,signbit),xor(y,signbit))
2450 // xor(cmpeq(pmaxu(x,y),x),-1)
2451 ExtraCost = 2;
2452 break;
2453 case CmpInst::Predicate::ICMP_ULE:
2454 case CmpInst::Predicate::ICMP_UGE:
2455 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2456 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2457 // cmpeq(psubus(x,y),0)
2458 // cmpeq(pminu(x,y),x)
2459 ExtraCost = 1;
2460 } else {
2461 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2462 ExtraCost = 3;
2463 }
2464 break;
2465 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
2466 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
2467 // Assume worst case scenario and add the maximum extra cost.
2468 ExtraCost = 3;
2469 break;
2470 default:
2471 break;
2472 }
2473 }
2474 }
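  // ExtraCost is folded into every per-subtarget lookup below as
  // LT.first * (ExtraCost + Entry->Cost).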
2475
2476 static const CostTblEntry SLMCostTbl[] = {
2477     // SLM pcmpeq/pcmpgt throughput is 2
2478 { ISD::SETCC, MVT::v2i64, 2 },
2479 };
2480
2481 static const CostTblEntry AVX512BWCostTbl[] = {
2482 { ISD::SETCC, MVT::v32i16, 1 },
2483 { ISD::SETCC, MVT::v64i8, 1 },
2484
2485 { ISD::SELECT, MVT::v32i16, 1 },
2486 { ISD::SELECT, MVT::v64i8, 1 },
2487 };
2488
2489 static const CostTblEntry AVX512CostTbl[] = {
2490 { ISD::SETCC, MVT::v8i64, 1 },
2491 { ISD::SETCC, MVT::v16i32, 1 },
2492 { ISD::SETCC, MVT::v8f64, 1 },
2493 { ISD::SETCC, MVT::v16f32, 1 },
2494
2495 { ISD::SELECT, MVT::v8i64, 1 },
2496 { ISD::SELECT, MVT::v16i32, 1 },
2497 { ISD::SELECT, MVT::v8f64, 1 },
2498 { ISD::SELECT, MVT::v16f32, 1 },
2499
2500 { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2501 { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2502
2503 { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2504 { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2505 };
2506
2507 static const CostTblEntry AVX2CostTbl[] = {
2508 { ISD::SETCC, MVT::v4i64, 1 },
2509 { ISD::SETCC, MVT::v8i32, 1 },
2510 { ISD::SETCC, MVT::v16i16, 1 },
2511 { ISD::SETCC, MVT::v32i8, 1 },
2512
2513 { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2514 { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2515 { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2516 { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2517 };
2518
2519 static const CostTblEntry AVX1CostTbl[] = {
2520 { ISD::SETCC, MVT::v4f64, 1 },
2521 { ISD::SETCC, MVT::v8f32, 1 },
2522 // AVX1 does not support 8-wide integer compare.
2523 { ISD::SETCC, MVT::v4i64, 4 },
2524 { ISD::SETCC, MVT::v8i32, 4 },
2525 { ISD::SETCC, MVT::v16i16, 4 },
2526 { ISD::SETCC, MVT::v32i8, 4 },
2527
2528 { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2529 { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2530 { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2531 { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2532 { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2533 { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2534 };
2535
2536 static const CostTblEntry SSE42CostTbl[] = {
2537 { ISD::SETCC, MVT::v2f64, 1 },
2538 { ISD::SETCC, MVT::v4f32, 1 },
2539 { ISD::SETCC, MVT::v2i64, 1 },
2540 };
2541
2542 static const CostTblEntry SSE41CostTbl[] = {
2543 { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2544 { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2545 { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2546 { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2547 { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2548 { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2549 };
2550
2551 static const CostTblEntry SSE2CostTbl[] = {
2552 { ISD::SETCC, MVT::v2f64, 2 },
2553 { ISD::SETCC, MVT::f64, 1 },
2554 { ISD::SETCC, MVT::v2i64, 8 },
2555 { ISD::SETCC, MVT::v4i32, 1 },
2556 { ISD::SETCC, MVT::v8i16, 1 },
2557 { ISD::SETCC, MVT::v16i8, 1 },
2558
2559 { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2560 { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2561 { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2562 { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2563 { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2564 };
2565
2566 static const CostTblEntry SSE1CostTbl[] = {
2567 { ISD::SETCC, MVT::v4f32, 2 },
2568 { ISD::SETCC, MVT::f32, 1 },
2569
2570 { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2571 };
2572
2573 if (ST->isSLM())
2574 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2575 return LT.first * (ExtraCost + Entry->Cost);
2576
2577 if (ST->hasBWI())
2578 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2579 return LT.first * (ExtraCost + Entry->Cost);
2580
2581 if (ST->hasAVX512())
2582 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2583 return LT.first * (ExtraCost + Entry->Cost);
2584
2585 if (ST->hasAVX2())
2586 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2587 return LT.first * (ExtraCost + Entry->Cost);
2588
2589 if (ST->hasAVX())
2590 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2591 return LT.first * (ExtraCost + Entry->Cost);
2592
2593 if (ST->hasSSE42())
2594 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2595 return LT.first * (ExtraCost + Entry->Cost);
2596
2597 if (ST->hasSSE41())
2598 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2599 return LT.first * (ExtraCost + Entry->Cost);
2600
2601 if (ST->hasSSE2())
2602 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2603 return LT.first * (ExtraCost + Entry->Cost);
2604
2605 if (ST->hasSSE1())
2606 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2607 return LT.first * (ExtraCost + Entry->Cost);
2608
2609 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2610 }
2611
2612 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
2613
2614 InstructionCost
2615 X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2616 TTI::TargetCostKind CostKind) {
2617
2618 // Costs should match the codegen from:
2619 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2620 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2621 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2622 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2623 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2624
2625 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2626 // specialized in these tables yet.
2627 static const CostTblEntry AVX512BITALGCostTbl[] = {
2628 { ISD::CTPOP, MVT::v32i16, 1 },
2629 { ISD::CTPOP, MVT::v64i8, 1 },
2630 { ISD::CTPOP, MVT::v16i16, 1 },
2631 { ISD::CTPOP, MVT::v32i8, 1 },
2632 { ISD::CTPOP, MVT::v8i16, 1 },
2633 { ISD::CTPOP, MVT::v16i8, 1 },
2634 };
2635 static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = {
2636 { ISD::CTPOP, MVT::v8i64, 1 },
2637 { ISD::CTPOP, MVT::v16i32, 1 },
2638 { ISD::CTPOP, MVT::v4i64, 1 },
2639 { ISD::CTPOP, MVT::v8i32, 1 },
2640 { ISD::CTPOP, MVT::v2i64, 1 },
2641 { ISD::CTPOP, MVT::v4i32, 1 },
2642 };
2643 static const CostTblEntry AVX512CDCostTbl[] = {
2644 { ISD::CTLZ, MVT::v8i64, 1 },
2645 { ISD::CTLZ, MVT::v16i32, 1 },
2646 { ISD::CTLZ, MVT::v32i16, 8 },
2647 { ISD::CTLZ, MVT::v64i8, 20 },
2648 { ISD::CTLZ, MVT::v4i64, 1 },
2649 { ISD::CTLZ, MVT::v8i32, 1 },
2650 { ISD::CTLZ, MVT::v16i16, 4 },
2651 { ISD::CTLZ, MVT::v32i8, 10 },
2652 { ISD::CTLZ, MVT::v2i64, 1 },
2653 { ISD::CTLZ, MVT::v4i32, 1 },
2654 { ISD::CTLZ, MVT::v8i16, 4 },
2655 { ISD::CTLZ, MVT::v16i8, 4 },
2656 };
2657 static const CostTblEntry AVX512BWCostTbl[] = {
2658 { ISD::ABS, MVT::v32i16, 1 },
2659 { ISD::ABS, MVT::v64i8, 1 },
2660 { ISD::BITREVERSE, MVT::v8i64, 3 },
2661 { ISD::BITREVERSE, MVT::v16i32, 3 },
2662 { ISD::BITREVERSE, MVT::v32i16, 3 },
2663 { ISD::BITREVERSE, MVT::v64i8, 2 },
2664 { ISD::BSWAP, MVT::v8i64, 1 },
2665 { ISD::BSWAP, MVT::v16i32, 1 },
2666 { ISD::BSWAP, MVT::v32i16, 1 },
2667 { ISD::CTLZ, MVT::v8i64, 23 },
2668 { ISD::CTLZ, MVT::v16i32, 22 },
2669 { ISD::CTLZ, MVT::v32i16, 18 },
2670 { ISD::CTLZ, MVT::v64i8, 17 },
2671 { ISD::CTPOP, MVT::v8i64, 7 },
2672 { ISD::CTPOP, MVT::v16i32, 11 },
2673 { ISD::CTPOP, MVT::v32i16, 9 },
2674 { ISD::CTPOP, MVT::v64i8, 6 },
2675 { ISD::CTTZ, MVT::v8i64, 10 },
2676 { ISD::CTTZ, MVT::v16i32, 14 },
2677 { ISD::CTTZ, MVT::v32i16, 12 },
2678 { ISD::CTTZ, MVT::v64i8, 9 },
2679 { ISD::SADDSAT, MVT::v32i16, 1 },
2680 { ISD::SADDSAT, MVT::v64i8, 1 },
2681 { ISD::SMAX, MVT::v32i16, 1 },
2682 { ISD::SMAX, MVT::v64i8, 1 },
2683 { ISD::SMIN, MVT::v32i16, 1 },
2684 { ISD::SMIN, MVT::v64i8, 1 },
2685 { ISD::SSUBSAT, MVT::v32i16, 1 },
2686 { ISD::SSUBSAT, MVT::v64i8, 1 },
2687 { ISD::UADDSAT, MVT::v32i16, 1 },
2688 { ISD::UADDSAT, MVT::v64i8, 1 },
2689 { ISD::UMAX, MVT::v32i16, 1 },
2690 { ISD::UMAX, MVT::v64i8, 1 },
2691 { ISD::UMIN, MVT::v32i16, 1 },
2692 { ISD::UMIN, MVT::v64i8, 1 },
2693 { ISD::USUBSAT, MVT::v32i16, 1 },
2694 { ISD::USUBSAT, MVT::v64i8, 1 },
2695 };
2696 static const CostTblEntry AVX512CostTbl[] = {
2697 { ISD::ABS, MVT::v8i64, 1 },
2698 { ISD::ABS, MVT::v16i32, 1 },
2699 { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
2700 { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
2701 { ISD::ABS, MVT::v4i64, 1 },
2702 { ISD::ABS, MVT::v2i64, 1 },
2703 { ISD::BITREVERSE, MVT::v8i64, 36 },
2704 { ISD::BITREVERSE, MVT::v16i32, 24 },
2705 { ISD::BITREVERSE, MVT::v32i16, 10 },
2706 { ISD::BITREVERSE, MVT::v64i8, 10 },
2707 { ISD::BSWAP, MVT::v8i64, 4 },
2708 { ISD::BSWAP, MVT::v16i32, 4 },
2709 { ISD::BSWAP, MVT::v32i16, 4 },
2710 { ISD::CTLZ, MVT::v8i64, 29 },
2711 { ISD::CTLZ, MVT::v16i32, 35 },
2712 { ISD::CTLZ, MVT::v32i16, 28 },
2713 { ISD::CTLZ, MVT::v64i8, 18 },
2714 { ISD::CTPOP, MVT::v8i64, 16 },
2715 { ISD::CTPOP, MVT::v16i32, 24 },
2716 { ISD::CTPOP, MVT::v32i16, 18 },
2717 { ISD::CTPOP, MVT::v64i8, 12 },
2718 { ISD::CTTZ, MVT::v8i64, 20 },
2719 { ISD::CTTZ, MVT::v16i32, 28 },
2720 { ISD::CTTZ, MVT::v32i16, 24 },
2721 { ISD::CTTZ, MVT::v64i8, 18 },
2722 { ISD::SMAX, MVT::v8i64, 1 },
2723 { ISD::SMAX, MVT::v16i32, 1 },
2724 { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
2725 { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
2726 { ISD::SMAX, MVT::v4i64, 1 },
2727 { ISD::SMAX, MVT::v2i64, 1 },
2728 { ISD::SMIN, MVT::v8i64, 1 },
2729 { ISD::SMIN, MVT::v16i32, 1 },
2730 { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
2731 { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
2732 { ISD::SMIN, MVT::v4i64, 1 },
2733 { ISD::SMIN, MVT::v2i64, 1 },
2734 { ISD::UMAX, MVT::v8i64, 1 },
2735 { ISD::UMAX, MVT::v16i32, 1 },
2736 { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
2737 { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
2738 { ISD::UMAX, MVT::v4i64, 1 },
2739 { ISD::UMAX, MVT::v2i64, 1 },
2740 { ISD::UMIN, MVT::v8i64, 1 },
2741 { ISD::UMIN, MVT::v16i32, 1 },
2742 { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
2743 { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
2744 { ISD::UMIN, MVT::v4i64, 1 },
2745 { ISD::UMIN, MVT::v2i64, 1 },
2746 { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2747 { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2748 { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2749 { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2750 { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2751 { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2752 { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2753 { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2754 { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2755 { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2756 { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2757 { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2758 { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2759 { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2760 { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2761 { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2762 { ISD::FMAXNUM, MVT::f32, 2 },
2763 { ISD::FMAXNUM, MVT::v4f32, 2 },
2764 { ISD::FMAXNUM, MVT::v8f32, 2 },
2765 { ISD::FMAXNUM, MVT::v16f32, 2 },
2766 { ISD::FMAXNUM, MVT::f64, 2 },
2767 { ISD::FMAXNUM, MVT::v2f64, 2 },
2768 { ISD::FMAXNUM, MVT::v4f64, 2 },
2769 { ISD::FMAXNUM, MVT::v8f64, 2 },
2770 };
2771 static const CostTblEntry XOPCostTbl[] = {
2772 { ISD::BITREVERSE, MVT::v4i64, 4 },
2773 { ISD::BITREVERSE, MVT::v8i32, 4 },
2774 { ISD::BITREVERSE, MVT::v16i16, 4 },
2775 { ISD::BITREVERSE, MVT::v32i8, 4 },
2776 { ISD::BITREVERSE, MVT::v2i64, 1 },
2777 { ISD::BITREVERSE, MVT::v4i32, 1 },
2778 { ISD::BITREVERSE, MVT::v8i16, 1 },
2779 { ISD::BITREVERSE, MVT::v16i8, 1 },
2780 { ISD::BITREVERSE, MVT::i64, 3 },
2781 { ISD::BITREVERSE, MVT::i32, 3 },
2782 { ISD::BITREVERSE, MVT::i16, 3 },
2783 { ISD::BITREVERSE, MVT::i8, 3 }
2784 };
2785 static const CostTblEntry AVX2CostTbl[] = {
2786 { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2787 { ISD::ABS, MVT::v8i32, 1 },
2788 { ISD::ABS, MVT::v16i16, 1 },
2789 { ISD::ABS, MVT::v32i8, 1 },
2790 { ISD::BITREVERSE, MVT::v2i64, 3 },
2791 { ISD::BITREVERSE, MVT::v4i64, 3 },
2792 { ISD::BITREVERSE, MVT::v4i32, 3 },
2793 { ISD::BITREVERSE, MVT::v8i32, 3 },
2794 { ISD::BITREVERSE, MVT::v8i16, 3 },
2795 { ISD::BITREVERSE, MVT::v16i16, 3 },
2796 { ISD::BITREVERSE, MVT::v16i8, 3 },
2797 { ISD::BITREVERSE, MVT::v32i8, 3 },
2798 { ISD::BSWAP, MVT::v4i64, 1 },
2799 { ISD::BSWAP, MVT::v8i32, 1 },
2800 { ISD::BSWAP, MVT::v16i16, 1 },
2801 { ISD::CTLZ, MVT::v2i64, 7 },
2802 { ISD::CTLZ, MVT::v4i64, 7 },
2803 { ISD::CTLZ, MVT::v4i32, 5 },
2804 { ISD::CTLZ, MVT::v8i32, 5 },
2805 { ISD::CTLZ, MVT::v8i16, 4 },
2806 { ISD::CTLZ, MVT::v16i16, 4 },
2807 { ISD::CTLZ, MVT::v16i8, 3 },
2808 { ISD::CTLZ, MVT::v32i8, 3 },
2809 { ISD::CTPOP, MVT::v2i64, 3 },
2810 { ISD::CTPOP, MVT::v4i64, 3 },
2811 { ISD::CTPOP, MVT::v4i32, 7 },
2812 { ISD::CTPOP, MVT::v8i32, 7 },
2813 { ISD::CTPOP, MVT::v8i16, 3 },
2814 { ISD::CTPOP, MVT::v16i16, 3 },
2815 { ISD::CTPOP, MVT::v16i8, 2 },
2816 { ISD::CTPOP, MVT::v32i8, 2 },
2817 { ISD::CTTZ, MVT::v2i64, 4 },
2818 { ISD::CTTZ, MVT::v4i64, 4 },
2819 { ISD::CTTZ, MVT::v4i32, 7 },
2820 { ISD::CTTZ, MVT::v8i32, 7 },
2821 { ISD::CTTZ, MVT::v8i16, 4 },
2822 { ISD::CTTZ, MVT::v16i16, 4 },
2823 { ISD::CTTZ, MVT::v16i8, 3 },
2824 { ISD::CTTZ, MVT::v32i8, 3 },
2825 { ISD::SADDSAT, MVT::v16i16, 1 },
2826 { ISD::SADDSAT, MVT::v32i8, 1 },
2827 { ISD::SMAX, MVT::v8i32, 1 },
2828 { ISD::SMAX, MVT::v16i16, 1 },
2829 { ISD::SMAX, MVT::v32i8, 1 },
2830 { ISD::SMIN, MVT::v8i32, 1 },
2831 { ISD::SMIN, MVT::v16i16, 1 },
2832 { ISD::SMIN, MVT::v32i8, 1 },
2833 { ISD::SSUBSAT, MVT::v16i16, 1 },
2834 { ISD::SSUBSAT, MVT::v32i8, 1 },
2835 { ISD::UADDSAT, MVT::v16i16, 1 },
2836 { ISD::UADDSAT, MVT::v32i8, 1 },
2837 { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2838 { ISD::UMAX, MVT::v8i32, 1 },
2839 { ISD::UMAX, MVT::v16i16, 1 },
2840 { ISD::UMAX, MVT::v32i8, 1 },
2841 { ISD::UMIN, MVT::v8i32, 1 },
2842 { ISD::UMIN, MVT::v16i16, 1 },
2843 { ISD::UMIN, MVT::v32i8, 1 },
2844 { ISD::USUBSAT, MVT::v16i16, 1 },
2845 { ISD::USUBSAT, MVT::v32i8, 1 },
2846 { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2847 { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2848 { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2849 { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2850 { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2851 { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2852 { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2853 { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2854 { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2855 };
2856 static const CostTblEntry AVX1CostTbl[] = {
2857 { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2858 { ISD::ABS, MVT::v8i32, 3 },
2859 { ISD::ABS, MVT::v16i16, 3 },
2860 { ISD::ABS, MVT::v32i8, 3 },
2861 { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2862 { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2863 { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2864 { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2865 { ISD::BSWAP, MVT::v4i64, 4 },
2866 { ISD::BSWAP, MVT::v8i32, 4 },
2867 { ISD::BSWAP, MVT::v16i16, 4 },
2868 { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2869 { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2870 { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2871 { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2872 { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2873 { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2874 { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2875 { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2876 { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2877 { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2878 { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2879 { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2880 { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2881 { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2882 { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2883 { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2884 { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2885 { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2886 { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2887 { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2888 { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2889 { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2890 { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2891 { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2892 { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2893 { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2894 { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2895 { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2896 { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2897 { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2898 { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2899 { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2900 { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2901 { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2902 { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
2903 { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2904 { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
2905 { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
2906 { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2907 { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
2908 { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2909 { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2910 { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2911 { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2912 { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2913 { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2914 };
2915 static const CostTblEntry GLMCostTbl[] = {
2916 { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2917 { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2918 { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2919 { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2920 };
2921 static const CostTblEntry SLMCostTbl[] = {
2922 { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2923 { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2924 { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2925 { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2926 };
2927 static const CostTblEntry SSE42CostTbl[] = {
2928 { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2929 { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2930 { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2931 { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2932 };
2933 static const CostTblEntry SSE41CostTbl[] = {
2934 { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
2935 { ISD::SMAX, MVT::v4i32, 1 },
2936 { ISD::SMAX, MVT::v16i8, 1 },
2937 { ISD::SMIN, MVT::v4i32, 1 },
2938 { ISD::SMIN, MVT::v16i8, 1 },
2939 { ISD::UMAX, MVT::v4i32, 1 },
2940 { ISD::UMAX, MVT::v8i16, 1 },
2941 { ISD::UMIN, MVT::v4i32, 1 },
2942 { ISD::UMIN, MVT::v8i16, 1 },
2943 };
2944 static const CostTblEntry SSSE3CostTbl[] = {
2945 { ISD::ABS, MVT::v4i32, 1 },
2946 { ISD::ABS, MVT::v8i16, 1 },
2947 { ISD::ABS, MVT::v16i8, 1 },
2948 { ISD::BITREVERSE, MVT::v2i64, 5 },
2949 { ISD::BITREVERSE, MVT::v4i32, 5 },
2950 { ISD::BITREVERSE, MVT::v8i16, 5 },
2951 { ISD::BITREVERSE, MVT::v16i8, 5 },
2952 { ISD::BSWAP, MVT::v2i64, 1 },
2953 { ISD::BSWAP, MVT::v4i32, 1 },
2954 { ISD::BSWAP, MVT::v8i16, 1 },
2955 { ISD::CTLZ, MVT::v2i64, 23 },
2956 { ISD::CTLZ, MVT::v4i32, 18 },
2957 { ISD::CTLZ, MVT::v8i16, 14 },
2958 { ISD::CTLZ, MVT::v16i8, 9 },
2959 { ISD::CTPOP, MVT::v2i64, 7 },
2960 { ISD::CTPOP, MVT::v4i32, 11 },
2961 { ISD::CTPOP, MVT::v8i16, 9 },
2962 { ISD::CTPOP, MVT::v16i8, 6 },
2963 { ISD::CTTZ, MVT::v2i64, 10 },
2964 { ISD::CTTZ, MVT::v4i32, 14 },
2965 { ISD::CTTZ, MVT::v8i16, 12 },
2966 { ISD::CTTZ, MVT::v16i8, 9 }
2967 };
2968 static const CostTblEntry SSE2CostTbl[] = {
2969 { ISD::ABS, MVT::v2i64, 4 },
2970 { ISD::ABS, MVT::v4i32, 3 },
2971 { ISD::ABS, MVT::v8i16, 2 },
2972 { ISD::ABS, MVT::v16i8, 2 },
2973 { ISD::BITREVERSE, MVT::v2i64, 29 },
2974 { ISD::BITREVERSE, MVT::v4i32, 27 },
2975 { ISD::BITREVERSE, MVT::v8i16, 27 },
2976 { ISD::BITREVERSE, MVT::v16i8, 20 },
2977 { ISD::BSWAP, MVT::v2i64, 7 },
2978 { ISD::BSWAP, MVT::v4i32, 7 },
2979 { ISD::BSWAP, MVT::v8i16, 7 },
2980 { ISD::CTLZ, MVT::v2i64, 25 },
2981 { ISD::CTLZ, MVT::v4i32, 26 },
2982 { ISD::CTLZ, MVT::v8i16, 20 },
2983 { ISD::CTLZ, MVT::v16i8, 17 },
2984 { ISD::CTPOP, MVT::v2i64, 12 },
2985 { ISD::CTPOP, MVT::v4i32, 15 },
2986 { ISD::CTPOP, MVT::v8i16, 13 },
2987 { ISD::CTPOP, MVT::v16i8, 10 },
2988 { ISD::CTTZ, MVT::v2i64, 14 },
2989 { ISD::CTTZ, MVT::v4i32, 18 },
2990 { ISD::CTTZ, MVT::v8i16, 16 },
2991 { ISD::CTTZ, MVT::v16i8, 13 },
2992 { ISD::SADDSAT, MVT::v8i16, 1 },
2993 { ISD::SADDSAT, MVT::v16i8, 1 },
2994 { ISD::SMAX, MVT::v8i16, 1 },
2995 { ISD::SMIN, MVT::v8i16, 1 },
2996 { ISD::SSUBSAT, MVT::v8i16, 1 },
2997 { ISD::SSUBSAT, MVT::v16i8, 1 },
2998 { ISD::UADDSAT, MVT::v8i16, 1 },
2999 { ISD::UADDSAT, MVT::v16i8, 1 },
3000 { ISD::UMAX, MVT::v8i16, 2 },
3001 { ISD::UMAX, MVT::v16i8, 1 },
3002 { ISD::UMIN, MVT::v8i16, 2 },
3003 { ISD::UMIN, MVT::v16i8, 1 },
3004 { ISD::USUBSAT, MVT::v8i16, 1 },
3005 { ISD::USUBSAT, MVT::v16i8, 1 },
3006 { ISD::FMAXNUM, MVT::f64, 4 },
3007 { ISD::FMAXNUM, MVT::v2f64, 4 },
3008 { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
3009 { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
3010 };
3011 static const CostTblEntry SSE1CostTbl[] = {
3012 { ISD::FMAXNUM, MVT::f32, 4 },
3013 { ISD::FMAXNUM, MVT::v4f32, 4 },
3014 { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
3015 { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
3016 };
3017 static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
3018 { ISD::CTTZ, MVT::i64, 1 },
3019 };
3020 static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3021 { ISD::CTTZ, MVT::i32, 1 },
3022 { ISD::CTTZ, MVT::i16, 1 },
3023 { ISD::CTTZ, MVT::i8, 1 },
3024 };
3025 static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3026 { ISD::CTLZ, MVT::i64, 1 },
3027 };
3028 static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3029 { ISD::CTLZ, MVT::i32, 1 },
3030 { ISD::CTLZ, MVT::i16, 1 },
3031 { ISD::CTLZ, MVT::i8, 1 },
3032 };
3033 static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
3034 { ISD::CTPOP, MVT::i64, 1 },
3035 };
3036 static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
3037 { ISD::CTPOP, MVT::i32, 1 },
3038 { ISD::CTPOP, MVT::i16, 1 },
3039 { ISD::CTPOP, MVT::i8, 1 },
3040 };
3041 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3042 { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
3043 { ISD::BITREVERSE, MVT::i64, 14 },
3044 { ISD::BSWAP, MVT::i64, 1 },
3045 { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
3046 { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
3047 { ISD::CTPOP, MVT::i64, 10 },
3048 { ISD::SADDO, MVT::i64, 1 },
3049 { ISD::UADDO, MVT::i64, 1 },
3050 { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
3051 };
3052 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3053 { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
3054 { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
3055 { ISD::BITREVERSE, MVT::i32, 14 },
3056 { ISD::BITREVERSE, MVT::i16, 14 },
3057 { ISD::BITREVERSE, MVT::i8, 11 },
3058 { ISD::BSWAP, MVT::i32, 1 },
3059 { ISD::BSWAP, MVT::i16, 1 }, // ROL
3060 { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
3061 { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
3062 { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
3063 { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
3064 { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
3065 { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
3066 { ISD::CTPOP, MVT::i32, 8 },
3067 { ISD::CTPOP, MVT::i16, 9 },
3068 { ISD::CTPOP, MVT::i8, 7 },
3069 { ISD::SADDO, MVT::i32, 1 },
3070 { ISD::SADDO, MVT::i16, 1 },
3071 { ISD::SADDO, MVT::i8, 1 },
3072 { ISD::UADDO, MVT::i32, 1 },
3073 { ISD::UADDO, MVT::i16, 1 },
3074 { ISD::UADDO, MVT::i8, 1 },
3075 { ISD::UMULO, MVT::i32, 2 }, // mul + seto
3076 { ISD::UMULO, MVT::i16, 2 },
3077 { ISD::UMULO, MVT::i8, 2 },
3078 };
3079
3080 Type *RetTy = ICA.getReturnType();
3081 Type *OpTy = RetTy;
3082 Intrinsic::ID IID = ICA.getID();
3083 unsigned ISD = ISD::DELETED_NODE;
3084 switch (IID) {
3085 default:
3086 break;
3087 case Intrinsic::abs:
3088 ISD = ISD::ABS;
3089 break;
3090 case Intrinsic::bitreverse:
3091 ISD = ISD::BITREVERSE;
3092 break;
3093 case Intrinsic::bswap:
3094 ISD = ISD::BSWAP;
3095 break;
3096 case Intrinsic::ctlz:
3097 ISD = ISD::CTLZ;
3098 break;
3099 case Intrinsic::ctpop:
3100 ISD = ISD::CTPOP;
3101 break;
3102 case Intrinsic::cttz:
3103 ISD = ISD::CTTZ;
3104 break;
3105 case Intrinsic::maxnum:
3106 case Intrinsic::minnum:
3107 // FMINNUM has same costs so don't duplicate.
3108 ISD = ISD::FMAXNUM;
3109 break;
3110 case Intrinsic::sadd_sat:
3111 ISD = ISD::SADDSAT;
3112 break;
3113 case Intrinsic::smax:
3114 ISD = ISD::SMAX;
3115 break;
3116 case Intrinsic::smin:
3117 ISD = ISD::SMIN;
3118 break;
3119 case Intrinsic::ssub_sat:
3120 ISD = ISD::SSUBSAT;
3121 break;
3122 case Intrinsic::uadd_sat:
3123 ISD = ISD::UADDSAT;
3124 break;
3125 case Intrinsic::umax:
3126 ISD = ISD::UMAX;
3127 break;
3128 case Intrinsic::umin:
3129 ISD = ISD::UMIN;
3130 break;
3131 case Intrinsic::usub_sat:
3132 ISD = ISD::USUBSAT;
3133 break;
3134 case Intrinsic::sqrt:
3135 ISD = ISD::FSQRT;
3136 break;
3137 case Intrinsic::sadd_with_overflow:
3138 case Intrinsic::ssub_with_overflow:
3139 // SSUBO has same costs so don't duplicate.
3140 ISD = ISD::SADDO;
3141 OpTy = RetTy->getContainedType(0);
3142 break;
3143 case Intrinsic::uadd_with_overflow:
3144 case Intrinsic::usub_with_overflow:
3145 // USUBO has same costs so don't duplicate.
3146 ISD = ISD::UADDO;
3147 OpTy = RetTy->getContainedType(0);
3148 break;
3149 case Intrinsic::umul_with_overflow:
3150 case Intrinsic::smul_with_overflow:
3151 // SMULO has same costs so don't duplicate.
3152 ISD = ISD::UMULO;
3153 OpTy = RetTy->getContainedType(0);
3154 break;
3155 }
3156
3157 if (ISD != ISD::DELETED_NODE) {
3158 // Legalize the type.
3159 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
3160 MVT MTy = LT.second;
3161
3162 // Attempt to lookup cost.
3163 if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
3164 MTy.isVector()) {
3165 // With PSHUFB the code is very similar for all types. If we have integer
3166 // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
3167 // we also need a PSHUFB.
3168 unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
3169
3170 // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
3171 // instructions. We also need an extract and an insert.
3172 if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
3173 (ST->hasBWI() && MTy.is512BitVector())))
3174 Cost = Cost * 2 + 2;
3175
3176 return LT.first * Cost;
3177 }
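    // Added illustrative note for the GFNI path above: a legal vXi8 type needs
    // a single GF2P8AFFINEQB (Cost = 1), other element types also need a
    // PSHUFB (Cost = 2), and an illegal width (e.g. 256-bit without AVX2)
    // doubles the op count and pays an extract + insert: Cost = Cost * 2 + 2.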
3178
3179 auto adjustTableCost = [](const CostTblEntry &Entry,
3180 InstructionCost LegalizationCost,
3181 FastMathFlags FMF) {
3182       // If there are no NaNs to deal with, then these are reduced to a
3183 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
3184 // assume is used in the non-fast case.
3185 if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
3186 if (FMF.noNaNs())
3187 return LegalizationCost * 1;
3188 }
3189 return LegalizationCost * (int)Entry.Cost;
3190 };
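    // Added illustrative note on adjustTableCost: an FMAXNUM table entry of
    // cost 3 (MAXPS + CMPUNORDPS + BLENDVPS) collapses to LegalizationCost * 1
    // when the intrinsic carries 'nnan' fast-math flags, since a lone MAXPS
    // suffices without the NaN handling.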
3191
3192 if (ST->useGLMDivSqrtCosts())
3193 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
3194 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3195
3196 if (ST->isSLM())
3197 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3198 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3199
3200 if (ST->hasBITALG())
3201 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
3202 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3203
3204 if (ST->hasVPOPCNTDQ())
3205 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
3206 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3207
3208 if (ST->hasCDI())
3209 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
3210 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3211
3212 if (ST->hasBWI())
3213 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3214 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3215
3216 if (ST->hasAVX512())
3217 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3218 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3219
3220 if (ST->hasXOP())
3221 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3222 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3223
3224 if (ST->hasAVX2())
3225 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3226 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3227
3228 if (ST->hasAVX())
3229 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3230 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3231
3232 if (ST->hasSSE42())
3233 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3234 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3235
3236 if (ST->hasSSE41())
3237 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3238 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3239
3240 if (ST->hasSSSE3())
3241 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
3242 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3243
3244 if (ST->hasSSE2())
3245 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3246 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3247
3248 if (ST->hasSSE1())
3249 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3250 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3251
3252 if (ST->hasBMI()) {
3253 if (ST->is64Bit())
3254 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
3255 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3256
3257 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
3258 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3259 }
3260
3261 if (ST->hasLZCNT()) {
3262 if (ST->is64Bit())
3263 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
3264 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3265
3266 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
3267 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3268 }
3269
3270 if (ST->hasPOPCNT()) {
3271 if (ST->is64Bit())
3272 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
3273 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3274
3275 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
3276 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3277 }
3278
3279 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
3280 if (const Instruction *II = ICA.getInst()) {
3281 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
3282 return TTI::TCC_Free;
3283 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
3284 if (LI->hasOneUse())
3285 return TTI::TCC_Free;
3286 }
3287 }
3288 }
3289
3290 // TODO - add BMI (TZCNT) scalar handling
3291
3292 if (ST->is64Bit())
3293 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3294 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3295
3296 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3297 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3298 }
3299
3300 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3301 }
3302
3303 InstructionCost
3304 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3305 TTI::TargetCostKind CostKind) {
3306 if (ICA.isTypeBasedOnly())
3307 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
3308
3309 static const CostTblEntry AVX512CostTbl[] = {
3310 { ISD::ROTL, MVT::v8i64, 1 },
3311 { ISD::ROTL, MVT::v4i64, 1 },
3312 { ISD::ROTL, MVT::v2i64, 1 },
3313 { ISD::ROTL, MVT::v16i32, 1 },
3314 { ISD::ROTL, MVT::v8i32, 1 },
3315 { ISD::ROTL, MVT::v4i32, 1 },
3316 { ISD::ROTR, MVT::v8i64, 1 },
3317 { ISD::ROTR, MVT::v4i64, 1 },
3318 { ISD::ROTR, MVT::v2i64, 1 },
3319 { ISD::ROTR, MVT::v16i32, 1 },
3320 { ISD::ROTR, MVT::v8i32, 1 },
3321 { ISD::ROTR, MVT::v4i32, 1 }
3322 };
3323 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3324 static const CostTblEntry XOPCostTbl[] = {
3325 { ISD::ROTL, MVT::v4i64, 4 },
3326 { ISD::ROTL, MVT::v8i32, 4 },
3327 { ISD::ROTL, MVT::v16i16, 4 },
3328 { ISD::ROTL, MVT::v32i8, 4 },
3329 { ISD::ROTL, MVT::v2i64, 1 },
3330 { ISD::ROTL, MVT::v4i32, 1 },
3331 { ISD::ROTL, MVT::v8i16, 1 },
3332 { ISD::ROTL, MVT::v16i8, 1 },
3333 { ISD::ROTR, MVT::v4i64, 6 },
3334 { ISD::ROTR, MVT::v8i32, 6 },
3335 { ISD::ROTR, MVT::v16i16, 6 },
3336 { ISD::ROTR, MVT::v32i8, 6 },
3337 { ISD::ROTR, MVT::v2i64, 2 },
3338 { ISD::ROTR, MVT::v4i32, 2 },
3339 { ISD::ROTR, MVT::v8i16, 2 },
3340 { ISD::ROTR, MVT::v16i8, 2 }
3341 };
3342 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3343 { ISD::ROTL, MVT::i64, 1 },
3344 { ISD::ROTR, MVT::i64, 1 },
3345 { ISD::FSHL, MVT::i64, 4 }
3346 };
3347 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3348 { ISD::ROTL, MVT::i32, 1 },
3349 { ISD::ROTL, MVT::i16, 1 },
3350 { ISD::ROTL, MVT::i8, 1 },
3351 { ISD::ROTR, MVT::i32, 1 },
3352 { ISD::ROTR, MVT::i16, 1 },
3353 { ISD::ROTR, MVT::i8, 1 },
3354 { ISD::FSHL, MVT::i32, 4 },
3355 { ISD::FSHL, MVT::i16, 4 },
3356 { ISD::FSHL, MVT::i8, 4 }
3357 };
3358
3359 Intrinsic::ID IID = ICA.getID();
3360 Type *RetTy = ICA.getReturnType();
3361 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
3362 unsigned ISD = ISD::DELETED_NODE;
3363 switch (IID) {
3364 default:
3365 break;
3366 case Intrinsic::fshl:
3367 ISD = ISD::FSHL;
3368 if (Args[0] == Args[1])
3369 ISD = ISD::ROTL;
3370 break;
3371 case Intrinsic::fshr:
3372 // FSHR has same costs so don't duplicate.
3373 ISD = ISD::FSHL;
3374 if (Args[0] == Args[1])
3375 ISD = ISD::ROTR;
3376 break;
3377 }
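  // Added illustrative note: a funnel shift whose operands match is a rotate,
  // e.g. fshl(x, x, c) == rotl(x, c), which is why Args[0] == Args[1] remaps
  // FSHL/FSHR to ROTL/ROTR before the table lookups below.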
3378
3379 if (ISD != ISD::DELETED_NODE) {
3380 // Legalize the type.
3381 std::pair<InstructionCost, MVT> LT =
3382 TLI->getTypeLegalizationCost(DL, RetTy);
3383 MVT MTy = LT.second;
3384
3385 // Attempt to lookup cost.
3386 if (ST->hasAVX512())
3387 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3388 return LT.first * Entry->Cost;
3389
3390 if (ST->hasXOP())
3391 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3392 return LT.first * Entry->Cost;
3393
3394 if (ST->is64Bit())
3395 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3396 return LT.first * Entry->Cost;
3397
3398 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3399 return LT.first * Entry->Cost;
3400 }
3401
3402 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3403 }
3404
3405 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3406 unsigned Index) {
3407 static const CostTblEntry SLMCostTbl[] = {
3408 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
3409 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
3410 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
3411 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
3412 };
3413
3414 assert(Val->isVectorTy() && "This must be a vector type");
3415 Type *ScalarType = Val->getScalarType();
3416 int RegisterFileMoveCost = 0;
3417
3418 // Non-immediate extraction/insertion can be handled as a sequence of
3419 // aliased loads+stores via the stack.
3420 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
3421 Opcode == Instruction::InsertElement)) {
3422 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
3423 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
3424
3425 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
3426 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
3427 Align VecAlign = DL.getPrefTypeAlign(Val);
3428 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
3429
3430 // Extract - store vector to stack, load scalar.
3431 if (Opcode == Instruction::ExtractElement) {
3432 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
3433 TTI::TargetCostKind::TCK_RecipThroughput) +
3434 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
3435 TTI::TargetCostKind::TCK_RecipThroughput);
3436 }
3437 // Insert - store vector to stack, store scalar, load vector.
3438 if (Opcode == Instruction::InsertElement) {
3439 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
3440 TTI::TargetCostKind::TCK_RecipThroughput) +
3441 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
3442 TTI::TargetCostKind::TCK_RecipThroughput) +
3443 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0,
3444 TTI::TargetCostKind::TCK_RecipThroughput);
3445 }
3446 }
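  // Added illustrative note (the unit costs here are assumptions, not
  // measurements): a variable-index extract from a v4f32 is modeled as one
  // 16-byte vector store plus one 4-byte scalar reload through the stack,
  // roughly 1 + 1 = 2 reciprocal-throughput units.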
3447
3448 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
3449 Opcode == Instruction::InsertElement)) {
3450 // Legalize the type.
3451 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
3452
3453 // This type is legalized to a scalar type.
3454 if (!LT.second.isVector())
3455 return 0;
3456
3457 // The type may be split. Normalize the index to the new type.
3458 unsigned NumElts = LT.second.getVectorNumElements();
3459 unsigned SubNumElts = NumElts;
3460 Index = Index % NumElts;
3461
3462 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
3463 // For inserts, we also need to insert the subvector back.
3464 if (LT.second.getSizeInBits() > 128) {
3465 assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
3466 unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
3467 SubNumElts = NumElts / NumSubVecs;
3468 if (SubNumElts <= Index) {
3469 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
3470 Index %= SubNumElts;
3471 }
3472 }
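    // Added illustrative note: extracting index 3 from a v4i64 on AVX2
    // (256-bit, NumSubVecs = 2, SubNumElts = 2) must first extract the upper
    // 128-bit subvector, so RegisterFileMoveCost += 1 and Index becomes
    // 3 % 2 = 1.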
3473
3474 if (Index == 0) {
3475 // Floating point scalars are already located in index #0.
3476       // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
3477       // it's true for all.
3478 if (ScalarType->isFloatingPointTy())
3479 return RegisterFileMoveCost;
3480
3481 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
3482 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
3483 return 1 + RegisterFileMoveCost;
3484 }
3485
3486 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3487 assert(ISD && "Unexpected vector opcode");
3488 MVT MScalarTy = LT.second.getScalarType();
3489 if (ST->isSLM())
3490 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
3491 return Entry->Cost + RegisterFileMoveCost;
3492
3493 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
3494 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3495 (MScalarTy.isInteger() && ST->hasSSE41()))
3496 return 1 + RegisterFileMoveCost;
3497
3498 // Assume insertps is relatively cheap on all targets.
3499 if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
3500 Opcode == Instruction::InsertElement)
3501 return 1 + RegisterFileMoveCost;
3502
3503 // For extractions we just need to shuffle the element to index 0, which
3504 // should be very cheap (assume cost = 1). For insertions we need to shuffle
3505 // the elements to its destination. In both cases we must handle the
3506 // subvector move(s).
3507 // If the vector type is already less than 128-bits then don't reduce it.
3508 // TODO: Under what circumstances should we shuffle using the full width?
3509 InstructionCost ShuffleCost = 1;
3510 if (Opcode == Instruction::InsertElement) {
3511 auto *SubTy = cast<VectorType>(Val);
3512 EVT VT = TLI->getValueType(DL, Val);
3513 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
3514 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
3515 ShuffleCost =
3516 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
3517 }
3518 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
3519 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3520 }
3521
3522 // Add to the base cost if we know that the extracted element of a vector is
3523 // destined to be moved to and used in the integer register file.
3524 if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3525 RegisterFileMoveCost += 1;
3526
3527 return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3528 }
3529
3530 InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3531 const APInt &DemandedElts,
3532 bool Insert,
3533 bool Extract) {
3534 InstructionCost Cost = 0;
3535
3536   // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
3537 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3538 if (Insert) {
3539 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3540 MVT MScalarTy = LT.second.getScalarType();
3541
3542 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3543 (MScalarTy.isInteger() && ST->hasSSE41()) ||
3544 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3545       // For types we can insert directly, insertion into 128-bit subvectors is
3546 // cheap, followed by a cheap chain of concatenations.
3547 if (LT.second.getSizeInBits() <= 128) {
3548 Cost +=
3549 BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3550 } else {
3551         // In each 128-bit lane, if at least one index is demanded but not all
3552         // indices are demanded, and this 128-bit lane is not the first lane of
3553         // the legalized vector, then this lane needs an extracti128. If a
3554         // 128-bit lane has at least one demanded index, it also needs an
3555         // inserti128.
3556
3557         // The following cases illustrate this. Assume we insert several
3558         // elements into a v8i32 vector on AVX2:
3559         // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
3560         // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
3561         // inserti128.
3562         // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
3563 const int CostValue = *LT.first.getValue();
3564 assert(CostValue >= 0 && "Negative cost!");
3565 unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
3566 unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
3567 APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
3568 unsigned Scale = NumElts / Num128Lanes;
3569 // We iterate each 128-lane, and check if we need a
3570 // extracti128/inserti128 for this 128-lane.
3571 for (unsigned I = 0; I < NumElts; I += Scale) {
3572 APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
3573 APInt MaskedDE = Mask & WidenedDemandedElts;
3574 unsigned Population = MaskedDE.countPopulation();
3575 Cost += (Population > 0 && Population != Scale &&
3576 I % LT.second.getVectorNumElements() != 0);
3577 Cost += Population > 0;
3578 }
3579 Cost += DemandedElts.countPopulation();
3580
3581 // For vXf32 cases, insertion into the 0'th index in each v4f32
3582 // 128-bit vector is free.
3583 // NOTE: This assumes legalization widens vXf32 vectors.
3584 if (MScalarTy == MVT::f32)
3585 for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
3586 i < e; i += 4)
3587 if (DemandedElts[i])
3588 Cost--;
3589 }
3590 } else if (LT.second.isVector()) {
3591 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
3592 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
3593 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
3594 // considered cheap.
3595 if (Ty->isIntOrIntVectorTy())
3596 Cost += DemandedElts.countPopulation();
3597
3598 // Get the smaller of the legalized or original pow2-extended number of
3599 // vector elements, which represents the number of unpacks we'll end up
3600 // performing.
3601 unsigned NumElts = LT.second.getVectorNumElements();
3602 unsigned Pow2Elts =
3603 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
3604 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
3605 }
3606 }
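  // Added illustrative note: building a fully-demanded v8i32 on AVX2 via the
  // lane walk above costs 8 element inserts plus an inserti128 per demanded
  // 128-bit lane (2 here), i.e. 10 in total.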
3607
3608 // TODO: Use default extraction for now, but we should investigate extending this
3609 // to handle repeated subvector extraction.
3610 if (Extract)
3611 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
3612
3613 return Cost;
3614 }
3615
3616 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
3617 MaybeAlign Alignment,
3618 unsigned AddressSpace,
3619 TTI::TargetCostKind CostKind,
3620 const Instruction *I) {
3621 // TODO: Handle other cost kinds.
3622 if (CostKind != TTI::TCK_RecipThroughput) {
3623 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
3624 // Store instruction with index and scale costs 2 Uops.
3625 // Check the preceding GEP to identify non-const indices.
3626 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
3627 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3628 return TTI::TCC_Basic * 2;
3629 }
3630 }
3631 return TTI::TCC_Basic;
3632 }
3633
3634 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3635 "Invalid Opcode");
3636 // Type legalization can't handle structs
3637 if (TLI->getValueType(DL, Src, true) == MVT::Other)
3638 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3639 CostKind);
3640
3641 // Legalize the type.
3642 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
3643
3644 auto *VTy = dyn_cast<FixedVectorType>(Src);
3645
3646 // Handle the simple case of non-vectors.
3647   // NOTE: this assumes that legalization never creates a vector from scalars!
3648 if (!VTy || !LT.second.isVector())
3649 // Each load/store unit costs 1.
3650 return LT.first * 1;
3651
3652 bool IsLoad = Opcode == Instruction::Load;
3653
3654 Type *EltTy = VTy->getElementType();
3655
3656 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
3657
3658 InstructionCost Cost = 0;
3659
3660 // Source of truth: how many elements were there in the original IR vector?
3661 const unsigned SrcNumElt = VTy->getNumElements();
3662
3663 // How far have we gotten?
3664 int NumEltRemaining = SrcNumElt;
3665   // Note that we intentionally capture by reference, as NumEltRemaining changes.
3666 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
3667
3668 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
3669
3670 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
3671 const unsigned XMMBits = 128;
3672 if (XMMBits % EltTyBits != 0)
3673 // Vector size must be a multiple of the element size. I.e. no padding.
3674 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3675 CostKind);
3676 const int NumEltPerXMM = XMMBits / EltTyBits;
3677
3678 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
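  // Added illustrative walk (assuming an unaligned v3i32 load on SSE2): the
  // loop below rejects the 16-byte op (only 3 elements remain), covers two
  // elements with one 8-byte op (free, as it's the 0'th subreg), then the last
  // element with a 4-byte op that additionally pays a scalarization insert
  // because it lands at a non-zero offset within the XMM register.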
3679
3680 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
3681 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
3682 // How many elements would a single op deal with at once?
3683 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
3684 // Vector size must be a multiple of the element size. I.e. no padding.
3685 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3686 CostKind);
3687 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
3688
3689 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
3690 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
3691 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
3692 "Unless we haven't halved the op size yet, "
3693 "we have less than two op's sized units of work left.");
3694
3695 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
3696 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
3697 : XMMVecTy;
3698
3699 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
3700 "After halving sizes, the vector elt count is no longer a multiple "
3701 "of number of elements per operation?");
3702 auto *CoalescedVecTy =
3703 CurrNumEltPerOp == 1
3704 ? CurrVecTy
3705 : FixedVectorType::get(
3706 IntegerType::get(Src->getContext(),
3707 EltTyBits * CurrNumEltPerOp),
3708 CurrVecTy->getNumElements() / CurrNumEltPerOp);
3709 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
3710 DL.getTypeSizeInBits(CurrVecTy) &&
3711 "coalesciing elements doesn't change vector width.");
3712
3713 while (NumEltRemaining > 0) {
3714       assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
3715
3716 // Can we use this vector size, as per the remaining element count?
3717 // Iff the vector is naturally aligned, we can do a wide load regardless.
3718 if (NumEltRemaining < CurrNumEltPerOp &&
3719 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
3720 CurrOpSizeBytes != 1)
3721         break; // Try a smaller vector size.
3722
3723 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
3724
3725 // If we have fully processed the previous reg, we need to replenish it.
3726 if (SubVecEltsLeft == 0) {
3727 SubVecEltsLeft += CurrVecTy->getNumElements();
3728 // And that's free only for the 0'th subvector of a legalized vector.
3729 if (!Is0thSubVec)
3730 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
3731 : TTI::ShuffleKind::SK_ExtractSubvector,
3732 VTy, None, NumEltDone(), CurrVecTy);
3733 }
3734
3735 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
3736 // for smaller widths (32/16/8) we have to insert/extract them separately.
3737 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
3738 // but let's pretend that it is also true for 16/8 bit wide ops...)
3739 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
3740 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
3741 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
3742 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
3743 APInt DemandedElts =
3744 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
3745 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
3746 assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
3747 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
3748 !IsLoad);
3749 }
3750
3751 // This isn't exactly right. We're using slow unaligned 32-byte accesses
3752 // as a proxy for a double-pumped AVX memory interface such as on
3753 // Sandybridge.
3754 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
3755 Cost += 2;
3756 else
3757 Cost += 1;
3758
3759 SubVecEltsLeft -= CurrNumEltPerOp;
3760 NumEltRemaining -= CurrNumEltPerOp;
3761 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
3762 }
3763 }
3764
3765 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
3766
3767 return Cost;
3768 }
3769
3770 InstructionCost
3771 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
3772 unsigned AddressSpace,
3773 TTI::TargetCostKind CostKind) {
3774 bool IsLoad = (Instruction::Load == Opcode);
3775 bool IsStore = (Instruction::Store == Opcode);
3776
3777 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
3778 if (!SrcVTy)
3779     // For a scalar, take the regular cost without the mask.
3780 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
3781
3782 unsigned NumElem = SrcVTy->getNumElements();
3783 auto *MaskTy =
3784 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
3785 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
3786 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
3787 // Scalarization
3788 APInt DemandedElts = APInt::getAllOnes(NumElem);
3789 InstructionCost MaskSplitCost =
3790 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
3791 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
3792 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
3793 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3794 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
3795 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
3796 InstructionCost ValueSplitCost =
3797 getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
3798 InstructionCost MemopCost =
3799 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3800 Alignment, AddressSpace, CostKind);
3801 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
3802 }
3803
3804 // Legalize the type.
3805 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3806 auto VT = TLI->getValueType(DL, SrcVTy);
3807 InstructionCost Cost = 0;
3808 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
3809 LT.second.getVectorNumElements() == NumElem)
3810     // Promotion requires extend/truncate for the data and a shuffle for the mask.
3811 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
3812 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
3813
3814 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
3815 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
3816 LT.second.getVectorNumElements());
3817     // Expanding requires filling the mask with zeroes.
3818 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
3819 }
3820
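  // Added illustrative note: a legal v8f32 masked load on AVX (LT.first == 1,
  // no AVX512) falls through to the maskmov estimate below and costs Cost + 2.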
3821   // Pre-AVX512 - each maskmov load costs ~2 and each maskmov store costs ~8.
3822 if (!ST->hasAVX512())
3823 return Cost + LT.first * (IsLoad ? 2 : 8);
3824
3825   // AVX-512 masked load/store is cheaper.
3826 return Cost + LT.first;
3827 }
3828
3829 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
3830 ScalarEvolution *SE,
3831 const SCEV *Ptr) {
3832 // Address computations in vectorized code with non-consecutive addresses will
3833 // likely result in more instructions compared to scalar code where the
3834 // computation can more often be merged into the index mode. The resulting
3835 // extra micro-ops can significantly decrease throughput.
3836 const unsigned NumVectorInstToHideOverhead = 10;
3837
3838 // Cost modeling of Strided Access Computation is hidden by the indexing
3839   // modes of X86 regardless of the stride value. We don't believe that there
3840   // is a difference between constant-strided access in general and a constant
3841   // stride value which is less than or equal to 64.
3842 // Even in the case of (loop invariant) stride whose value is not known at
3843 // compile time, the address computation will not incur more than one extra
3844 // ADD instruction.
3845 if (Ty->isVectorTy() && SE) {
3846 if (!BaseT::isStridedAccess(Ptr))
3847 return NumVectorInstToHideOverhead;
3848 if (!BaseT::getConstantStrideStep(SE, Ptr))
3849 return 1;
3850 }
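  // Added illustrative note: non-strided vector addresses return 10 (the
  // overhead-hiding threshold above), a loop-invariant but non-constant stride
  // returns 1 for the single extra ADD, and a known constant stride falls
  // through to the base implementation.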
3851
3852 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
3853 }
3854
3855 InstructionCost
3856 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3857 Optional<FastMathFlags> FMF,
3858 TTI::TargetCostKind CostKind) {
3859 if (TTI::requiresOrderedReduction(FMF))
3860 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3861
3862   // We use the Intel Architecture Code Analyzer (IACA) to measure the
3863   // throughput and use that as the cost.
3864
3865 static const CostTblEntry SLMCostTblNoPairWise[] = {
3866 { ISD::FADD, MVT::v2f64, 3 },
3867 { ISD::ADD, MVT::v2i64, 5 },
3868 };
3869
3870 static const CostTblEntry SSE2CostTblNoPairWise[] = {
3871 { ISD::FADD, MVT::v2f64, 2 },
3872 { ISD::FADD, MVT::v2f32, 2 },
3873 { ISD::FADD, MVT::v4f32, 4 },
3874 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
3875 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
3876 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
3877 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
3878 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
3879 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
3880 { ISD::ADD, MVT::v2i8, 2 },
3881 { ISD::ADD, MVT::v4i8, 2 },
3882 { ISD::ADD, MVT::v8i8, 2 },
3883 { ISD::ADD, MVT::v16i8, 3 },
3884 };
3885
3886 static const CostTblEntry AVX1CostTblNoPairWise[] = {
3887 { ISD::FADD, MVT::v4f64, 3 },
3888 { ISD::FADD, MVT::v4f32, 3 },
3889 { ISD::FADD, MVT::v8f32, 4 },
3890 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
3891 { ISD::ADD, MVT::v4i64, 3 },
3892 { ISD::ADD, MVT::v8i32, 5 },
3893 { ISD::ADD, MVT::v16i16, 5 },
3894 { ISD::ADD, MVT::v32i8, 4 },
3895 };
3896
3897 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3898 assert(ISD && "Invalid opcode");
3899
3900 // Before legalizing the type, give a chance to look up illegal narrow types
3901 // in the table.
3902 // FIXME: Is there a better way to do this?
3903 EVT VT = TLI->getValueType(DL, ValTy);
3904 if (VT.isSimple()) {
3905 MVT MTy = VT.getSimpleVT();
3906 if (ST->isSLM())
3907 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3908 return Entry->Cost;
3909
3910 if (ST->hasAVX())
3911 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3912 return Entry->Cost;
3913
3914 if (ST->hasSSE2())
3915 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3916 return Entry->Cost;
3917 }
3918
3919 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3920
3921 MVT MTy = LT.second;
3922
3923 auto *ValVTy = cast<FixedVectorType>(ValTy);
3924
3925 // Special case: vXi8 mul reductions are performed as vXi16.
3926 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
3927 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
3928 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
3929 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
3930 TargetTransformInfo::CastContextHint::None,
3931 CostKind) +
3932 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
3933 }
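
// Worked example of the special case above (illustrative): for
// ValTy == <16 x i8> and Opcode == Mul, WideVecTy becomes <16 x i16> and
// the returned cost is
//   Cost(zext <16 x i8> to <16 x i16>) + Cost(mul reduction of <16 x i16>).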
3934
3935 InstructionCost ArithmeticCost = 0;
3936 if (LT.first != 1 && MTy.isVector() &&
3937 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3938 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3939 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3940 MTy.getVectorNumElements());
3941 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3942 ArithmeticCost *= LT.first - 1;
3943 }
3944
3945 if (ST->isSLM())
3946 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3947 return ArithmeticCost + Entry->Cost;
3948
3949 if (ST->hasAVX())
3950 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3951 return ArithmeticCost + Entry->Cost;
3952
3953 if (ST->hasSSE2())
3954 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3955 return ArithmeticCost + Entry->Cost;
3956
3957 // FIXME: These assume a naive kshift+binop lowering, which is probably
3958 // conservative in most cases.
3959 static const CostTblEntry AVX512BoolReduction[] = {
3960 { ISD::AND, MVT::v2i1, 3 },
3961 { ISD::AND, MVT::v4i1, 5 },
3962 { ISD::AND, MVT::v8i1, 7 },
3963 { ISD::AND, MVT::v16i1, 9 },
3964 { ISD::AND, MVT::v32i1, 11 },
3965 { ISD::AND, MVT::v64i1, 13 },
3966 { ISD::OR, MVT::v2i1, 3 },
3967 { ISD::OR, MVT::v4i1, 5 },
3968 { ISD::OR, MVT::v8i1, 7 },
3969 { ISD::OR, MVT::v16i1, 9 },
3970 { ISD::OR, MVT::v32i1, 11 },
3971 { ISD::OR, MVT::v64i1, 13 },
3972 };
3973
3974 static const CostTblEntry AVX2BoolReduction[] = {
3975 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
3976 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
3977 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
3978 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
3979 };
3980
3981 static const CostTblEntry AVX1BoolReduction[] = {
3982 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
3983 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
3984 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3985 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3986 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
3987 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
3988 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3989 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3990 };
3991
3992 static const CostTblEntry SSE2BoolReduction[] = {
3993 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
3994 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
3995 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
3996 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
3997 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
3998 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
3999 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
4000 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
4001 };
4002
4003 // Handle bool allof/anyof patterns.
4004 if (ValVTy->getElementType()->isIntegerTy(1)) {
4005 InstructionCost ArithmeticCost = 0;
4006 if (LT.first != 1 && MTy.isVector() &&
4007 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4008 // Type needs to be split. We need LT.first - 1 arithmetic ops.
4009 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
4010 MTy.getVectorNumElements());
4011 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
4012 ArithmeticCost *= LT.first - 1;
4013 }
4014
4015 if (ST->hasAVX512())
4016 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
4017 return ArithmeticCost + Entry->Cost;
4018 if (ST->hasAVX2())
4019 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
4020 return ArithmeticCost + Entry->Cost;
4021 if (ST->hasAVX())
4022 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
4023 return ArithmeticCost + Entry->Cost;
4024 if (ST->hasSSE2())
4025 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
4026 return ArithmeticCost + Entry->Cost;
4027
4028 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
4029 }
4030
4031 unsigned NumVecElts = ValVTy->getNumElements();
4032 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
4033
4034 // Only special case power-of-2 reductions where the scalar type isn't
4035 // changed by type legalization; defer everything else to the base cost.
4036 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
4037 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
4038
4039 InstructionCost ReductionCost = 0;
4040
4041 auto *Ty = ValVTy;
4042 if (LT.first != 1 && MTy.isVector() &&
4043 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4044 // Type needs to be split. We need LT.first - 1 arithmetic ops.
4045 Ty = FixedVectorType::get(ValVTy->getElementType(),
4046 MTy.getVectorNumElements());
4047 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4048 ReductionCost *= LT.first - 1;
4049 NumVecElts = MTy.getVectorNumElements();
4050 }
4051
4052 // Now handle reduction with the legal type, taking into account size changes
4053 // at each level.
4054 while (NumVecElts > 1) {
4055 // Determine the size of the remaining vector we need to reduce.
4056 unsigned Size = NumVecElts * ScalarSize;
4057 NumVecElts /= 2;
4058 // If we're reducing from 256/512 bits, use an extract_subvector.
4059 if (Size > 128) {
4060 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
4061 ReductionCost +=
4062 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
4063 Ty = SubTy;
4064 } else if (Size == 128) {
4065 // Reducing from 128 bits is a permute of v2f64/v2i64.
4066 FixedVectorType *ShufTy;
4067 if (ValVTy->getElementType()->isFloatingPointTy())
4068 ShufTy =
4069 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
4070 else
4071 ShufTy =
4072 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
4073 ReductionCost +=
4074 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4075 } else if (Size == 64) {
4076 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
4077 FixedVectorType *ShufTy;
4078 if (ValVTy->getElementType()->isFloatingPointTy())
4079 ShufTy =
4080 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
4081 else
4082 ShufTy =
4083 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
4084 ReductionCost +=
4085 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4086 } else {
4087 // Reducing from smaller size is a shift by immediate.
4088 auto *ShiftTy = FixedVectorType::get(
4089 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
4090 ReductionCost += getArithmeticInstrCost(
4091 Instruction::LShr, ShiftTy, CostKind,
4092 TargetTransformInfo::OK_AnyValue,
4093 TargetTransformInfo::OK_UniformConstantValue,
4094 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
4095 }
4096
4097 // Add the arithmetic op for this level.
4098 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
4099 }
4100
4101 // Add the final extract element to the cost.
4102 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
4103 }
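
// Worked example of the halving loop above (illustrative; assumes an AVX2
// target, Opcode == Mul and ValTy == <8 x i32>, which is already legal and
// has no table entry, so the loop runs):
//   NumVecElts 8 -> 4 : Size 256 > 128, extract_subvector + v4i32 mul
//   NumVecElts 4 -> 2 : Size 128, v2i64 permute + v4i32 mul
//   NumVecElts 2 -> 1 : Size 64, v4i32 shuffle + v4i32 mul
// plus the final extractelement of lane 0.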
4104
4105 InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
4106 bool IsUnsigned) {
4107 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
4108
4109 MVT MTy = LT.second;
4110
4111 int ISD;
4112 if (Ty->isIntOrIntVectorTy()) {
4113 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
4114 } else {
4115 assert(Ty->isFPOrFPVectorTy() &&
4116 "Expected float point or integer vector type.");
4117 ISD = ISD::FMINNUM;
4118 }
4119
4120 static const CostTblEntry SSE1CostTbl[] = {
4121 {ISD::FMINNUM, MVT::v4f32, 1},
4122 };
4123
4124 static const CostTblEntry SSE2CostTbl[] = {
4125 {ISD::FMINNUM, MVT::v2f64, 1},
4126 {ISD::SMIN, MVT::v8i16, 1},
4127 {ISD::UMIN, MVT::v16i8, 1},
4128 };
4129
4130 static const CostTblEntry SSE41CostTbl[] = {
4131 {ISD::SMIN, MVT::v4i32, 1},
4132 {ISD::UMIN, MVT::v4i32, 1},
4133 {ISD::UMIN, MVT::v8i16, 1},
4134 {ISD::SMIN, MVT::v16i8, 1},
4135 };
4136
4137 static const CostTblEntry SSE42CostTbl[] = {
4138 {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
4139 };
4140
4141 static const CostTblEntry AVX1CostTbl[] = {
4142 {ISD::FMINNUM, MVT::v8f32, 1},
4143 {ISD::FMINNUM, MVT::v4f64, 1},
4144 {ISD::SMIN, MVT::v8i32, 3},
4145 {ISD::UMIN, MVT::v8i32, 3},
4146 {ISD::SMIN, MVT::v16i16, 3},
4147 {ISD::UMIN, MVT::v16i16, 3},
4148 {ISD::SMIN, MVT::v32i8, 3},
4149 {ISD::UMIN, MVT::v32i8, 3},
4150 };
4151
4152 static const CostTblEntry AVX2CostTbl[] = {
4153 {ISD::SMIN, MVT::v8i32, 1},
4154 {ISD::UMIN, MVT::v8i32, 1},
4155 {ISD::SMIN, MVT::v16i16, 1},
4156 {ISD::UMIN, MVT::v16i16, 1},
4157 {ISD::SMIN, MVT::v32i8, 1},
4158 {ISD::UMIN, MVT::v32i8, 1},
4159 };
4160
4161 static const CostTblEntry AVX512CostTbl[] = {
4162 {ISD::FMINNUM, MVT::v16f32, 1},
4163 {ISD::FMINNUM, MVT::v8f64, 1},
4164 {ISD::SMIN, MVT::v2i64, 1},
4165 {ISD::UMIN, MVT::v2i64, 1},
4166 {ISD::SMIN, MVT::v4i64, 1},
4167 {ISD::UMIN, MVT::v4i64, 1},
4168 {ISD::SMIN, MVT::v8i64, 1},
4169 {ISD::UMIN, MVT::v8i64, 1},
4170 {ISD::SMIN, MVT::v16i32, 1},
4171 {ISD::UMIN, MVT::v16i32, 1},
4172 };
4173
4174 static const CostTblEntry AVX512BWCostTbl[] = {
4175 {ISD::SMIN, MVT::v32i16, 1},
4176 {ISD::UMIN, MVT::v32i16, 1},
4177 {ISD::SMIN, MVT::v64i8, 1},
4178 {ISD::UMIN, MVT::v64i8, 1},
4179 };
4180
4181 // If we have a native MIN/MAX instruction for this type, use it.
4182 if (ST->hasBWI())
4183 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4184 return LT.first * Entry->Cost;
4185
4186 if (ST->hasAVX512())
4187 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4188 return LT.first * Entry->Cost;
4189
4190 if (ST->hasAVX2())
4191 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4192 return LT.first * Entry->Cost;
4193
4194 if (ST->hasAVX())
4195 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4196 return LT.first * Entry->Cost;
4197
4198 if (ST->hasSSE42())
4199 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4200 return LT.first * Entry->Cost;
4201
4202 if (ST->hasSSE41())
4203 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4204 return LT.first * Entry->Cost;
4205
4206 if (ST->hasSSE2())
4207 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4208 return LT.first * Entry->Cost;
4209
4210 if (ST->hasSSE1())
4211 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4212 return LT.first * Entry->Cost;
4213
4214 unsigned CmpOpcode;
4215 if (Ty->isFPOrFPVectorTy()) {
4216 CmpOpcode = Instruction::FCmp;
4217 } else {
4218 assert(Ty->isIntOrIntVectorTy() &&
4219 "expecting floating point or integer type for min/max reduction");
4220 CmpOpcode = Instruction::ICmp;
4221 }
4222
4223 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4224 // Otherwise fall back to cmp+select.
4225 InstructionCost Result =
4226 getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
4227 CostKind) +
4228 getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
4229 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4230 return Result;
4231 }
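
// Sketch of the fallback above (illustrative): on a plain SSE2 target a
// v2i64 UMIN has no native instruction and no table entry, so it is priced
// as
//   getCmpSelInstrCost(ICmp, v2i64, CondTy, ...) +
//   getCmpSelInstrCost(Select, v2i64, CondTy, ...)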
4232
4233 InstructionCost
4234 X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
4235 bool IsUnsigned,
4236 TTI::TargetCostKind CostKind) {
4237 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
4238
4239 MVT MTy = LT.second;
4240
4241 int ISD;
4242 if (ValTy->isIntOrIntVectorTy()) {
4243 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
4244 } else {
4245 assert(ValTy->isFPOrFPVectorTy() &&
4246 "Expected float point or integer vector type.");
4247 ISD = ISD::FMINNUM;
4248 }
4249
4250 // We use the Intel Architecture Code Analyzer (IACA) to measure the
4251 // throughput and use that as the cost.
4252
4253 static const CostTblEntry SSE2CostTblNoPairWise[] = {
4254 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
4255 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
4256 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
4257 };
4258
4259 static const CostTblEntry SSE41CostTblNoPairWise[] = {
4260 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
4261 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
4262 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
4263 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
4264 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
4265 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
4266 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
4267 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
4268 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
4269 {ISD::SMIN, MVT::v16i8, 6},
4270 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
4271 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
4272 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
4273 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
4274 };
4275
4276 static const CostTblEntry AVX1CostTblNoPairWise[] = {
4277 {ISD::SMIN, MVT::v16i16, 6},
4278 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
4279 {ISD::SMIN, MVT::v32i8, 8},
4280 {ISD::UMIN, MVT::v32i8, 8},
4281 };
4282
4283 static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
4284 {ISD::SMIN, MVT::v32i16, 8},
4285 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
4286 {ISD::SMIN, MVT::v64i8, 10},
4287 {ISD::UMIN, MVT::v64i8, 10},
4288 };
4289
4290 // Before legalizing the type, give a chance to look up illegal narrow types
4291 // in the table.
4292 // FIXME: Is there a better way to do this?
4293 EVT VT = TLI->getValueType(DL, ValTy);
4294 if (VT.isSimple()) {
4295 MVT MTy = VT.getSimpleVT();
4296 if (ST->hasBWI())
4297 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
4298 return Entry->Cost;
4299
4300 if (ST->hasAVX())
4301 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4302 return Entry->Cost;
4303
4304 if (ST->hasSSE41())
4305 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
4306 return Entry->Cost;
4307
4308 if (ST->hasSSE2())
4309 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4310 return Entry->Cost;
4311 }
4312
4313 auto *ValVTy = cast<FixedVectorType>(ValTy);
4314 unsigned NumVecElts = ValVTy->getNumElements();
4315
4316 auto *Ty = ValVTy;
4317 InstructionCost MinMaxCost = 0;
4318 if (LT.first != 1 && MTy.isVector() &&
4319 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4320 // Type needs to be split. We need LT.first - 1 operations.
4321 Ty = FixedVectorType::get(ValVTy->getElementType(),
4322 MTy.getVectorNumElements());
4323 auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
4324 MTy.getVectorNumElements());
4325 MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
4326 MinMaxCost *= LT.first - 1;
4327 NumVecElts = MTy.getVectorNumElements();
4328 }
4329
4330 if (ST->hasBWI())
4331 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
4332 return MinMaxCost + Entry->Cost;
4333
4334 if (ST->hasAVX())
4335 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4336 return MinMaxCost + Entry->Cost;
4337
4338 if (ST->hasSSE41())
4339 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
4340 return MinMaxCost + Entry->Cost;
4341
4342 if (ST->hasSSE2())
4343 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4344 return MinMaxCost + Entry->Cost;
4345
4346 unsigned ScalarSize = ValTy->getScalarSizeInBits();
4347
4348 // Only special case power-of-2 reductions where the scalar type isn't
4349 // changed by type legalization; defer everything else to the base cost.
4350 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
4351 ScalarSize != MTy.getScalarSizeInBits())
4352 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);
4353
4354 // Now handle reduction with the legal type, taking into account size changes
4355 // at each level.
4356 while (NumVecElts > 1) {
4357 // Determine the size of the remaining vector we need to reduce.
4358 unsigned Size = NumVecElts * ScalarSize;
4359 NumVecElts /= 2;
4360 // If we're reducing from 256/512 bits, use an extract_subvector.
4361 if (Size > 128) {
4362 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
4363 MinMaxCost +=
4364 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
4365 Ty = SubTy;
4366 } else if (Size == 128) {
4367 // Reducing from 128 bits is a permute of v2f64/v2i64.
4368 VectorType *ShufTy;
4369 if (ValTy->getScalarType()->isFloatingPointTy())
4370 ShufTy =
4371 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
4372 else
4373 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
4374 MinMaxCost +=
4375 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4376 } else if (Size == 64) {
4377 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
4378 FixedVectorType *ShufTy;
4379 if (ValTy->getScalarType()->isFloatingPointTy())
4380 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
4381 else
4382 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
4383 MinMaxCost +=
4384 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4385 } else {
4386 // Reducing from smaller size is a shift by immediate.
4387 auto *ShiftTy = FixedVectorType::get(
4388 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
4389 MinMaxCost += getArithmeticInstrCost(
4390 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
4391 TargetTransformInfo::OK_AnyValue,
4392 TargetTransformInfo::OK_UniformConstantValue,
4393 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
4394 }
4395
4396 // Add the arithmetic op for this level.
4397 auto *SubCondTy =
4398 FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
4399 MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
4400 }
4401
4402 // Add the final extract element to the cost.
4403 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
4404 }
4405
4406 /// Calculate the cost of materializing a 64-bit value. This helper
4407 /// method might only calculate a fraction of a larger immediate. Therefore it
4408 /// is valid to return a cost of ZERO.
4409 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
4410 if (Val == 0)
4411 return TTI::TCC_Free;
4412
4413 if (isInt<32>(Val))
4414 return TTI::TCC_Basic;
4415
4416 return 2 * TTI::TCC_Basic;
4417 }
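
// Examples of the three buckets above (illustrative):
//   getIntImmCost(0);                   // TCC_Free      (xor reg, reg)
//   getIntImmCost(42);                  // TCC_Basic     (mov imm32)
//   getIntImmCost(0x123456789ABCDEF0);  // 2 * TCC_Basic (movabs imm64)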
4418
4419 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
4420 TTI::TargetCostKind CostKind) {
4421 assert(Ty->isIntegerTy());
4422
4423 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4424 if (BitSize == 0)
4425 return ~0U;
4426
4427 // Never hoist constants larger than 128 bits, because this might lead to
4428 // incorrect code generation or assertions in codegen.
4429 // FIXME: Create a cost model for types larger than i128 once the codegen
4430 // issues have been fixed.
4431 if (BitSize > 128)
4432 return TTI::TCC_Free;
4433
4434 if (Imm == 0)
4435 return TTI::TCC_Free;
4436
4437 // Sign-extend all constants to a multiple of 64 bits.
4438 APInt ImmVal = Imm;
4439 if (BitSize % 64 != 0)
4440 ImmVal = Imm.sext(alignTo(BitSize, 64));
4441
4442 // Split the constant into 64-bit chunks and calculate the cost for each
4443 // chunk.
4444 InstructionCost Cost = 0;
4445 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
4446 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
4447 int64_t Val = Tmp.getSExtValue();
4448 Cost += getIntImmCost(Val);
4449 }
4450 // We need at least one instruction to materialize the constant.
4451 return std::max<InstructionCost>(1, Cost);
4452 }
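
// Worked example of the chunking loop above (illustrative): an i128
// immediate whose low 64-bit chunk is 42 and whose high chunk is
// 0x100000000 costs getIntImmCost(42) + getIntImmCost(0x100000000)
// == TCC_Basic + 2 * TCC_Basic == 3.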
4453
4454 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
4455 const APInt &Imm, Type *Ty,
4456 TTI::TargetCostKind CostKind,
4457 Instruction *Inst) {
4458 assert(Ty->isIntegerTy());
4459
4460 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4461 // There is no cost model for constants with a bit size of 0. Return TCC_Free
4462 // here, so that constant hoisting will ignore this constant.
4463 if (BitSize == 0)
4464 return TTI::TCC_Free;
4465
4466 unsigned ImmIdx = ~0U;
4467 switch (Opcode) {
4468 default:
4469 return TTI::TCC_Free;
4470 case Instruction::GetElementPtr:
4471 // Always hoist the base address of a GetElementPtr. This prevents the
4472 // creation of new constants for every base constant that gets constant
4473 // folded with the offset.
4474 if (Idx == 0)
4475 return 2 * TTI::TCC_Basic;
4476 return TTI::TCC_Free;
4477 case Instruction::Store:
4478 ImmIdx = 0;
4479 break;
4480 case Instruction::ICmp:
4481 // This is an imperfect hack to prevent constant hoisting of
4482 // compares that might be trying to check if a 64-bit value fits in
4483 // 32 bits. The backend can optimize these cases using a right shift by 32.
4484 // Ideally we would check the compare predicate here. There are also other
4485 // similar immediates the backend can use shifts for.
4486 if (Idx == 1 && Imm.getBitWidth() == 64) {
4487 uint64_t ImmVal = Imm.getZExtValue();
4488 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
4489 return TTI::TCC_Free;
4490 }
4491 ImmIdx = 1;
4492 break;
4493 case Instruction::And:
4494 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
4495 // by using a 32-bit operation with implicit zero extension. Detect such
4496 // immediates here as the normal path expects bit 31 to be sign extended.
4497 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
4498 return TTI::TCC_Free;
4499 ImmIdx = 1;
4500 break;
4501 case Instruction::Add:
4502 case Instruction::Sub:
4503 // For add/sub, we can use the opposite instruction for INT32_MIN.
4504 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
4505 return TTI::TCC_Free;
4506 ImmIdx = 1;
4507 break;
4508 case Instruction::UDiv:
4509 case Instruction::SDiv:
4510 case Instruction::URem:
4511 case Instruction::SRem:
4512 // Division by constant is typically expanded later into a different
4513 // instruction sequence. This completely changes the constants.
4514 // Report them as "free" to stop ConstantHoist from marking them as opaque.
4515 return TTI::TCC_Free;
4516 case Instruction::Mul:
4517 case Instruction::Or:
4518 case Instruction::Xor:
4519 ImmIdx = 1;
4520 break;
4521 // Always return TCC_Free for the shift value of a shift instruction.
4522 case Instruction::Shl:
4523 case Instruction::LShr:
4524 case Instruction::AShr:
4525 if (Idx == 1)
4526 return TTI::TCC_Free;
4527 break;
4528 case Instruction::Trunc:
4529 case Instruction::ZExt:
4530 case Instruction::SExt:
4531 case Instruction::IntToPtr:
4532 case Instruction::PtrToInt:
4533 case Instruction::BitCast:
4534 case Instruction::PHI:
4535 case Instruction::Call:
4536 case Instruction::Select:
4537 case Instruction::Ret:
4538 case Instruction::Load:
4539 break;
4540 }
4541
4542 if (Idx == ImmIdx) {
4543 int NumConstants = divideCeil(BitSize, 64);
4544 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4545 return (Cost <= NumConstants * TTI::TCC_Basic)
4546 ? static_cast<int>(TTI::TCC_Free)
4547 : Cost;
4548 }
4549
4550 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4551 }
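
// Example of the ICmp special case above (hypothetical IR): in
//   %c = icmp ult i64 %x, 4294967296   ; 0x100000000
// the immediate is reported as TCC_Free, so ConstantHoisting leaves it in
// place and the backend can lower the compare via a right shift by 32.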
4552
4553 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
4554 const APInt &Imm, Type *Ty,
4555 TTI::TargetCostKind CostKind) {
4556 assert(Ty->isIntegerTy());
4557
4558 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4559 // There is no cost model for constants with a bit size of 0. Return TCC_Free
4560 // here, so that constant hoisting will ignore this constant.
4561 if (BitSize == 0)
4562 return TTI::TCC_Free;
4563
4564 switch (IID) {
4565 default:
4566 return TTI::TCC_Free;
4567 case Intrinsic::sadd_with_overflow:
4568 case Intrinsic::uadd_with_overflow:
4569 case Intrinsic::ssub_with_overflow:
4570 case Intrinsic::usub_with_overflow:
4571 case Intrinsic::smul_with_overflow:
4572 case Intrinsic::umul_with_overflow:
4573 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
4574 return TTI::TCC_Free;
4575 break;
4576 case Intrinsic::experimental_stackmap:
4577 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
4578 return TTI::TCC_Free;
4579 break;
4580 case Intrinsic::experimental_patchpoint_void:
4581 case Intrinsic::experimental_patchpoint_i64:
4582 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
4583 return TTI::TCC_Free;
4584 break;
4585 }
4586 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4587 }
4588
4589 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
4590 TTI::TargetCostKind CostKind,
4591 const Instruction *I) {
4592 if (CostKind != TTI::TCK_RecipThroughput)
4593 return Opcode == Instruction::PHI ? 0 : 1;
4594 // Branches are assumed to be predicted.
4595 return 0;
4596 }
4597
4598 int X86TTIImpl::getGatherOverhead() const {
4599 // Some CPUs have more overhead for gather. The specified overhead is relative
4600 // to the Load operation. "2" is the number provided by Intel architects. This
4601 // parameter is used for cost estimation of the Gather op and comparison with
4602 // other alternatives.
4603 // TODO: Remove the explicit hasAVX512()? That would mean we would only
4604 // enable gather with a -march.
4605 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
4606 return 2;
4607
4608 return 1024;
4609 }
4610
4611 int X86TTIImpl::getScatterOverhead() const {
4612 if (ST->hasAVX512())
4613 return 2;
4614
4615 return 1024;
4616 }
4617
4618 // Return an average cost of a Gather / Scatter instruction; may be improved later.
4619 // FIXME: Add TargetCostKind support.
4620 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
4621 const Value *Ptr, Align Alignment,
4622 unsigned AddressSpace) {
4623
4624 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
4625 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4626
4627 // Try to reduce the index size from 64 bits (the default for GEP)
4628 // to 32. This is essential for VF 16. If the index can't be reduced to 32,
4629 // the operation will use 16 x 64 indices, which do not fit in a zmm
4630 // register and need to be split. Also check that the base pointer is the
4631 // same for all lanes, and that there's at most one variable index.
4632 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
4633 unsigned IndexSize = DL.getPointerSizeInBits();
4634 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
4635 if (IndexSize < 64 || !GEP)
4636 return IndexSize;
4637
4638 unsigned NumOfVarIndices = 0;
4639 const Value *Ptrs = GEP->getPointerOperand();
4640 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
4641 return IndexSize;
4642 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
4643 if (isa<Constant>(GEP->getOperand(i)))
4644 continue;
4645 Type *IndxTy = GEP->getOperand(i)->getType();
4646 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
4647 IndxTy = IndexVTy->getElementType();
4648 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
4649 !isa<SExtInst>(GEP->getOperand(i))) ||
4650 ++NumOfVarIndices > 1)
4651 return IndexSize; // 64
4652 }
4653 return (unsigned)32;
4654 };
4655
4656 // Try to reduce IndexSize to 32 bits for a 16-element vector.
4657 // By default the IndexSize is equal to the pointer size.
4658 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
4659 ? getIndexSizeInBits(Ptr, DL)
4660 : DL.getPointerSizeInBits();
4661
4662 auto *IndexVTy = FixedVectorType::get(
4663 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
4664 std::pair<InstructionCost, MVT> IdxsLT =
4665 TLI->getTypeLegalizationCost(DL, IndexVTy);
4666 std::pair<InstructionCost, MVT> SrcLT =
4667 TLI->getTypeLegalizationCost(DL, SrcVTy);
4668 InstructionCost::CostType SplitFactor =
4669 *std::max(IdxsLT.first, SrcLT.first).getValue();
4670 if (SplitFactor > 1) {
4671 // Handle splitting of vector of pointers
4672 auto *SplitSrcTy =
4673 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
4674 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
4675 AddressSpace);
4676 }
4677
4678 // The gather / scatter cost is given by Intel architects. It is a rough
4679 // number since we are looking at one instruction at a time.
4680 const int GSOverhead = (Opcode == Instruction::Load)
4681 ? getGatherOverhead()
4682 : getScatterOverhead();
4683 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4684 MaybeAlign(Alignment), AddressSpace,
4685 TTI::TCK_RecipThroughput);
4686 }
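
// Sketch of the final formula (illustrative numbers): a v8f32 gather on an
// AVX-512 target, where getGatherOverhead() == 2 and a scalar f32 load is
// assumed to cost 1, is priced as 2 + 8 * 1 == 10.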
4687
4688 /// Return the cost of full scalarization of gather / scatter operation.
4689 ///
4690 /// Opcode - Load or Store instruction.
4691 /// SrcVTy - The type of the data vector that should be gathered or scattered.
4692 /// VariableMask - The mask is non-constant at compile time.
4693 /// Alignment - Alignment for one element.
4694 /// AddressSpace - pointer[s] address space.
4695 ///
4696 /// FIXME: Add TargetCostKind support.
4697 InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
4698 bool VariableMask, Align Alignment,
4699 unsigned AddressSpace) {
4700 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4701 APInt DemandedElts = APInt::getAllOnes(VF);
4702 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4703
4704 InstructionCost MaskUnpackCost = 0;
4705 if (VariableMask) {
4706 auto *MaskTy =
4707 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
4708 MaskUnpackCost =
4709 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
4710 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
4711 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
4712 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4713 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4714 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
4715 }
4716
4717 // The cost of the scalar loads/stores.
4718 InstructionCost MemoryOpCost =
4719 VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4720 MaybeAlign(Alignment), AddressSpace, CostKind);
4721
4722 InstructionCost InsertExtractCost = 0;
4723 if (Opcode == Instruction::Load)
4724 for (unsigned i = 0; i < VF; ++i)
4725 // Add the cost of inserting each scalar load into the vector
4726 InsertExtractCost +=
4727 getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
4728 else
4729 for (unsigned i = 0; i < VF; ++i)
4730 // Add the cost of extracting each element out of the data vector
4731 InsertExtractCost +=
4732 getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
4733
4734 return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
4735 }
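
// Schematically, the total computed above is (our notation, not an API):
//   VF * ScalarMemOpCost
//   + (VariableMask ? MaskExtractCost + VF * (BranchCost + ScalarCmpCost) : 0)
//   + VF * InsertOrExtractElementCost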
4736
4737 /// Calculate the cost of Gather / Scatter operation
4738 InstructionCost X86TTIImpl::getGatherScatterOpCost(
4739 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
4740 Align Alignment, TTI::TargetCostKind CostKind,
4741 const Instruction *I = nullptr) {
4742 if (CostKind != TTI::TCK_RecipThroughput) {
4743 if ((Opcode == Instruction::Load &&
4744 isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
4745 (Opcode == Instruction::Store &&
4746 isLegalMaskedScatter(SrcVTy, Align(Alignment))))
4747 return 1;
4748 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
4749 Alignment, CostKind, I);
4750 }
4751
4752 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
4753 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
4754 if (!PtrTy && Ptr->getType()->isVectorTy())
4755 PtrTy = dyn_cast<PointerType>(
4756 cast<VectorType>(Ptr->getType())->getElementType());
4757 assert(PtrTy && "Unexpected type for Ptr argument");
4758 unsigned AddressSpace = PtrTy->getAddressSpace();
4759
4760 if ((Opcode == Instruction::Load &&
4761 !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
4762 (Opcode == Instruction::Store &&
4763 !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
4764 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
4765 AddressSpace);
4766
4767 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
4768 }
4769
4770 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
4771 TargetTransformInfo::LSRCost &C2) {
4772 // The X86-specific policy here is that instruction count has 1st priority.
4773 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
4774 C1.NumIVMuls, C1.NumBaseAdds,
4775 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4776 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
4777 C2.NumIVMuls, C2.NumBaseAdds,
4778 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4779 }
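
// std::tie compares lexicographically, so (hypothetical values)
//   C1 = {Insns = 3, NumRegs = 9, ...} and C2 = {Insns = 4, NumRegs = 2, ...}
// yields C1 < C2 purely because 3 < 4; NumRegs and the later fields only
// break ties.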
4780
4781 bool X86TTIImpl::canMacroFuseCmp() {
4782 return ST->hasMacroFusion() || ST->hasBranchFusion();
4783 }
4784
4785 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
4786 if (!ST->hasAVX())
4787 return false;
4788
4789 // The backend can't handle a single element vector.
4790 if (isa<VectorType>(DataTy) &&
4791 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4792 return false;
4793 Type *ScalarTy = DataTy->getScalarType();
4794
4795 if (ScalarTy->isPointerTy())
4796 return true;
4797
4798 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4799 return true;
4800
4801 if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16())
4802 return true;
4803
4804 if (!ScalarTy->isIntegerTy())
4805 return false;
4806
4807 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4808 return IntWidth == 32 || IntWidth == 64 ||
4809 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
4810 }
4811
4812 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
4813 return isLegalMaskedLoad(DataType, Alignment);
4814 }
4815
4816 bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
4817 unsigned DataSize = DL.getTypeStoreSize(DataType);
4818 // The only supported nontemporal loads are for aligned vectors of 16 or 32
4819 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
4820 // (the equivalent stores only require AVX).
4821 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
4822 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
4823
4824 return false;
4825 }
4826
4827 bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
4828 unsigned DataSize = DL.getTypeStoreSize(DataType);
4829
4830 // SSE4A supports nontemporal stores of float and double at arbitrary
4831 // alignment.
4832 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
4833 return true;
4834
4835 // Besides the SSE4A exception above, only aligned nontemporal stores are
4836 // available on any other subtarget, and only for stores with a size of
4837 // 4..32 bytes (powers of 2 only).
4838 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
4839 !isPowerOf2_32(DataSize))
4840 return false;
4841
4842 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
4843 // loads require AVX2).
4844 if (DataSize == 32)
4845 return ST->hasAVX();
4846 if (DataSize == 16)
4847 return ST->hasSSE1();
4848 return true;
4849 }
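
// Examples of the rules above (illustrative): a 32-byte <8 x float> store
// at align 32 is legal given AVX (vmovntps ymm), a 16-byte store needs
// SSE1 (movntps xmm), while a 24-byte type always fails the power-of-2
// size check (the SSE4A exception only covers scalar float/double).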
4850
4851 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
4852 if (!isa<VectorType>(DataTy))
4853 return false;
4854
4855 if (!ST->hasAVX512())
4856 return false;
4857
4858 // The backend can't handle a single element vector.
4859 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4860 return false;
4861
4862 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
4863
4864 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4865 return true;
4866
4867 if (!ScalarTy->isIntegerTy())
4868 return false;
4869
4870 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4871 return IntWidth == 32 || IntWidth == 64 ||
4872 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
4873 }
4874
4875 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
4876 return isLegalMaskedExpandLoad(DataTy);
4877 }
4878
4879 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
4880 // Some CPUs have better gather performance than others.
4881 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
4882 // enable gather with a -march.
4883 if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
4884 return false;
4885
4886 // This function is currently called in two cases: from the Loop Vectorizer
4887 // and from the Scalarizer.
4888 // When the Loop Vectorizer asks about legality of the feature,
4889 // the vectorization factor is not calculated yet. The Loop Vectorizer
4890 // sends a scalar type and the decision is based on the width of the
4891 // scalar element.
4892 // Later on, the cost model will estimate usage of this intrinsic based on
4893 // the vector type.
4894 // The Scalarizer asks again about legality. It sends a vector type.
4895 // In this case we can reject non-power-of-2 vectors.
4896 // We also reject single element vectors as the type legalizer can't
4897 // scalarize it.
4898 if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
4899 unsigned NumElts = DataVTy->getNumElements();
4900 if (NumElts == 1)
4901 return false;
4902 // Gather / Scatter for a vector of 2 elements is not profitable on KNL / SKX.
4903 // A vector-4 gather/scatter instruction does not exist on KNL.
4904 // We can extend it to 8 elements, but zeroing upper bits of
4905 // the mask vector will add more instructions. Right now we give the scalar
4906 // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter
4907 // instruction is better in the VariableMask case.
4908 if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())))
4909 return false;
4910 }
4911 Type *ScalarTy = DataTy->getScalarType();
4912 if (ScalarTy->isPointerTy())
4913 return true;
4914
4915 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4916 return true;
4917
4918 if (!ScalarTy->isIntegerTy())
4919 return false;
4920
4921 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4922 return IntWidth == 32 || IntWidth == 64;
4923 }
4924
4925 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
4926 // AVX2 doesn't support scatter
4927 if (!ST->hasAVX512())
4928 return false;
4929 return isLegalMaskedGather(DataType, Alignment);
4930 }
4931
4932 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
4933 EVT VT = TLI->getValueType(DL, DataType);
4934 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
4935 }
4936
4937 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
4938 return false;
4939 }
4940
4941 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
4942 const Function *Callee) const {
4943 const TargetMachine &TM = getTLI()->getTargetMachine();
4944
4945 // Work this as a subset check of subtarget features.
4946 const FeatureBitset &CallerBits =
4947 TM.getSubtargetImpl(*Caller)->getFeatureBits();
4948 const FeatureBitset &CalleeBits =
4949 TM.getSubtargetImpl(*Callee)->getFeatureBits();
4950
4951 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
4952 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
4953 return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
4954 }
4955
4956 bool X86TTIImpl::areFunctionArgsABICompatible(
4957 const Function *Caller, const Function *Callee,
4958 SmallPtrSetImpl<Argument *> &Args) const {
4959 if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
4960 return false;
4961
4962 // If we get here, we know the target features match. If one function
4963 // considers 512-bit vectors legal and the other does not, consider them
4964 // incompatible.
4965 const TargetMachine &TM = getTLI()->getTargetMachine();
4966
4967 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
4968 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
4969 return true;
4970
4971 // Consider the arguments compatible if they aren't vectors or aggregates.
4972 // FIXME: Look at the size of vectors.
4973 // FIXME: Look at the element types of aggregates to see if there are vectors.
4974 // FIXME: The API of this function seems intended to allow arguments
4975 // to be removed from the set, but the caller doesn't check if the set
4976 // becomes empty so that may not work in practice.
4977 return llvm::none_of(Args, [](Argument *A) {
4978 auto *EltTy = cast<PointerType>(A->getType())->getElementType();
4979 return EltTy->isVectorTy() || EltTy->isAggregateType();
4980 });
4981 }
4982
4983 X86TTIImpl::TTI::MemCmpExpansionOptions
4984 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4985 TTI::MemCmpExpansionOptions Options;
4986 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4987 Options.NumLoadsPerBlock = 2;
4988 // All GPR and vector loads can be unaligned.
4989 Options.AllowOverlappingLoads = true;
4990 if (IsZeroCmp) {
4991 // Only enable vector loads for equality comparison. Right now the vector
4992 // version is not as fast for three way compare (see #33329).
4993 const unsigned PreferredWidth = ST->getPreferVectorWidth();
4994 if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
4995 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
4996 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
4997 }
4998 if (ST->is64Bit()) {
4999 Options.LoadSizes.push_back(8);
5000 }
5001 Options.LoadSizes.push_back(4);
5002 Options.LoadSizes.push_back(2);
5003 Options.LoadSizes.push_back(1);
5004 return Options;
5005 }
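
// Sketch of the resulting options (illustrative): on a 64-bit SSE2 target
// an equality memcmp sees LoadSizes == {16, 8, 4, 2, 1}, so with
// AllowOverlappingLoads a memcmp(a, b, 24) == 0 can expand into two
// (overlapping) 16-byte vector compares instead of a library call.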
5006
5007 bool X86TTIImpl::enableInterleavedAccessVectorization() {
5008 // TODO: We expect this to be beneficial regardless of arch,
5009 // but there are currently some unexplained performance artifacts on Atom.
5010 // As a temporary solution, disable on Atom.
5011 return !(ST->isAtom());
5012 }
5013
5014 // Get estimation for interleaved load/store operations for AVX2.
5015 // \p Factor is the interleaved-access factor (stride) - number of
5016 // (interleaved) elements in the group.
5017 // \p Indices contains the indices for a strided load: when the
5018 // interleaved load has gaps they indicate which elements are used.
5019 // If Indices is empty (or if the number of indices is equal to the size
5020 // of the interleaved-access as given in \p Factor) the access has no gaps.
5021 //
5022 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
5023 // computing the cost using a generic formula as a function of generic
5024 // shuffles. We therefore use a lookup table instead, filled according to
5025 // the instruction sequences that codegen currently generates.
5026 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2(
5027 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
5028 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
5029 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
5030
5031 if (UseMaskForCond || UseMaskForGaps)
5032 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5033 Alignment, AddressSpace, CostKind,
5034 UseMaskForCond, UseMaskForGaps);
5035
5036 // We currently support only fully-interleaved groups, with no gaps.
5037 // TODO: Support also strided loads (interleaved-groups with gaps).
5038 if (Indices.size() && Indices.size() != Factor)
5039 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5040 Alignment, AddressSpace, CostKind);
5041
5042 // VecTy for interleave memop is <VF*Factor x Elt>.
5043 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
5044 // VecTy = <12 x i32>.
5045 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
5046
5047 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
5048 // the VF=2, while v2i128 is an unsupported MVT vector type
5049 // (see MachineValueType.h::getVectorVT()).
5050 if (!LegalVT.isVector())
5051 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5052 Alignment, AddressSpace, CostKind);
5053
5054 unsigned VF = VecTy->getNumElements() / Factor;
5055 Type *ScalarTy = VecTy->getElementType();
5056 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
5057 if (!ScalarTy->isIntegerTy())
5058 ScalarTy =
5059 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
5060
5061 // Get the cost of all the memory operations.
5062 InstructionCost MemOpCosts = getMemoryOpCost(
5063 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
5064
5065 auto *VT = FixedVectorType::get(ScalarTy, VF);
5066 EVT ETy = TLI->getValueType(DL, VT);
5067 if (!ETy.isSimple())
5068 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5069 Alignment, AddressSpace, CostKind);
5070
5071 // TODO: Complete for other data-types and strides.
5072 // Each combination of Stride, element bit width and VF results in a different
5073 // sequence; the cost tables are therefore accessed with:
5074 // Factor (stride) and VectorType=VFxiN.
5075 // The Cost accounts only for the shuffle sequence;
5076 // the cost of the loads/stores is accounted for separately.
5077 //
5078 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
5079 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
5080 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
5081 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
5082 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
5083 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
5084
5085 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
5086 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
5087 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
5088 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
5089 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
5090
5091 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
5092 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
5093 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
5094 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
5095 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
5096
5097 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
5098 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
5099 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
5100 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
5101
5102 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
5103 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
5104 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
5105 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
5106 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
5107
5108 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
5109 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
5110 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
5111 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
5112 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
5113
5114 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
5115 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
5116 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
5117 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
5118
5119 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
5120 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
5121 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
5122 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
5123
5124 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
5125 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
5126 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
5127 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
5128 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
5129
5130 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
5131 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
5132 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
5133 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
5134 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
5135
5136 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
5137 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
5138 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
5139 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
5140
5141 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
5142 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
5143 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
5144
5145 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
5146 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
5147 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
5148 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
5149 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
5150
5151 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
5152 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
5153 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
5154 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
5155
5156 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
5157 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
5158 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
5159 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
5160
5161 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
5162 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
5163 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
5164
5165 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
5166 };
5167
5168 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
5169 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
5170 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
5171 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
5172 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
5173 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
5174
      {2, MVT::v2i16, 1},   // interleave 2 x 2i16 into 4i16 (and store)
      {2, MVT::v4i16, 1},   // interleave 2 x 4i16 into 8i16 (and store)
      {2, MVT::v8i16, 3},   // interleave 2 x 8i16 into 16i16 (and store)
      {2, MVT::v16i16, 4},  // interleave 2 x 16i16 into 32i16 (and store)
      {2, MVT::v32i16, 8},  // interleave 2 x 32i16 into 64i16 (and store)

      {2, MVT::v2i32, 1},   // interleave 2 x 2i32 into 4i32 (and store)
      {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
      {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
      {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
      {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)

      {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
      {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
      {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
      {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)

      {3, MVT::v2i8, 4},    // interleave 3 x 2i8 into 6i8 (and store)
      {3, MVT::v4i8, 4},    // interleave 3 x 4i8 into 12i8 (and store)
      {3, MVT::v8i8, 6},    // interleave 3 x 8i8 into 24i8 (and store)
      {3, MVT::v16i8, 11},  // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 13},  // interleave 3 x 32i8 into 96i8 (and store)

      {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
      {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
      {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
      {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
      {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)

      {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
      {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
      {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
      {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)

      {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
      {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
      {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
      {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)

      {4, MVT::v2i8, 4},    // interleave 4 x 2i8 into 8i8 (and store)
      {4, MVT::v4i8, 4},    // interleave 4 x 4i8 into 16i8 (and store)
      {4, MVT::v8i8, 4},    // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 8},   // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 12},  // interleave 4 x 32i8 into 128i8 (and store)

      {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
      {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
      {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
      {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
      {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)

      {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
      {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
      {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
      {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)

      {4, MVT::v2i64, 6},   // interleave 4 x 2i64 into 8i64 (and store)
      {4, MVT::v4i64, 8},   // interleave 4 x 4i64 into 16i64 (and store)
      {4, MVT::v8i64, 20},  // interleave 4 x 8i64 into 32i64 (and store)

      {6, MVT::v2i8, 7},    // interleave 6 x 2i8 into 12i8 (and store)
      {6, MVT::v4i8, 9},    // interleave 6 x 4i8 into 24i8 (and store)
      {6, MVT::v8i8, 16},   // interleave 6 x 8i8 into 48i8 (and store)
      {6, MVT::v16i8, 27},  // interleave 6 x 16i8 into 96i8 (and store)
      {6, MVT::v32i8, 90},  // interleave 6 x 32i8 into 192i8 (and store)

      {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
      {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
      {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
      {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)

      {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
      {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
      {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
      {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)

      {6, MVT::v2i64, 8},   // interleave 6 x 2i64 into 12i64 (and store)
      {6, MVT::v4i64, 15},  // interleave 6 x 4i64 into 24i64 (and store)
      {6, MVT::v8i64, 30},  // interleave 6 x 8i64 into 48i64 (and store)
  };
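  // E.g. (illustrative): the tables are keyed by (Factor, per-member type),
  // so a Factor-3 store whose members are v8i8 (a <24 x i8> VecTy) hits the
  // {3, MVT::v8i8, 6} entry above and costs MemOpCosts + 6.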

  if (Opcode == Instruction::Load) {
    if (const auto *Entry =
            CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
      return MemOpCosts + Entry->Cost;
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
    if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                            ETy.getSimpleVT()))
      return MemOpCosts + Entry->Cost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind);
}

// Get the cost estimate for interleaved load/store operations and strided
// loads.
// \p Indices contains the indices for a strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduce the cost.
InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps) required to
  // load/store the VecTy.
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
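  // E.g. (illustrative): the <12 x i32> above stores 48 bytes; assuming it
  // legalizes to v16i32 (64 bytes), NumOfMemOps = ceil(48 / 64) = 1.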

  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
                                             LegalVT.getVectorNumElements());
  InstructionCost MemOpCost = getMemoryOpCost(
      Opcode, SingleMemOpTy, MaybeAlign(Alignment), AddressSpace, CostKind);

  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
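  // VT is the type of one interleave-group member and is the key used in the
  // cost tables below; e.g. VecTy = <48 x i8> with Factor = 3 gives
  // VT = v16i8.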

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of the loads and stores is computed separately from the table.

    // X86InterleavedAccess supports only the following interleaved-access
    // groups.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
    };
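    // E.g. (illustrative): for the {3, MVT::v16i8, 12} entry, a <48 x i8>
    // group that legalizes into a single 64-byte load has NumOfMemOps = 1,
    // so the lookup below returns 1 * MemOpCost + 12.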

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return NumOfMemOps * MemOpCost + Entry->Cost;
    // If an entry does not exist, fall back to the default implementation.

    // The kind of shuffle depends on the number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost =
        getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);

    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
                                          VecTy->getNumElements() / Factor);
    InstructionCost NumOfResults =
        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
        NumOfLoadsInInterleaveGrp;

    // About half of the loads may be folded into the shuffles when we have
    // only one result. If we have more than one result, we do not fold loads
    // at all.
    unsigned NumOfUnfoldedLoads =
        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get the number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
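    // Illustrative example (assumed values, not measured): NumOfMemOps = 2
    // and Factor = 3 with a single-register ResultTy give NumOfResults = 3,
    // NumOfShufflesPerResult = 1, NumOfUnfoldedLoads = 2 and NumOfMoves = 1,
    // i.e. Cost = 3 * ShuffleCost + 2 * MemOpCost + 1.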

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point");
  // X86InterleavedAccess supports only the following interleaved-access
  // groups.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return NumOfMemOps * MemOpCost + Entry->Cost;
  // If an entry does not exist, fall back to the default implementation.

  // There are no strided stores at the moment, and a store cannot be folded
  // into a shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  InstructionCost Cost =
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
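  // Illustrative example (assumed values, not measured): a Factor = 3 store
  // split into NumOfMemOps = 2 memory ops has NumOfShufflesPerStore = 2 and
  // NumOfMoves = 2, i.e. Cost = 2 * (MemOpCost + 2 * ShuffleCost) + 2.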
  return Cost;
}

InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
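  // 32/64-bit elements (and pointers) are handled by baseline AVX-512; byte
  // and word elements (and half, where FP16 is available) additionally
  // require AVX512BW.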
  auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
    Type *EltTy = cast<VectorType>(VecTy)->getElementType();
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
      return true;
    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
        (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
      return HasBW;
    return false;
  };
  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
    return getInterleavedMemoryOpCostAVX512(
        Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
  if (ST->hasAVX2())
    return getInterleavedMemoryOpCostAVX2(
        Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}