1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// About Cost Model numbers used below it's necessary to say the following:
16 /// the numbers correspond to some "generic" X86 CPU instead of usage of a
17 /// specific CPU model. Usually the numbers correspond to the CPU where the
18 /// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19 /// the lookups below the cost is based on Nehalem as that was the first CPU
20 /// to support that feature level and thus has most likely the worst case cost,
21 /// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22 ///
23 /// Some examples of other technologies/CPUs:
24 ///   SSE 3   - Pentium4 / Athlon64
25 ///   SSE 4.1 - Penryn
26 ///   SSE 4.2 - Nehalem / Silvermont
27 ///   AVX     - Sandy Bridge / Jaguar / Bulldozer
28 ///   AVX2    - Haswell / Ryzen
29 ///   AVX-512 - Xeon Phi / Skylake
30 ///
31 /// And some examples of instruction target dependent costs (latency)
32 ///                   divss     sqrtss          rsqrtss
33 ///   AMD K7          11-16     19              3
34 ///   Piledriver      9-24      13-15           5
35 ///   Jaguar          14        16              2
36 ///   Pentium II,III  18        30              2
37 ///   Nehalem         7-14      7-18            3
38 ///   Haswell         10-13     11              5
39 ///
40 /// Interpreting the 4 TargetCostKind types:
41 /// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42 /// values reported by the CPU scheduler models (and llvm-mca).
43 /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44 /// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
47 /// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48 /// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49 //===----------------------------------------------------------------------===//
50 
51 #include "X86TargetTransformInfo.h"
52 #include "llvm/Analysis/TargetTransformInfo.h"
53 #include "llvm/CodeGen/BasicTTIImpl.h"
54 #include "llvm/CodeGen/CostTable.h"
55 #include "llvm/CodeGen/TargetLowering.h"
56 #include "llvm/IR/InstIterator.h"
57 #include "llvm/IR/IntrinsicInst.h"
58 #include "llvm/Support/Debug.h"
59 #include <optional>
60 
61 using namespace llvm;
62 
63 #define DEBUG_TYPE "x86tti"
64 
65 //===----------------------------------------------------------------------===//
66 //
67 // X86 cost model.
68 //
69 //===----------------------------------------------------------------------===//
70 
71 // Helper struct to store/access costs for each cost kind.
72 // TODO: Move this to allow other targets to use it?
73 struct CostKindCosts {
74   unsigned RecipThroughputCost = ~0U;
75   unsigned LatencyCost = ~0U;
76   unsigned CodeSizeCost = ~0U;
77   unsigned SizeAndLatencyCost = ~0U;
78 
79   std::optional<unsigned>
80   operator[](TargetTransformInfo::TargetCostKind Kind) const {
81     unsigned Cost = ~0U;
82     switch (Kind) {
83     case TargetTransformInfo::TCK_RecipThroughput:
84       Cost = RecipThroughputCost;
85       break;
86     case TargetTransformInfo::TCK_Latency:
87       Cost = LatencyCost;
88       break;
89     case TargetTransformInfo::TCK_CodeSize:
90       Cost = CodeSizeCost;
91       break;
92     case TargetTransformInfo::TCK_SizeAndLatency:
93       Cost = SizeAndLatencyCost;
94       break;
95     }
96     if (Cost == ~0U)
97       return std::nullopt;
98     return Cost;
99   }
100 };
101 using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
102 
103 TargetTransformInfo::PopcntSupportKind
104 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
105   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106   // TODO: Currently the __builtin_popcount() implementation using SSE3
107   //   instructions is inefficient. Once the problem is fixed, we should
108   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
109   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110 }
111 
112 std::optional<unsigned> X86TTIImpl::getCacheSize(
113   TargetTransformInfo::CacheLevel Level) const {
114   switch (Level) {
115   case TargetTransformInfo::CacheLevel::L1D:
116     //   - Penryn
117     //   - Nehalem
118     //   - Westmere
119     //   - Sandy Bridge
120     //   - Ivy Bridge
121     //   - Haswell
122     //   - Broadwell
123     //   - Skylake
124     //   - Kabylake
125     return 32 * 1024;  //  32 KByte
126   case TargetTransformInfo::CacheLevel::L2D:
127     //   - Penryn
128     //   - Nehalem
129     //   - Westmere
130     //   - Sandy Bridge
131     //   - Ivy Bridge
132     //   - Haswell
133     //   - Broadwell
134     //   - Skylake
135     //   - Kabylake
136     return 256 * 1024; // 256 KByte
137   }
138 
139   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140 }
141 
142 std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
143   TargetTransformInfo::CacheLevel Level) const {
144   //   - Penryn
145   //   - Nehalem
146   //   - Westmere
147   //   - Sandy Bridge
148   //   - Ivy Bridge
149   //   - Haswell
150   //   - Broadwell
151   //   - Skylake
152   //   - Kabylake
153   switch (Level) {
154   case TargetTransformInfo::CacheLevel::L1D:
155     [[fallthrough]];
156   case TargetTransformInfo::CacheLevel::L2D:
157     return 8;
158   }
159 
160   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161 }
162 
163 unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164   bool Vector = (ClassID == 1);
165   if (Vector && !ST->hasSSE1())
166     return 0;
167 
168   if (ST->is64Bit()) {
169     if (Vector && ST->hasAVX512())
170       return 32;
171     return 16;
172   }
173   return 8;
174 }
175 
176 TypeSize
177 X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
178   unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179   switch (K) {
180   case TargetTransformInfo::RGK_Scalar:
181     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
182   case TargetTransformInfo::RGK_FixedWidthVector:
183     if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
184       return TypeSize::getFixed(512);
185     if (ST->hasAVX() && PreferVectorWidth >= 256)
186       return TypeSize::getFixed(256);
187     if (ST->hasSSE1() && PreferVectorWidth >= 128)
188       return TypeSize::getFixed(128);
189     return TypeSize::getFixed(0);
190   case TargetTransformInfo::RGK_ScalableVector:
191     return TypeSize::getScalable(0);
192   }
193 
194   llvm_unreachable("Unsupported register kind");
195 }
196 
197 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
198   return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
199       .getFixedValue();
200 }
201 
202 unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
203   // If the loop will not be vectorized, don't interleave the loop.
204   // Let regular unroll to unroll the loop, which saves the overflow
205   // check and memory check cost.
206   if (VF.isScalar())
207     return 1;
208 
209   if (ST->isAtom())
210     return 1;
211 
212   // Sandybridge and Haswell have multiple execution ports and pipelined
213   // vector units.
214   if (ST->hasAVX())
215     return 4;
216 
217   return 2;
218 }
219 
220 InstructionCost X86TTIImpl::getArithmeticInstrCost(
221     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
222     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
223     ArrayRef<const Value *> Args,
224     const Instruction *CxtI) {
225 
226   // vXi8 multiplications are always promoted to vXi16.
227   // Sub-128-bit types can be extended/packed more efficiently.
228   if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
229       Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
230     Type *WideVecTy =
231         VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
232     return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
233                             TargetTransformInfo::CastContextHint::None,
234                             CostKind) +
235            getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
236                             TargetTransformInfo::CastContextHint::None,
237                             CostKind) +
238            getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
239   }
240 
241   // Legalize the type.
242   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
243 
244   int ISD = TLI->InstructionOpcodeToISD(Opcode);
245   assert(ISD && "Invalid opcode");
246 
247   if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
248       (LT.second.getScalarType() == MVT::i32 ||
249        LT.second.getScalarType() == MVT::i64)) {
250     // Check if the operands can be represented as a smaller datatype.
251     bool Op1Signed = false, Op2Signed = false;
252     unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
253     unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
254     unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
255     bool SignedMode = Op1Signed || Op2Signed;
256 
257     // If both vXi32 are representable as i15 and at least one is constant,
258     // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
259     // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
260     if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
261         LT.second.getScalarType() == MVT::i32) {
262       bool Op1Constant =
263           isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
264       bool Op2Constant =
265           isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
266       bool Op1Sext = isa<SExtInst>(Args[0]) &&
267                      (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
268       bool Op2Sext = isa<SExtInst>(Args[1]) &&
269                      (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
270 
271       bool IsZeroExtended = !Op1Signed || !Op2Signed;
272       bool IsConstant = Op1Constant || Op2Constant;
273       bool IsSext = Op1Sext || Op2Sext;
274       if (IsConstant || IsZeroExtended || IsSext)
275         LT.second =
276             MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
277     }
278 
279     // Check if the vXi32 operands can be shrunk into a smaller datatype.
280     // This should match the codegen from reduceVMULWidth.
281     // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
282     if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
283       if (OpMinSize <= 7)
284         return LT.first * 3; // pmullw/sext
285       if (!SignedMode && OpMinSize <= 8)
286         return LT.first * 3; // pmullw/zext
287       if (OpMinSize <= 15)
288         return LT.first * 5; // pmullw/pmulhw/pshuf
289       if (!SignedMode && OpMinSize <= 16)
290         return LT.first * 5; // pmullw/pmulhw/pshuf
291     }
292 
    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
295     // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
296     if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
297       ISD = X86ISD::PMULUDQ;
298   }
299 
300   // Vector multiply by pow2 will be simplified to shifts.
301   // Vector multiply by -pow2 will be simplified to shifts/negates.
302   if (ISD == ISD::MUL && Op2Info.isConstant() &&
303       (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
304     InstructionCost Cost =
305         getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
306                                Op1Info.getNoProps(), Op2Info.getNoProps());
307     if (Op2Info.isNegatedPowerOf2())
308       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
309     return Cost;
310   }
311 
312   // On X86, vector signed division by constants power-of-two are
313   // normally expanded to the sequence SRA + SRL + ADD + SRA.
314   // The OperandValue properties may not be the same as that of the previous
315   // operation; conservatively assume OP_None.
316   if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
317       Op2Info.isConstant() && Op2Info.isPowerOf2()) {
318     InstructionCost Cost =
319         2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
320                                    Op1Info.getNoProps(), Op2Info.getNoProps());
321     Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
322                                    Op1Info.getNoProps(), Op2Info.getNoProps());
323     Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
324                                    Op1Info.getNoProps(), Op2Info.getNoProps());
325 
326     if (ISD == ISD::SREM) {
327       // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
328       Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
329                                      Op2Info.getNoProps());
330       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
331                                      Op2Info.getNoProps());
332     }
333 
334     return Cost;
335   }
336 
337   // Vector unsigned division/remainder will be simplified to shifts/masks.
338   if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
339       Op2Info.isConstant() && Op2Info.isPowerOf2()) {
340     if (ISD == ISD::UDIV)
341       return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
342                                     Op1Info.getNoProps(), Op2Info.getNoProps());
343     // UREM
344     return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
345                                   Op1Info.getNoProps(), Op2Info.getNoProps());
346   }
347 
348   static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349     { ISD::SHL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
350     { ISD::SRL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
351     { ISD::SRA,  MVT::v16i8,  { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
352     { ISD::SHL,  MVT::v32i8,  { 1, 8, 2, 3 } }, // psllw + pand.
353     { ISD::SRL,  MVT::v32i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
354     { ISD::SRA,  MVT::v32i8,  { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
355     { ISD::SHL,  MVT::v64i8,  { 1, 8, 2, 3 } }, // psllw + pand.
356     { ISD::SRL,  MVT::v64i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
357     { ISD::SRA,  MVT::v64i8,  { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
358 
359     { ISD::SHL,  MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
360     { ISD::SRL,  MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
361     { ISD::SRA,  MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
362     { ISD::SHL,  MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
363     { ISD::SRL,  MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
364     { ISD::SRA,  MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
365   };
366 
367   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
368     if (const auto *Entry =
369             CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
370       if (auto KindCost = Entry->Cost[CostKind])
371         return LT.first * *KindCost;
372 
373   static const CostKindTblEntry AVX512UniformConstCostTable[] = {
374     { ISD::SHL,  MVT::v64i8,  {  2, 12,  5,  6 } }, // psllw + pand.
375     { ISD::SRL,  MVT::v64i8,  {  2, 12,  5,  6 } }, // psrlw + pand.
376     { ISD::SRA,  MVT::v64i8,  {  3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
377 
378     { ISD::SHL,  MVT::v16i16, {  2,  7,  4,  4 } }, // psllw + split.
379     { ISD::SRL,  MVT::v16i16, {  2,  7,  4,  4 } }, // psrlw + split.
380     { ISD::SRA,  MVT::v16i16, {  2,  7,  4,  4 } }, // psraw + split.
381 
382     { ISD::SHL,  MVT::v8i32,  {  1,  1,  1,  1 } }, // pslld
383     { ISD::SRL,  MVT::v8i32,  {  1,  1,  1,  1 } }, // psrld
384     { ISD::SRA,  MVT::v8i32,  {  1,  1,  1,  1 } }, // psrad
385     { ISD::SHL,  MVT::v16i32, {  1,  1,  1,  1 } }, // pslld
386     { ISD::SRL,  MVT::v16i32, {  1,  1,  1,  1 } }, // psrld
387     { ISD::SRA,  MVT::v16i32, {  1,  1,  1,  1 } }, // psrad
388 
389     { ISD::SRA,  MVT::v2i64,  {  1,  1,  1,  1 } }, // psraq
390     { ISD::SHL,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psllq
391     { ISD::SRL,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psrlq
392     { ISD::SRA,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psraq
393     { ISD::SHL,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psllq
394     { ISD::SRL,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psrlq
395     { ISD::SRA,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psraq
396 
397     { ISD::SDIV, MVT::v16i32, {  6 } }, // pmuludq sequence
398     { ISD::SREM, MVT::v16i32, {  8 } }, // pmuludq+mul+sub sequence
399     { ISD::UDIV, MVT::v16i32, {  5 } }, // pmuludq sequence
400     { ISD::UREM, MVT::v16i32, {  7 } }, // pmuludq+mul+sub sequence
401   };
402 
403   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
404     if (const auto *Entry =
405             CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
406       if (auto KindCost = Entry->Cost[CostKind])
407         return LT.first * *KindCost;
408 
409   static const CostKindTblEntry AVX2UniformConstCostTable[] = {
410     { ISD::SHL,  MVT::v16i8, {  1,  8,  2,  3 } }, // psllw + pand.
411     { ISD::SRL,  MVT::v16i8, {  1,  8,  2,  3 } }, // psrlw + pand.
412     { ISD::SRA,  MVT::v16i8, {  2, 10,  5,  6 } }, // psrlw, pand, pxor, psubb.
413     { ISD::SHL,  MVT::v32i8, {  2,  8,  2,  4 } }, // psllw + pand.
414     { ISD::SRL,  MVT::v32i8, {  2,  8,  2,  4 } }, // psrlw + pand.
415     { ISD::SRA,  MVT::v32i8, {  3, 10,  5,  9 } }, // psrlw, pand, pxor, psubb.
416 
417     { ISD::SHL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psllw
418     { ISD::SRL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psrlw
419     { ISD::SRA,  MVT::v8i16, {  1,  1,  1,  1 } }, // psraw
420     { ISD::SHL,  MVT::v16i16,{  2,  2,  1,  2 } }, // psllw
421     { ISD::SRL,  MVT::v16i16,{  2,  2,  1,  2 } }, // psrlw
422     { ISD::SRA,  MVT::v16i16,{  2,  2,  1,  2 } }, // psraw
423 
424     { ISD::SHL,  MVT::v4i32, {  1,  1,  1,  1 } }, // pslld
425     { ISD::SRL,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrld
426     { ISD::SRA,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrad
427     { ISD::SHL,  MVT::v8i32, {  2,  2,  1,  2 } }, // pslld
428     { ISD::SRL,  MVT::v8i32, {  2,  2,  1,  2 } }, // psrld
429     { ISD::SRA,  MVT::v8i32, {  2,  2,  1,  2 } }, // psrad
430 
431     { ISD::SHL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psllq
432     { ISD::SRL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psrlq
433     { ISD::SRA,  MVT::v2i64, {  2,  3,  3,  3 } }, // psrad + shuffle.
434     { ISD::SHL,  MVT::v4i64, {  2,  2,  1,  2 } }, // psllq
435     { ISD::SRL,  MVT::v4i64, {  2,  2,  1,  2 } }, // psrlq
436     { ISD::SRA,  MVT::v4i64, {  4,  4,  3,  6 } }, // psrad + shuffle + split.
437 
438     { ISD::SDIV, MVT::v8i32, {  6 } }, // pmuludq sequence
439     { ISD::SREM, MVT::v8i32, {  8 } }, // pmuludq+mul+sub sequence
440     { ISD::UDIV, MVT::v8i32, {  5 } }, // pmuludq sequence
441     { ISD::UREM, MVT::v8i32, {  7 } }, // pmuludq+mul+sub sequence
442   };
443 
444   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
445     if (const auto *Entry =
446             CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
447       if (auto KindCost = Entry->Cost[CostKind])
448         return LT.first * *KindCost;
449 
450   static const CostKindTblEntry AVXUniformConstCostTable[] = {
451     { ISD::SHL,  MVT::v16i8, {  2,  7,  2,  3 } }, // psllw + pand.
452     { ISD::SRL,  MVT::v16i8, {  2,  7,  2,  3 } }, // psrlw + pand.
453     { ISD::SRA,  MVT::v16i8, {  3,  9,  5,  6 } }, // psrlw, pand, pxor, psubb.
454     { ISD::SHL,  MVT::v32i8, {  4,  7,  7,  8 } }, // 2*(psllw + pand) + split.
455     { ISD::SRL,  MVT::v32i8, {  4,  7,  7,  8 } }, // 2*(psrlw + pand) + split.
456     { ISD::SRA,  MVT::v32i8, {  7,  7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
457 
458     { ISD::SHL,  MVT::v8i16, {  1,  2,  1,  1 } }, // psllw.
459     { ISD::SRL,  MVT::v8i16, {  1,  2,  1,  1 } }, // psrlw.
460     { ISD::SRA,  MVT::v8i16, {  1,  2,  1,  1 } }, // psraw.
461     { ISD::SHL,  MVT::v16i16,{  3,  6,  4,  5 } }, // psllw + split.
462     { ISD::SRL,  MVT::v16i16,{  3,  6,  4,  5 } }, // psrlw + split.
463     { ISD::SRA,  MVT::v16i16,{  3,  6,  4,  5 } }, // psraw + split.
464 
465     { ISD::SHL,  MVT::v4i32, {  1,  2,  1,  1 } }, // pslld.
466     { ISD::SRL,  MVT::v4i32, {  1,  2,  1,  1 } }, // psrld.
467     { ISD::SRA,  MVT::v4i32, {  1,  2,  1,  1 } }, // psrad.
468     { ISD::SHL,  MVT::v8i32, {  3,  6,  4,  5 } }, // pslld + split.
469     { ISD::SRL,  MVT::v8i32, {  3,  6,  4,  5 } }, // psrld + split.
470     { ISD::SRA,  MVT::v8i32, {  3,  6,  4,  5 } }, // psrad + split.
471 
472     { ISD::SHL,  MVT::v2i64, {  1,  2,  1,  1 } }, // psllq.
473     { ISD::SRL,  MVT::v2i64, {  1,  2,  1,  1 } }, // psrlq.
474     { ISD::SRA,  MVT::v2i64, {  2,  3,  3,  3 } }, // psrad + shuffle.
475     { ISD::SHL,  MVT::v4i64, {  3,  6,  4,  5 } }, // 2 x psllq + split.
476     { ISD::SRL,  MVT::v4i64, {  3,  6,  4,  5 } }, // 2 x psllq + split.
477     { ISD::SRA,  MVT::v4i64, {  5,  7,  8,  9 } }, // 2 x psrad + shuffle + split.
478 
479     { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
480     { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
481     { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
482     { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
483   };
484 
485   // XOP has faster vXi8 shifts.
486   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
487       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
488     if (const auto *Entry =
489             CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
490       if (auto KindCost = Entry->Cost[CostKind])
491         return LT.first * *KindCost;
492 
493   static const CostKindTblEntry SSE2UniformConstCostTable[] = {
494     { ISD::SHL,  MVT::v16i8, {  1,  7,  2,  3 } }, // psllw + pand.
495     { ISD::SRL,  MVT::v16i8, {  1,  7,  2,  3 } }, // psrlw + pand.
496     { ISD::SRA,  MVT::v16i8, {  3,  9,  5,  6 } }, // psrlw, pand, pxor, psubb.
497 
498     { ISD::SHL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psllw.
499     { ISD::SRL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psrlw.
500     { ISD::SRA,  MVT::v8i16, {  1,  1,  1,  1 } }, // psraw.
501 
502     { ISD::SHL,  MVT::v4i32, {  1,  1,  1,  1 } }, // pslld
503     { ISD::SRL,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrld.
504     { ISD::SRA,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrad.
505 
506     { ISD::SHL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psllq.
507     { ISD::SRL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psrlq.
508     { ISD::SRA,  MVT::v2i64, {  3,  5,  6,  6 } }, // 2 x psrad + shuffle.
509 
510     { ISD::SDIV, MVT::v4i32, {  6 } }, // pmuludq sequence
511     { ISD::SREM, MVT::v4i32, {  8 } }, // pmuludq+mul+sub sequence
512     { ISD::UDIV, MVT::v4i32, {  5 } }, // pmuludq sequence
513     { ISD::UREM, MVT::v4i32, {  7 } }, // pmuludq+mul+sub sequence
514   };
515 
516   // XOP has faster vXi8 shifts.
517   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
518       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
519     if (const auto *Entry =
520             CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
521       if (auto KindCost = Entry->Cost[CostKind])
522         return LT.first * *KindCost;
523 
524   static const CostKindTblEntry AVX512BWConstCostTable[] = {
525     { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
526     { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
527     { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
528     { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
529 
530     { ISD::SDIV, MVT::v32i16, {  6 } }, // vpmulhw sequence
531     { ISD::SREM, MVT::v32i16, {  8 } }, // vpmulhw+mul+sub sequence
532     { ISD::UDIV, MVT::v32i16, {  6 } }, // vpmulhuw sequence
533     { ISD::UREM, MVT::v32i16, {  8 } }, // vpmulhuw+mul+sub sequence
534   };
535 
536   if (Op2Info.isConstant() && ST->hasBWI())
537     if (const auto *Entry =
538             CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
539       if (auto KindCost = Entry->Cost[CostKind])
540         return LT.first * *KindCost;
541 
542   static const CostKindTblEntry AVX512ConstCostTable[] = {
543     { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
544     { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
545     { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
546     { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
547 
548     { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
549     { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
550     { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
551     { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
552 
553     { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
554     { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
555     { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
556     { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
557   };
558 
559   if (Op2Info.isConstant() && ST->hasAVX512())
560     if (const auto *Entry =
561             CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
562       if (auto KindCost = Entry->Cost[CostKind])
563         return LT.first * *KindCost;
564 
565   static const CostKindTblEntry AVX2ConstCostTable[] = {
566     { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
567     { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
568     { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
569     { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
570 
571     { ISD::SDIV, MVT::v16i16, {  6 } }, // vpmulhw sequence
572     { ISD::SREM, MVT::v16i16, {  8 } }, // vpmulhw+mul+sub sequence
573     { ISD::UDIV, MVT::v16i16, {  6 } }, // vpmulhuw sequence
574     { ISD::UREM, MVT::v16i16, {  8 } }, // vpmulhuw+mul+sub sequence
575 
576     { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
577     { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
578     { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
579     { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
580   };
581 
582   if (Op2Info.isConstant() && ST->hasAVX2())
583     if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
584       if (auto KindCost = Entry->Cost[CostKind])
585         return LT.first * *KindCost;
586 
587   static const CostKindTblEntry AVXConstCostTable[] = {
588     { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
589     { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
590     { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
591     { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
592 
593     { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
594     { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
595     { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
596     { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
597 
598     { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
599     { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
600     { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
601     { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
602   };
603 
604   if (Op2Info.isConstant() && ST->hasAVX())
605     if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
606       if (auto KindCost = Entry->Cost[CostKind])
607         return LT.first * *KindCost;
608 
609   static const CostKindTblEntry SSE41ConstCostTable[] = {
610     { ISD::SDIV, MVT::v4i32,  { 15 } }, // vpmuludq sequence
611     { ISD::SREM, MVT::v4i32,  { 20 } }, // vpmuludq+mul+sub sequence
612   };
613 
614   if (Op2Info.isConstant() && ST->hasSSE41())
615     if (const auto *Entry =
616             CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
617       if (auto KindCost = Entry->Cost[CostKind])
618         return LT.first * *KindCost;
619 
620   static const CostKindTblEntry SSE2ConstCostTable[] = {
621     { ISD::SDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
622     { ISD::SREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
623     { ISD::UDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
624     { ISD::UREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
625 
626     { ISD::SDIV, MVT::v8i16,  {  6 } }, // pmulhw sequence
627     { ISD::SREM, MVT::v8i16,  {  8 } }, // pmulhw+mul+sub sequence
628     { ISD::UDIV, MVT::v8i16,  {  6 } }, // pmulhuw sequence
629     { ISD::UREM, MVT::v8i16,  {  8 } }, // pmulhuw+mul+sub sequence
630 
631     { ISD::SDIV, MVT::v4i32,  { 19 } }, // pmuludq sequence
632     { ISD::SREM, MVT::v4i32,  { 24 } }, // pmuludq+mul+sub sequence
633     { ISD::UDIV, MVT::v4i32,  { 15 } }, // pmuludq sequence
634     { ISD::UREM, MVT::v4i32,  { 20 } }, // pmuludq+mul+sub sequence
635   };
636 
637   if (Op2Info.isConstant() && ST->hasSSE2())
638     if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
639       if (auto KindCost = Entry->Cost[CostKind])
640         return LT.first * *KindCost;
641 
642   static const CostKindTblEntry AVX512BWUniformCostTable[] = {
643     { ISD::SHL,  MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
644     { ISD::SRL,  MVT::v16i8,  { 3,10, 5, 8 } }, // psrlw + pand.
645     { ISD::SRA,  MVT::v16i8,  { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
646     { ISD::SHL,  MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
647     { ISD::SRL,  MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
648     { ISD::SRA,  MVT::v32i8,  { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
649     { ISD::SHL,  MVT::v64i8,  { 4, 7, 6, 8 } }, // psllw + pand.
650     { ISD::SRL,  MVT::v64i8,  { 4, 8, 7,10 } }, // psrlw + pand.
651     { ISD::SRA,  MVT::v64i8,  { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
652 
653     { ISD::SHL,  MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
654     { ISD::SRL,  MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
655     { ISD::SRA,  MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
656   };
657 
658   if (ST->hasBWI() && Op2Info.isUniform())
659     if (const auto *Entry =
660             CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
661       if (auto KindCost = Entry->Cost[CostKind])
662         return LT.first * *KindCost;
663 
664   static const CostKindTblEntry AVX512UniformCostTable[] = {
665     { ISD::SHL,  MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
666     { ISD::SRL,  MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
667     { ISD::SRA,  MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
668 
669     { ISD::SHL,  MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
670     { ISD::SRL,  MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
671     { ISD::SRA,  MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
672 
673     { ISD::SRA,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psraq
674     { ISD::SHL,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psllq
675     { ISD::SRL,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psrlq
676     { ISD::SRA,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psraq
677     { ISD::SHL,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psllq
678     { ISD::SRL,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psrlq
679     { ISD::SRA,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psraq
680   };
681 
682   if (ST->hasAVX512() && Op2Info.isUniform())
683     if (const auto *Entry =
684             CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
685       if (auto KindCost = Entry->Cost[CostKind])
686         return LT.first * *KindCost;
687 
688   static const CostKindTblEntry AVX2UniformCostTable[] = {
689     // Uniform splats are cheaper for the following instructions.
690     { ISD::SHL,  MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
691     { ISD::SRL,  MVT::v16i8,  { 3, 9, 5, 8 } }, // psrlw + pand.
692     { ISD::SRA,  MVT::v16i8,  { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
693     { ISD::SHL,  MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
694     { ISD::SRL,  MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
695     { ISD::SRA,  MVT::v32i8,  { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
696 
697     { ISD::SHL,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
698     { ISD::SRL,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
699     { ISD::SRA,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
700     { ISD::SHL,  MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
701     { ISD::SRL,  MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
702     { ISD::SRA,  MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
703 
704     { ISD::SHL,  MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
705     { ISD::SRL,  MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
706     { ISD::SRA,  MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
707     { ISD::SHL,  MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
708     { ISD::SRL,  MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
709     { ISD::SRA,  MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad
710 
711     { ISD::SHL,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
712     { ISD::SRL,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
713     { ISD::SRA,  MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
714     { ISD::SHL,  MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
715     { ISD::SRL,  MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
716     { ISD::SRA,  MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
717   };
718 
719   if (ST->hasAVX2() && Op2Info.isUniform())
720     if (const auto *Entry =
721             CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
722       if (auto KindCost = Entry->Cost[CostKind])
723         return LT.first * *KindCost;
724 
725   static const CostKindTblEntry AVXUniformCostTable[] = {
726     { ISD::SHL,  MVT::v16i8,  {  4, 4, 6, 8 } }, // psllw + pand.
727     { ISD::SRL,  MVT::v16i8,  {  4, 8, 5, 8 } }, // psrlw + pand.
728     { ISD::SRA,  MVT::v16i8,  {  6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
729     { ISD::SHL,  MVT::v32i8,  {  7, 8,11,14 } }, // psllw + pand + split.
730     { ISD::SRL,  MVT::v32i8,  {  7, 9,10,14 } }, // psrlw + pand + split.
731     { ISD::SRA,  MVT::v32i8,  { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
732 
733     { ISD::SHL,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psllw.
734     { ISD::SRL,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psrlw.
735     { ISD::SRA,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psraw.
736     { ISD::SHL,  MVT::v16i16, {  3, 7, 5, 7 } }, // psllw + split.
737     { ISD::SRL,  MVT::v16i16, {  3, 7, 5, 7 } }, // psrlw + split.
738     { ISD::SRA,  MVT::v16i16, {  3, 7, 5, 7 } }, // psraw + split.
739 
740     { ISD::SHL,  MVT::v4i32,  {  1, 3, 1, 2 } }, // pslld.
741     { ISD::SRL,  MVT::v4i32,  {  1, 3, 1, 2 } }, // psrld.
742     { ISD::SRA,  MVT::v4i32,  {  1, 3, 1, 2 } }, // psrad.
743     { ISD::SHL,  MVT::v8i32,  {  3, 7, 5, 7 } }, // pslld + split.
744     { ISD::SRL,  MVT::v8i32,  {  3, 7, 5, 7 } }, // psrld + split.
745     { ISD::SRA,  MVT::v8i32,  {  3, 7, 5, 7 } }, // psrad + split.
746 
747     { ISD::SHL,  MVT::v2i64,  {  1, 3, 1, 2 } }, // psllq.
748     { ISD::SRL,  MVT::v2i64,  {  1, 3, 1, 2 } }, // psrlq.
749     { ISD::SRA,  MVT::v2i64,  {  3, 4, 5, 7 } }, // 2 x psrad + shuffle.
750     { ISD::SHL,  MVT::v4i64,  {  3, 7, 4, 6 } }, // psllq + split.
751     { ISD::SRL,  MVT::v4i64,  {  3, 7, 4, 6 } }, // psrlq + split.
752     { ISD::SRA,  MVT::v4i64,  {  6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
753   };
754 
755   // XOP has faster vXi8 shifts.
756   if (ST->hasAVX() && Op2Info.isUniform() &&
757       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
758     if (const auto *Entry =
759             CostTableLookup(AVXUniformCostTable, ISD, LT.second))
760       if (auto KindCost = Entry->Cost[CostKind])
761         return LT.first * *KindCost;
762 
763   static const CostKindTblEntry SSE2UniformCostTable[] = {
764     // Uniform splats are cheaper for the following instructions.
765     { ISD::SHL,  MVT::v16i8, {  9, 10, 6, 9 } }, // psllw + pand.
766     { ISD::SRL,  MVT::v16i8, {  9, 13, 5, 9 } }, // psrlw + pand.
767     { ISD::SRA,  MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
768 
769     { ISD::SHL,  MVT::v8i16, {  2, 2, 1, 2 } }, // psllw.
770     { ISD::SRL,  MVT::v8i16, {  2, 2, 1, 2 } }, // psrlw.
771     { ISD::SRA,  MVT::v8i16, {  2, 2, 1, 2 } }, // psraw.
772 
773     { ISD::SHL,  MVT::v4i32, {  2, 2, 1, 2 } }, // pslld
774     { ISD::SRL,  MVT::v4i32, {  2, 2, 1, 2 } }, // psrld.
775     { ISD::SRA,  MVT::v4i32, {  2, 2, 1, 2 } }, // psrad.
776 
777     { ISD::SHL,  MVT::v2i64, {  2, 2, 1, 2 } }, // psllq.
778     { ISD::SRL,  MVT::v2i64, {  2, 2, 1, 2 } }, // psrlq.
779     { ISD::SRA,  MVT::v2i64, {  5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
780   };
781 
782   if (ST->hasSSE2() && Op2Info.isUniform() &&
783       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
784     if (const auto *Entry =
785             CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
786       if (auto KindCost = Entry->Cost[CostKind])
787         return LT.first * *KindCost;
788 
789   static const CostKindTblEntry AVX512DQCostTable[] = {
790     { ISD::MUL,  MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
791     { ISD::MUL,  MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
792     { ISD::MUL,  MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
793   };
794 
795   // Look for AVX512DQ lowering tricks for custom cases.
796   if (ST->hasDQI())
797     if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
798       if (auto KindCost = Entry->Cost[CostKind])
799         return LT.first * *KindCost;
800 
801   static const CostKindTblEntry AVX512BWCostTable[] = {
802     { ISD::SHL,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsllvw/pack sequence.
803     { ISD::SRL,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
804     { ISD::SRA,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsravw/pack sequence.
805     { ISD::SHL,   MVT::v32i8,   {  4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
806     { ISD::SRL,   MVT::v32i8,   {  4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
807     { ISD::SRA,   MVT::v32i8,   {  6, 13,24,30 } }, // extend/vpsravw/pack sequence.
808     { ISD::SHL,   MVT::v64i8,   {  6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
809     { ISD::SRL,   MVT::v64i8,   {  7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
810     { ISD::SRA,   MVT::v64i8,   { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
811 
812     { ISD::SHL,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsllvw
813     { ISD::SRL,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsrlvw
814     { ISD::SRA,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsravw
815     { ISD::SHL,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsllvw
816     { ISD::SRL,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsrlvw
817     { ISD::SRA,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsravw
818     { ISD::SHL,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsllvw
819     { ISD::SRL,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsrlvw
820     { ISD::SRA,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsravw
821 
822     { ISD::ADD,   MVT::v64i8,   {  1,  1, 1, 1 } }, // paddb
823     { ISD::ADD,   MVT::v32i16,  {  1,  1, 1, 1 } }, // paddw
824 
825     { ISD::ADD,   MVT::v32i8,   {  1,  1, 1, 1 } }, // paddb
826     { ISD::ADD,   MVT::v16i16,  {  1,  1, 1, 1 } }, // paddw
827     { ISD::ADD,   MVT::v8i32,   {  1,  1, 1, 1 } }, // paddd
828     { ISD::ADD,   MVT::v4i64,   {  1,  1, 1, 1 } }, // paddq
829 
830     { ISD::SUB,   MVT::v64i8,   {  1,  1, 1, 1 } }, // psubb
831     { ISD::SUB,   MVT::v32i16,  {  1,  1, 1, 1 } }, // psubw
832 
833     { ISD::MUL,   MVT::v64i8,   {  5, 10,10,11 } },
834     { ISD::MUL,   MVT::v32i16,  {  1,  5, 1, 1 } }, // pmullw
835 
836     { ISD::SUB,   MVT::v32i8,   {  1,  1, 1, 1 } }, // psubb
837     { ISD::SUB,   MVT::v16i16,  {  1,  1, 1, 1 } }, // psubw
838     { ISD::SUB,   MVT::v8i32,   {  1,  1, 1, 1 } }, // psubd
839     { ISD::SUB,   MVT::v4i64,   {  1,  1, 1, 1 } }, // psubq
840   };
841 
842   // Look for AVX512BW lowering tricks for custom cases.
843   if (ST->hasBWI())
844     if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
845       if (auto KindCost = Entry->Cost[CostKind])
846         return LT.first * *KindCost;
847 
848   static const CostKindTblEntry AVX512CostTable[] = {
849     { ISD::SHL,     MVT::v64i8,   { 15, 19,27,33 } }, // vpblendv+split sequence.
850     { ISD::SRL,     MVT::v64i8,   { 15, 19,30,36 } }, // vpblendv+split sequence.
851     { ISD::SRA,     MVT::v64i8,   { 37, 37,51,63 } }, // vpblendv+split sequence.
852 
    { ISD::SHL,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence.
854     { ISD::SRL,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
855     { ISD::SRA,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
856 
857     { ISD::SHL,     MVT::v4i32,   {  1,  1, 1, 1 } },
858     { ISD::SRL,     MVT::v4i32,   {  1,  1, 1, 1 } },
859     { ISD::SRA,     MVT::v4i32,   {  1,  1, 1, 1 } },
860     { ISD::SHL,     MVT::v8i32,   {  1,  1, 1, 1 } },
861     { ISD::SRL,     MVT::v8i32,   {  1,  1, 1, 1 } },
862     { ISD::SRA,     MVT::v8i32,   {  1,  1, 1, 1 } },
863     { ISD::SHL,     MVT::v16i32,  {  1,  1, 1, 1 } },
864     { ISD::SRL,     MVT::v16i32,  {  1,  1, 1, 1 } },
865     { ISD::SRA,     MVT::v16i32,  {  1,  1, 1, 1 } },
866 
867     { ISD::SHL,     MVT::v2i64,   {  1,  1, 1, 1 } },
868     { ISD::SRL,     MVT::v2i64,   {  1,  1, 1, 1 } },
869     { ISD::SRA,     MVT::v2i64,   {  1,  1, 1, 1 } },
870     { ISD::SHL,     MVT::v4i64,   {  1,  1, 1, 1 } },
871     { ISD::SRL,     MVT::v4i64,   {  1,  1, 1, 1 } },
872     { ISD::SRA,     MVT::v4i64,   {  1,  1, 1, 1 } },
873     { ISD::SHL,     MVT::v8i64,   {  1,  1, 1, 1 } },
874     { ISD::SRL,     MVT::v8i64,   {  1,  1, 1, 1 } },
875     { ISD::SRA,     MVT::v8i64,   {  1,  1, 1, 1 } },
876 
877     { ISD::ADD,     MVT::v64i8,   {  3,  7, 5, 5 } }, // 2*paddb + split
878     { ISD::ADD,     MVT::v32i16,  {  3,  7, 5, 5 } }, // 2*paddw + split
879 
880     { ISD::SUB,     MVT::v64i8,   {  3,  7, 5, 5 } }, // 2*psubb + split
881     { ISD::SUB,     MVT::v32i16,  {  3,  7, 5, 5 } }, // 2*psubw + split
882 
883     { ISD::AND,     MVT::v32i8,   {  1,  1, 1, 1 } },
884     { ISD::AND,     MVT::v16i16,  {  1,  1, 1, 1 } },
885     { ISD::AND,     MVT::v8i32,   {  1,  1, 1, 1 } },
886     { ISD::AND,     MVT::v4i64,   {  1,  1, 1, 1 } },
887 
888     { ISD::OR,      MVT::v32i8,   {  1,  1, 1, 1 } },
889     { ISD::OR,      MVT::v16i16,  {  1,  1, 1, 1 } },
890     { ISD::OR,      MVT::v8i32,   {  1,  1, 1, 1 } },
891     { ISD::OR,      MVT::v4i64,   {  1,  1, 1, 1 } },
892 
893     { ISD::XOR,     MVT::v32i8,   {  1,  1, 1, 1 } },
894     { ISD::XOR,     MVT::v16i16,  {  1,  1, 1, 1 } },
895     { ISD::XOR,     MVT::v8i32,   {  1,  1, 1, 1 } },
896     { ISD::XOR,     MVT::v4i64,   {  1,  1, 1, 1 } },
897 
898     { ISD::MUL,     MVT::v16i32,  {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
899     { ISD::MUL,     MVT::v8i32,   {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
900     { ISD::MUL,     MVT::v4i32,   {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
901     { ISD::MUL,     MVT::v8i64,   {  6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
902     { ISD::MUL,     MVT::i64,     {  1 } }, // Skylake from http://www.agner.org/
903 
904     { X86ISD::PMULUDQ, MVT::v8i64, { 1,  5, 1, 1 } },
905 
906     { ISD::FNEG,    MVT::v8f64,   {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
907     { ISD::FADD,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
908     { ISD::FADD,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
909     { ISD::FSUB,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
910     { ISD::FSUB,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
911     { ISD::FMUL,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
912     { ISD::FMUL,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
913     { ISD::FMUL,    MVT::v2f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
914     { ISD::FMUL,    MVT::f64,     {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
915 
916     { ISD::FDIV,    MVT::f64,     {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
917     { ISD::FDIV,    MVT::v2f64,   {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
918     { ISD::FDIV,    MVT::v4f64,   {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
919     { ISD::FDIV,    MVT::v8f64,   { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
920 
921     { ISD::FNEG,    MVT::v16f32,  {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
922     { ISD::FADD,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
923     { ISD::FADD,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
924     { ISD::FSUB,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
925     { ISD::FSUB,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
926     { ISD::FMUL,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
927     { ISD::FMUL,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
928     { ISD::FMUL,    MVT::v4f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
929     { ISD::FMUL,    MVT::f32,     {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
930 
931     { ISD::FDIV,    MVT::f32,     {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
932     { ISD::FDIV,    MVT::v4f32,   {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
933     { ISD::FDIV,    MVT::v8f32,   {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
934     { ISD::FDIV,    MVT::v16f32,  { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
935   };
936 
937   if (ST->hasAVX512())
938     if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
939       if (auto KindCost = Entry->Cost[CostKind])
940         return LT.first * *KindCost;
941 
942   static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them as
    // custom in order to detect the cases where the shift amount is a scalar.
945     { ISD::SHL,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
946     { ISD::SRL,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
947     { ISD::SRA,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
948     { ISD::SHL,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
949     { ISD::SRL,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
950     { ISD::SRA,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
951     { ISD::SHL,     MVT::v2i64,  { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
952     { ISD::SRL,     MVT::v2i64,  { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
953     { ISD::SHL,     MVT::v4i64,  { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
954     { ISD::SRL,     MVT::v4i64,  { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
955   };
956 
957   if (ST->hasAVX512()) {
958     if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
959       // On AVX512, a packed v32i16 shift left by a constant build_vector
960       // is lowered into a vector multiply (vpmullw).
961       return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
962                                     Op1Info.getNoProps(), Op2Info.getNoProps());
963   }
964 
965   // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
966   if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
967     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
968         Op2Info.isConstant())
969       // On AVX2, a packed v16i16 shift left by a constant build_vector
970       // is lowered into a vector multiply (vpmullw).
971       return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
972                                     Op1Info.getNoProps(), Op2Info.getNoProps());
973 
974     if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
975       if (auto KindCost = Entry->Cost[CostKind])
976         return LT.first * *KindCost;
977   }
978 
979   static const CostKindTblEntry XOPShiftCostTable[] = {
980     // 128bit shifts take 1cy, but right shifts require negation beforehand.
981     { ISD::SHL,     MVT::v16i8,  { 1, 3, 1, 1 } },
982     { ISD::SRL,     MVT::v16i8,  { 2, 3, 1, 1 } },
983     { ISD::SRA,     MVT::v16i8,  { 2, 3, 1, 1 } },
984     { ISD::SHL,     MVT::v8i16,  { 1, 3, 1, 1 } },
985     { ISD::SRL,     MVT::v8i16,  { 2, 3, 1, 1 } },
986     { ISD::SRA,     MVT::v8i16,  { 2, 3, 1, 1 } },
987     { ISD::SHL,     MVT::v4i32,  { 1, 3, 1, 1 } },
988     { ISD::SRL,     MVT::v4i32,  { 2, 3, 1, 1 } },
989     { ISD::SRA,     MVT::v4i32,  { 2, 3, 1, 1 } },
990     { ISD::SHL,     MVT::v2i64,  { 1, 3, 1, 1 } },
991     { ISD::SRL,     MVT::v2i64,  { 2, 3, 1, 1 } },
992     { ISD::SRA,     MVT::v2i64,  { 2, 3, 1, 1 } },
993     // 256bit shifts require splitting if AVX2 didn't catch them above.
994     { ISD::SHL,     MVT::v32i8,  { 4, 7, 5, 6 } },
995     { ISD::SRL,     MVT::v32i8,  { 6, 7, 5, 6 } },
996     { ISD::SRA,     MVT::v32i8,  { 6, 7, 5, 6 } },
997     { ISD::SHL,     MVT::v16i16, { 4, 7, 5, 6 } },
998     { ISD::SRL,     MVT::v16i16, { 6, 7, 5, 6 } },
999     { ISD::SRA,     MVT::v16i16, { 6, 7, 5, 6 } },
1000     { ISD::SHL,     MVT::v8i32,  { 4, 7, 5, 6 } },
1001     { ISD::SRL,     MVT::v8i32,  { 6, 7, 5, 6 } },
1002     { ISD::SRA,     MVT::v8i32,  { 6, 7, 5, 6 } },
1003     { ISD::SHL,     MVT::v4i64,  { 4, 7, 5, 6 } },
1004     { ISD::SRL,     MVT::v4i64,  { 6, 7, 5, 6 } },
1005     { ISD::SRA,     MVT::v4i64,  { 6, 7, 5, 6 } },
1006   };
1007 
1008   // Look for XOP lowering tricks.
1009   if (ST->hasXOP()) {
1010     // If the right shift is constant then we'll fold the negation so
1011     // it's as cheap as a left shift.
1012     int ShiftISD = ISD;
1013     if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1014       ShiftISD = ISD::SHL;
1015     if (const auto *Entry =
1016             CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1017       if (auto KindCost = Entry->Cost[CostKind])
1018         return LT.first * *KindCost;
1019   }
1020 
1021   if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1022     MVT VT = LT.second;
1023     // Vector shift left by non uniform constant can be lowered
1024     // into vector multiply.
1025     if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1026         ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1027       ISD = ISD::MUL;
1028   }
1029 
1030   static const CostKindTblEntry GLMCostTable[] = {
1031     { ISD::FDIV,  MVT::f32,   { 18, 19, 1, 1 } }, // divss
1032     { ISD::FDIV,  MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1033     { ISD::FDIV,  MVT::f64,   { 33, 34, 1, 1 } }, // divsd
1034     { ISD::FDIV,  MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1035   };
1036 
1037   if (ST->useGLMDivSqrtCosts())
1038     if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1039       if (auto KindCost = Entry->Cost[CostKind])
1040         return LT.first * *KindCost;
1041 
1042   static const CostKindTblEntry SLMCostTable[] = {
1043     { ISD::MUL,   MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1044     { ISD::MUL,   MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
1045     { ISD::FMUL,  MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
1046     { ISD::FMUL,  MVT::f32,   {  1,  4, 1, 1 } }, // mulss
1047     { ISD::FMUL,  MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
1048     { ISD::FMUL,  MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
1049     { ISD::FDIV,  MVT::f32,   { 17, 19, 1, 1 } }, // divss
1050     { ISD::FDIV,  MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1051     { ISD::FDIV,  MVT::f64,   { 32, 34, 1, 1 } }, // divsd
1052     { ISD::FDIV,  MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1053     { ISD::FADD,  MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
1054     { ISD::FSUB,  MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies(3), shifts(3) and adds(2).
    // slm muldq throughput is 2 and addq throughput is 4,
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    //       2X4 (addq throughput) = 17
1060     { ISD::MUL,   MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
1062     { ISD::ADD,   MVT::v2i64, {  4,  2, 1, 2 } },
1063     { ISD::SUB,   MVT::v2i64, {  4,  2, 1, 2 } },
1064   };
1065 
1066   if (ST->useSLMArithCosts())
1067     if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1068       if (auto KindCost = Entry->Cost[CostKind])
1069         return LT.first * *KindCost;
1070 
1071   static const CostKindTblEntry AVX2CostTable[] = {
1072     { ISD::SHL,  MVT::v16i8,   {  6, 21,11,16 } }, // vpblendvb sequence.
1073     { ISD::SHL,  MVT::v32i8,   {  6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,   {  5, 18, 5,10 } }, // extend/vpsllvd/pack sequence.
    { ISD::SHL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsllvd/pack sequence.
1076 
1077     { ISD::SRL,  MVT::v16i8,   {  6, 27,12,18 } }, // vpblendvb sequence.
1078     { ISD::SRL,  MVT::v32i8,   {  8, 30,12,24 } }, // vpblendvb sequence.
1079     { ISD::SRL,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1080     { ISD::SRL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1081 
1082     { ISD::SRA,  MVT::v16i8,   { 17, 17,24,30 } }, // vpblendvb sequence.
1083     { ISD::SRA,  MVT::v32i8,   { 18, 20,24,43 } }, // vpblendvb sequence.
1084     { ISD::SRA,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1085     { ISD::SRA,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1086     { ISD::SRA,  MVT::v2i64,   {  4,  5, 5, 5 } }, // srl/xor/sub sequence.
1087     { ISD::SRA,  MVT::v4i64,   {  8,  8, 5, 9 } }, // srl/xor/sub sequence.
1088 
1089     { ISD::SUB,  MVT::v32i8,   {  1,  1, 1, 2 } }, // psubb
1090     { ISD::ADD,  MVT::v32i8,   {  1,  1, 1, 2 } }, // paddb
1091     { ISD::SUB,  MVT::v16i16,  {  1,  1, 1, 2 } }, // psubw
1092     { ISD::ADD,  MVT::v16i16,  {  1,  1, 1, 2 } }, // paddw
1093     { ISD::SUB,  MVT::v8i32,   {  1,  1, 1, 2 } }, // psubd
1094     { ISD::ADD,  MVT::v8i32,   {  1,  1, 1, 2 } }, // paddd
1095     { ISD::SUB,  MVT::v4i64,   {  1,  1, 1, 2 } }, // psubq
1096     { ISD::ADD,  MVT::v4i64,   {  1,  1, 1, 2 } }, // paddq
1097 
1098     { ISD::MUL,  MVT::v16i8,   {  5, 18, 6,12 } }, // extend/pmullw/pack
1099     { ISD::MUL,  MVT::v32i8,   {  6, 11,10,19 } }, // unpack/pmullw
1100     { ISD::MUL,  MVT::v16i16,  {  2,  5, 1, 2 } }, // pmullw
1101     { ISD::MUL,  MVT::v8i32,   {  4, 10, 1, 2 } }, // pmulld
1102     { ISD::MUL,  MVT::v4i32,   {  2, 10, 1, 2 } }, // pmulld
1103     { ISD::MUL,  MVT::v4i64,   {  6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1104     { ISD::MUL,  MVT::v2i64,   {  6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1105 
1106     { X86ISD::PMULUDQ, MVT::v4i64, { 1,  5, 1, 1 } },
1107 
1108     { ISD::FNEG, MVT::v4f64,   {  1,  1, 1, 2 } }, // vxorpd
1109     { ISD::FNEG, MVT::v8f32,   {  1,  1, 1, 2 } }, // vxorps
1110 
1111     { ISD::FADD, MVT::f64,     {  1,  4, 1, 1 } }, // vaddsd
1112     { ISD::FADD, MVT::f32,     {  1,  4, 1, 1 } }, // vaddss
1113     { ISD::FADD, MVT::v2f64,   {  1,  4, 1, 1 } }, // vaddpd
1114     { ISD::FADD, MVT::v4f32,   {  1,  4, 1, 1 } }, // vaddps
1115     { ISD::FADD, MVT::v4f64,   {  1,  4, 1, 2 } }, // vaddpd
1116     { ISD::FADD, MVT::v8f32,   {  1,  4, 1, 2 } }, // vaddps
1117 
1118     { ISD::FSUB, MVT::f64,     {  1,  4, 1, 1 } }, // vsubsd
1119     { ISD::FSUB, MVT::f32,     {  1,  4, 1, 1 } }, // vsubss
1120     { ISD::FSUB, MVT::v2f64,   {  1,  4, 1, 1 } }, // vsubpd
1121     { ISD::FSUB, MVT::v4f32,   {  1,  4, 1, 1 } }, // vsubps
1122     { ISD::FSUB, MVT::v4f64,   {  1,  4, 1, 2 } }, // vsubpd
1123     { ISD::FSUB, MVT::v8f32,   {  1,  4, 1, 2 } }, // vsubps
1124 
1125     { ISD::FMUL, MVT::f64,     {  1,  5, 1, 1 } }, // vmulsd
1126     { ISD::FMUL, MVT::f32,     {  1,  5, 1, 1 } }, // vmulss
1127     { ISD::FMUL, MVT::v2f64,   {  1,  5, 1, 1 } }, // vmulpd
1128     { ISD::FMUL, MVT::v4f32,   {  1,  5, 1, 1 } }, // vmulps
1129     { ISD::FMUL, MVT::v4f64,   {  1,  5, 1, 2 } }, // vmulpd
1130     { ISD::FMUL, MVT::v8f32,   {  1,  5, 1, 2 } }, // vmulps
1131 
1132     { ISD::FDIV, MVT::f32,     {  7, 13, 1, 1 } }, // vdivss
1133     { ISD::FDIV, MVT::v4f32,   {  7, 13, 1, 1 } }, // vdivps
1134     { ISD::FDIV, MVT::v8f32,   { 14, 21, 1, 3 } }, // vdivps
1135     { ISD::FDIV, MVT::f64,     { 14, 20, 1, 1 } }, // vdivsd
1136     { ISD::FDIV, MVT::v2f64,   { 14, 20, 1, 1 } }, // vdivpd
1137     { ISD::FDIV, MVT::v4f64,   { 28, 35, 1, 3 } }, // vdivpd
1138   };
1139 
1140   // Look for AVX2 lowering tricks for custom cases.
1141   if (ST->hasAVX2())
1142     if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1143       if (auto KindCost = Entry->Cost[CostKind])
1144         return LT.first * *KindCost;
1145 
1146   static const CostKindTblEntry AVX1CostTable[] = {
1147     // We don't have to scalarize unsupported ops. We can issue two half-sized
1148     // operations and we only need to extract the upper YMM half.
1149     // Two ops + 1 extract + 1 insert = 4.
1150     { ISD::MUL,     MVT::v32i8,   { 12, 13, 22, 23 } }, // unpack/pmullw + split
1151     { ISD::MUL,     MVT::v16i16,  {  4,  8,  5,  6 } }, // pmullw + split
1152     { ISD::MUL,     MVT::v8i32,   {  5,  8,  5, 10 } }, // pmulld + split
1153     { ISD::MUL,     MVT::v4i32,   {  2,  5,  1,  3 } }, // pmulld
1154     { ISD::MUL,     MVT::v4i64,   { 12, 15, 19, 20 } },
1155 
1156     { ISD::AND,     MVT::v32i8,   {  1,  1, 1, 2 } }, // vandps
1157     { ISD::AND,     MVT::v16i16,  {  1,  1, 1, 2 } }, // vandps
1158     { ISD::AND,     MVT::v8i32,   {  1,  1, 1, 2 } }, // vandps
1159     { ISD::AND,     MVT::v4i64,   {  1,  1, 1, 2 } }, // vandps
1160 
1161     { ISD::OR,      MVT::v32i8,   {  1,  1, 1, 2 } }, // vorps
1162     { ISD::OR,      MVT::v16i16,  {  1,  1, 1, 2 } }, // vorps
1163     { ISD::OR,      MVT::v8i32,   {  1,  1, 1, 2 } }, // vorps
1164     { ISD::OR,      MVT::v4i64,   {  1,  1, 1, 2 } }, // vorps
1165 
1166     { ISD::XOR,     MVT::v32i8,   {  1,  1, 1, 2 } }, // vxorps
1167     { ISD::XOR,     MVT::v16i16,  {  1,  1, 1, 2 } }, // vxorps
1168     { ISD::XOR,     MVT::v8i32,   {  1,  1, 1, 2 } }, // vxorps
1169     { ISD::XOR,     MVT::v4i64,   {  1,  1, 1, 2 } }, // vxorps
1170 
1171     { ISD::SUB,     MVT::v32i8,   {  4,  2, 5, 6 } }, // psubb + split
1172     { ISD::ADD,     MVT::v32i8,   {  4,  2, 5, 6 } }, // paddb + split
1173     { ISD::SUB,     MVT::v16i16,  {  4,  2, 5, 6 } }, // psubw + split
1174     { ISD::ADD,     MVT::v16i16,  {  4,  2, 5, 6 } }, // paddw + split
1175     { ISD::SUB,     MVT::v8i32,   {  4,  2, 5, 6 } }, // psubd + split
1176     { ISD::ADD,     MVT::v8i32,   {  4,  2, 5, 6 } }, // paddd + split
1177     { ISD::SUB,     MVT::v4i64,   {  4,  2, 5, 6 } }, // psubq + split
1178     { ISD::ADD,     MVT::v4i64,   {  4,  2, 5, 6 } }, // paddq + split
1179     { ISD::SUB,     MVT::v2i64,   {  1,  1, 1, 1 } }, // psubq
1180     { ISD::ADD,     MVT::v2i64,   {  1,  1, 1, 1 } }, // paddq
1181 
1182     { ISD::SHL,     MVT::v16i8,   { 10, 21,11,17 } }, // pblendvb sequence.
1183     { ISD::SHL,     MVT::v32i8,   { 22, 22,27,40 } }, // pblendvb sequence + split.
1184     { ISD::SHL,     MVT::v8i16,   {  6,  9,11,11 } }, // pblendvb sequence.
1185     { ISD::SHL,     MVT::v16i16,  { 13, 16,24,25 } }, // pblendvb sequence + split.
1186     { ISD::SHL,     MVT::v4i32,   {  3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1187     { ISD::SHL,     MVT::v8i32,   {  9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1188     { ISD::SHL,     MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
1189     { ISD::SHL,     MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.
1190 
1191     { ISD::SRL,     MVT::v16i8,   { 11, 27,12,18 } }, // pblendvb sequence.
1192     { ISD::SRL,     MVT::v32i8,   { 23, 23,30,43 } }, // pblendvb sequence + split.
1193     { ISD::SRL,     MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
1194     { ISD::SRL,     MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
1195     { ISD::SRL,     MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
1196     { ISD::SRL,     MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
1197     { ISD::SRL,     MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
1198     { ISD::SRL,     MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.
1199 
1200     { ISD::SRA,     MVT::v16i8,   { 21, 22,24,36 } }, // pblendvb sequence.
1201     { ISD::SRA,     MVT::v32i8,   { 44, 45,51,76 } }, // pblendvb sequence + split.
1202     { ISD::SRA,     MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
1203     { ISD::SRA,     MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
1204     { ISD::SRA,     MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
1205     { ISD::SRA,     MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
1206     { ISD::SRA,     MVT::v2i64,   {  5,  6,10,14 } }, // Shift each lane + blend.
1207     { ISD::SRA,     MVT::v4i64,   { 12, 12,22,30 } }, // Shift each lane + blend + split.
1208 
1209     { ISD::FNEG,    MVT::v4f64,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1210     { ISD::FNEG,    MVT::v8f32,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1211 
1212     { ISD::FADD,    MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1213     { ISD::FADD,    MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1214     { ISD::FADD,    MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1215     { ISD::FADD,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1216     { ISD::FADD,    MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1217     { ISD::FADD,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1218 
1219     { ISD::FSUB,    MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1220     { ISD::FSUB,    MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1221     { ISD::FSUB,    MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1222     { ISD::FSUB,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1223     { ISD::FSUB,    MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1224     { ISD::FSUB,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1225 
1226     { ISD::FMUL,    MVT::f64,     {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1227     { ISD::FMUL,    MVT::f32,     {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1228     { ISD::FMUL,    MVT::v2f64,   {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1229     { ISD::FMUL,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1230     { ISD::FMUL,    MVT::v4f64,   {  4,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1231     { ISD::FMUL,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1232 
1233     { ISD::FDIV,    MVT::f32,     { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1234     { ISD::FDIV,    MVT::v4f32,   { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1235     { ISD::FDIV,    MVT::v8f32,   { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1236     { ISD::FDIV,    MVT::f64,     { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1237     { ISD::FDIV,    MVT::v2f64,   { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1238     { ISD::FDIV,    MVT::v4f64,   { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1239   };
1240 
1241   if (ST->hasAVX())
1242     if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1243       if (auto KindCost = Entry->Cost[CostKind])
1244         return LT.first * *KindCost;
1245 
1246   static const CostKindTblEntry SSE42CostTable[] = {
1247     { ISD::FADD, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1248     { ISD::FADD, MVT::f32,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1249     { ISD::FADD, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1250     { ISD::FADD, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1251 
1252     { ISD::FSUB, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1253     { ISD::FSUB, MVT::f32 ,   {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1254     { ISD::FSUB, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1255     { ISD::FSUB, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1256 
1257     { ISD::FMUL, MVT::f64,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1258     { ISD::FMUL, MVT::f32,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1259     { ISD::FMUL, MVT::v2f64,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1260     { ISD::FMUL, MVT::v4f32,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1261 
1262     { ISD::FDIV,  MVT::f32,   { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1263     { ISD::FDIV,  MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1264     { ISD::FDIV,  MVT::f64,   { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1265     { ISD::FDIV,  MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1266 
1267     { ISD::MUL,   MVT::v2i64, {  6, 10,10,10 } }  // 3*pmuludq/3*shift/2*add
1268   };
1269 
1270   if (ST->hasSSE42())
1271     if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1272       if (auto KindCost = Entry->Cost[CostKind])
1273         return LT.first * *KindCost;
1274 
1275   static const CostKindTblEntry SSE41CostTable[] = {
1276     { ISD::SHL,  MVT::v16i8,  { 15, 24,17,22 } }, // pblendvb sequence.
1277     { ISD::SHL,  MVT::v8i16,  { 11, 14,11,11 } }, // pblendvb sequence.
1278     { ISD::SHL,  MVT::v4i32,  { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1279 
1280     { ISD::SRL,  MVT::v16i8,  { 16, 27,18,24 } }, // pblendvb sequence.
1281     { ISD::SRL,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
1282     { ISD::SRL,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
1283     { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1284 
1285     { ISD::SRA,  MVT::v16i8,  { 38, 41,30,36 } }, // pblendvb sequence.
1286     { ISD::SRA,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
1287     { ISD::SRA,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
1288     { ISD::SRA,  MVT::v2i64,  {  8, 17, 5, 7 } }, // splat+shuffle sequence.
1289 
1290     { ISD::MUL,  MVT::v16i8,  {  5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1291     { ISD::MUL,  MVT::v4i32,  {  2, 11, 1, 1 } }  // pmulld (Nehalem from agner.org)
1292   };
1293 
1294   if (ST->hasSSE41())
1295     if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1296       if (auto KindCost = Entry->Cost[CostKind])
1297         return LT.first * *KindCost;
1298 
1299   static const CostKindTblEntry SSE2CostTable[] = {
1300     // We don't correctly identify costs of casts because they are marked as
1301     // custom.
1302     { ISD::SHL,  MVT::v16i8,  { 13, 21,26,28 } }, // cmpgtb sequence.
1303     { ISD::SHL,  MVT::v8i16,  { 24, 27,16,20 } }, // cmpgtw sequence.
1304     { ISD::SHL,  MVT::v4i32,  { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1305     { ISD::SHL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1306 
1307     { ISD::SRL,  MVT::v16i8,  { 14, 28,27,30 } }, // cmpgtb sequence.
1308     { ISD::SRL,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
1309     { ISD::SRL,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
1310     { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1311 
1312     { ISD::SRA,  MVT::v16i8,  { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1313     { ISD::SRA,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
1314     { ISD::SRA,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
1315     { ISD::SRA,  MVT::v2i64,  {  8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1316 
1317     { ISD::AND,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pand
1318     { ISD::AND,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pand
1319     { ISD::AND,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pand
1320     { ISD::AND,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pand
1321 
1322     { ISD::OR,   MVT::v16i8,  {  1,  1, 1, 1 } }, // por
1323     { ISD::OR,   MVT::v8i16,  {  1,  1, 1, 1 } }, // por
1324     { ISD::OR,   MVT::v4i32,  {  1,  1, 1, 1 } }, // por
1325     { ISD::OR,   MVT::v2i64,  {  1,  1, 1, 1 } }, // por
1326 
1327     { ISD::XOR,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pxor
1328     { ISD::XOR,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pxor
1329     { ISD::XOR,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pxor
1330     { ISD::XOR,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pxor
1331 
1332     { ISD::ADD,  MVT::v2i64,  {  1,  2, 1, 2 } }, // paddq
1333     { ISD::SUB,  MVT::v2i64,  {  1,  2, 1, 2 } }, // psubq
1334 
1335     { ISD::MUL,  MVT::v16i8,  {  5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1336     { ISD::MUL,  MVT::v8i16,  {  1,  5, 1, 1 } }, // pmullw
1337     { ISD::MUL,  MVT::v4i32,  {  6,  8, 7, 7 } }, // 3*pmuludq/4*shuffle
1338     { ISD::MUL,  MVT::v2i64,  {  7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1339 
1340     { X86ISD::PMULUDQ, MVT::v2i64, { 1,  5, 1, 1 } },
1341 
1342     { ISD::FDIV, MVT::f32,    { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1343     { ISD::FDIV, MVT::v4f32,  { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1344     { ISD::FDIV, MVT::f64,    { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1345     { ISD::FDIV, MVT::v2f64,  { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1346 
1347     { ISD::FNEG, MVT::f32,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1348     { ISD::FNEG, MVT::f64,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1349     { ISD::FNEG, MVT::v4f32,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1350     { ISD::FNEG, MVT::v2f64,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1351 
1352     { ISD::FADD, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1353     { ISD::FADD, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1354     { ISD::FADD, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1355 
1356     { ISD::FSUB, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1357     { ISD::FSUB, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1358     { ISD::FSUB, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1359 
1360     { ISD::FMUL, MVT::f64,    {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1361     { ISD::FMUL, MVT::v2f64,  {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362   };
1363 
1364   if (ST->hasSSE2())
1365     if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1366       if (auto KindCost = Entry->Cost[CostKind])
1367         return LT.first * *KindCost;
1368 
1369   static const CostKindTblEntry SSE1CostTable[] = {
1370     { ISD::FDIV, MVT::f32,   { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1371     { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1372 
1373     { ISD::FNEG, MVT::f32,   {  2,  2, 1, 2 } }, // Pentium III from http://www.agner.org/
1374     { ISD::FNEG, MVT::v4f32, {  2,  2, 1, 2 } }, // Pentium III from http://www.agner.org/
1375 
1376     { ISD::FADD, MVT::f32,   {  1,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1377     { ISD::FADD, MVT::v4f32, {  2,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1378 
1379     { ISD::FSUB, MVT::f32,   {  1,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1380     { ISD::FSUB, MVT::v4f32, {  2,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1381 
1382     { ISD::FMUL, MVT::f32,   {  2,  5, 1, 1 } }, // Pentium III from http://www.agner.org/
1383     { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // Pentium III from http://www.agner.org/
1384   };
1385 
1386   if (ST->hasSSE1())
1387     if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1388       if (auto KindCost = Entry->Cost[CostKind])
1389         return LT.first * *KindCost;
1390 
1391   static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1392     { ISD::ADD,  MVT::i64,  {  1 } }, // Core (Merom) from http://www.agner.org/
1393     { ISD::SUB,  MVT::i64,  {  1 } }, // Core (Merom) from http://www.agner.org/
1394     { ISD::MUL,  MVT::i64,  {  2,  6,  1,  2 } },
1395   };
1396 
1397   if (ST->is64Bit())
1398     if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1399       if (auto KindCost = Entry->Cost[CostKind])
1400         return LT.first * *KindCost;
1401 
1402   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1403     { ISD::ADD,  MVT::i8,  {  1 } }, // Pentium III from http://www.agner.org/
1404     { ISD::ADD,  MVT::i16, {  1 } }, // Pentium III from http://www.agner.org/
1405     { ISD::ADD,  MVT::i32, {  1 } }, // Pentium III from http://www.agner.org/
1406 
1407     { ISD::SUB,  MVT::i8,  {  1 } }, // Pentium III from http://www.agner.org/
1408     { ISD::SUB,  MVT::i16, {  1 } }, // Pentium III from http://www.agner.org/
1409     { ISD::SUB,  MVT::i32, {  1 } }, // Pentium III from http://www.agner.org/
1410 
1411     { ISD::MUL,  MVT::i8,  {  3,  4, 1, 1 } },
1412     { ISD::MUL,  MVT::i16, {  2,  4, 1, 1 } },
1413     { ISD::MUL,  MVT::i32, {  1,  4, 1, 1 } },
1414 
1415     { ISD::FNEG, MVT::f64, {  2,  2, 1, 3 } }, // (x87)
1416     { ISD::FADD, MVT::f64, {  2,  3, 1, 1 } }, // (x87)
1417     { ISD::FSUB, MVT::f64, {  2,  3, 1, 1 } }, // (x87)
1418     { ISD::FMUL, MVT::f64, {  2,  5, 1, 1 } }, // (x87)
1419     { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1420   };
1421 
1422   if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1423     if (auto KindCost = Entry->Cost[CostKind])
1424       return LT.first * *KindCost;
1425 
  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular
  // registers. The overhead of division is going to dominate most kernels
  // anyway, so try hard to prevent vectorization of division - it is
  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
  // to hide "20 cycles" for each lane.
1432   if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1433       (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1434        ISD == ISD::UREM)) {
1435     InstructionCost ScalarCost =
1436         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1437                                Op1Info.getNoProps(), Op2Info.getNoProps());
1438     return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1439   }
1440 
1441   // Handle some basic single instruction code size cases.
1442   if (CostKind == TTI::TCK_CodeSize) {
1443     switch (ISD) {
1444     case ISD::FADD:
1445     case ISD::FSUB:
1446     case ISD::FMUL:
1447     case ISD::FDIV:
1448     case ISD::FNEG:
1449     case ISD::AND:
1450     case ISD::OR:
1451     case ISD::XOR:
1452       return LT.first;
1453       break;
1454     }
1455   }
1456 
  // Fall back to the default implementation.
1458   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1459                                        Args, CxtI);
1460 }
1461 
1462 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1463                                            VectorType *BaseTp,
1464                                            ArrayRef<int> Mask,
1465                                            TTI::TargetCostKind CostKind,
1466                                            int Index, VectorType *SubTp,
1467                                            ArrayRef<const Value *> Args) {
1468   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1469   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1470   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1471 
1472   Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1473 
1474   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1475   if (Kind == TTI::SK_Transpose)
1476     Kind = TTI::SK_PermuteTwoSrc;
1477 
1478   // For Broadcasts we are splatting the first element from the first input
1479   // register, so only need to reference that input and all the output
1480   // registers are the same.
1481   if (Kind == TTI::SK_Broadcast)
1482     LT.first = 1;
1483 
1484   // Treat <X x bfloat> shuffles as <X x half>.
1485   if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1486     LT.second = LT.second.changeVectorElementType(MVT::f16);
1487 
1488   // Subvector extractions are free if they start at the beginning of a
1489   // vector and cheap if the subvectors are aligned.
1490   if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1491     int NumElts = LT.second.getVectorNumElements();
1492     if ((Index % NumElts) == 0)
1493       return 0;
1494     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1495     if (SubLT.second.isVector()) {
1496       int NumSubElts = SubLT.second.getVectorNumElements();
1497       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1498         return SubLT.first;
1499       // Handle some cases for widening legalization. For now we only handle
1500       // cases where the original subvector was naturally aligned and evenly
1501       // fit in its legalized subvector type.
1502       // FIXME: Remove some of the alignment restrictions.
1503       // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1504       // vectors.
1505       int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1506       if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1507           (NumSubElts % OrigSubElts) == 0 &&
1508           LT.second.getVectorElementType() ==
1509               SubLT.second.getVectorElementType() &&
1510           LT.second.getVectorElementType().getSizeInBits() ==
1511               BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1512         assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1513                "Unexpected number of elements!");
1514         auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1515                                            LT.second.getVectorNumElements());
1516         auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1517                                            SubLT.second.getVectorNumElements());
1518         int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1519         InstructionCost ExtractCost =
1520             getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1521                            CostKind, ExtractIndex, SubTy);
1522 
1523         // If the original size is 32-bits or more, we can use pshufd. Otherwise
1524         // if we have SSSE3 we can use pshufb.
1525         if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1526           return ExtractCost + 1; // pshufd or pshufb
1527 
1528         assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1529                "Unexpected vector size");
1530 
1531         return ExtractCost + 2; // worst case pshufhw + pshufd
1532       }
1533     }
1534   }
1535 
1536   // Subvector insertions are cheap if the subvectors are aligned.
1537   // Note that in general, the insertion starting at the beginning of a vector
1538   // isn't free, because we need to preserve the rest of the wide vector.
1539   if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1540     int NumElts = LT.second.getVectorNumElements();
1541     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1542     if (SubLT.second.isVector()) {
1543       int NumSubElts = SubLT.second.getVectorNumElements();
1544       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1545         return SubLT.first;
1546     }
1547 
1548     // If the insertion isn't aligned, treat it like a 2-op shuffle.
1549     Kind = TTI::SK_PermuteTwoSrc;
1550   }
1551 
1552   // Handle some common (illegal) sub-vector types as they are often very cheap
1553   // to shuffle even on targets without PSHUFB.
1554   EVT VT = TLI->getValueType(DL, BaseTp);
1555   if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1556       !ST->hasSSSE3()) {
1557      static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1558       {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
1559       {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
1560       {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
1561       {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
1562       {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck
1563 
1564       {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
1565       {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
1566       {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
1567       {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck
1568 
1569       {TTI::SK_Splice,           MVT::v4i16, 2}, // punpck+psrldq
1570       {TTI::SK_Splice,           MVT::v2i16, 2}, // punpck+psrldq
1571       {TTI::SK_Splice,           MVT::v4i8,  2}, // punpck+psrldq
1572       {TTI::SK_Splice,           MVT::v2i8,  2}, // punpck+psrldq
1573 
1574       {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
1575       {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
1576       {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
1577       {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
1578       {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck
1579 
1580       {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1581       {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1582       {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
1583       {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
1584       {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
1585     };
1586 
1587     if (ST->hasSSE2())
1588       if (const auto *Entry =
1589               CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1590         return Entry->Cost;
1591   }
1592 
1593   // We are going to permute multiple sources and the result will be in multiple
1594   // destinations. Providing an accurate cost only for splits where the element
1595   // type remains the same.
1596   if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1597     MVT LegalVT = LT.second;
1598     if (LegalVT.isVector() &&
1599         LegalVT.getVectorElementType().getSizeInBits() ==
1600             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1601         LegalVT.getVectorNumElements() <
1602             cast<FixedVectorType>(BaseTp)->getNumElements()) {
1603       unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1604       unsigned LegalVTSize = LegalVT.getStoreSize();
1605       // Number of source vectors after legalization:
1606       unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1607       // Number of destination vectors after legalization:
1608       InstructionCost NumOfDests = LT.first;
1609 
1610       auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1611                                               LegalVT.getVectorNumElements());
1612 
1613       if (!Mask.empty() && NumOfDests.isValid()) {
1614         // Try to perform better estimation of the permutation.
1615         // 1. Split the source/destination vectors into real registers.
1616         // 2. Do the mask analysis to identify which real registers are
1617         // permuted. If more than 1 source registers are used for the
1618         // destination register building, the cost for this destination register
1619         // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1620         // source register is used, build mask and calculate the cost as a cost
1621         // of PermuteSingleSrc.
1622         // Also, for the single register permute we try to identify if the
1623         // destination register is just a copy of the source register or the
1624         // copy of the previous destination register (the cost is
1625         // TTI::TCC_Basic). If the source register is just reused, the cost for
1626         // this operation is 0.
1627         NumOfDests =
1628             getTypeLegalizationCost(
1629                 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1630                 .first;
1631         unsigned E = *NumOfDests.getValue();
1632         unsigned NormalizedVF =
1633             LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1634         unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1635         unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1636         SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1637         copy(Mask, NormalizedMask.begin());
1638         unsigned PrevSrcReg = 0;
1639         ArrayRef<int> PrevRegMask;
1640         InstructionCost Cost = 0;
1641         processShuffleMasks(
1642             NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1643             [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1644              &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1645               if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1646                 // Check if the previous register can be just copied to the next
1647                 // one.
1648                 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1649                     PrevRegMask != RegMask)
1650                   Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1651                                          RegMask, CostKind, 0, nullptr);
1652                 else
1653                   // Just a copy of previous destination register.
1654                   Cost += TTI::TCC_Basic;
1655                 return;
1656               }
1657               if (SrcReg != DestReg &&
1658                   any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1659                 // Just a copy of the source register.
1660                 Cost += TTI::TCC_Basic;
1661               }
1662               PrevSrcReg = SrcReg;
1663               PrevRegMask = RegMask;
1664             },
1665             [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1666                                                 unsigned /*Unused*/,
1667                                                 unsigned /*Unused*/) {
1668               Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1669                                      CostKind, 0, nullptr);
1670             });
1671         return Cost;
1672       }
1673 
1674       InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1675       return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1676                                             std::nullopt, CostKind, 0, nullptr);
1677     }
1678 
1679     return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1680   }
1681 
1682   // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1683   if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1684     // We assume that source and destination have the same vector type.
1685     InstructionCost NumOfDests = LT.first;
1686     InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1687     LT.first = NumOfDests * NumOfShufflesPerDest;
1688   }
1689 
1690   static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1691       {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1692       {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1693 
1694       {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1695       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1696 
1697       {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1698       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1699       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2}  // vpermt2b
1700   };
1701 
1702   if (ST->hasVBMI())
1703     if (const auto *Entry =
1704             CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1705       return LT.first * Entry->Cost;
1706 
1707   static const CostTblEntry AVX512BWShuffleTbl[] = {
1708       {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1709       {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1710       {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb
1711 
1712       {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1713       {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1714       {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1715       {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2
1716 
1717       {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1718       {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1719       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1720       {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1721       {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16
1722 
1723       {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1724       {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1725       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1726       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
1727       {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1728 
1729       {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1730       {TTI::SK_Select, MVT::v64i8,  1}, // vblendmb
1731 
1732       {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1733       {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1734       {TTI::SK_Splice, MVT::v64i8,  2}, // vshufi64x2 + palignr
1735   };
1736 
1737   if (ST->hasBWI())
1738     if (const auto *Entry =
1739             CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1740       return LT.first * Entry->Cost;
1741 
1742   static const CostKindTblEntry AVX512ShuffleTbl[] = {
1743       {TTI::SK_Broadcast, MVT::v8f64,  { 1, 1, 1, 1 } }, // vbroadcastsd
1744       {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1745       {TTI::SK_Broadcast, MVT::v8i64,  { 1, 1, 1, 1 } }, // vpbroadcastq
1746       {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1747       {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1748       {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1749       {TTI::SK_Broadcast, MVT::v64i8,  { 1, 1, 1, 1 } }, // vpbroadcastb
1750 
1751       {TTI::SK_Reverse, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermpd
1752       {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1753       {TTI::SK_Reverse, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermq
1754       {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1755       {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1756       {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1757       {TTI::SK_Reverse, MVT::v64i8,  { 7, 7, 7, 7 } }, // per mca
1758 
1759       {TTI::SK_Splice, MVT::v8f64,  { 1, 1, 1, 1 } }, // vpalignd
1760       {TTI::SK_Splice, MVT::v4f64,  { 1, 1, 1, 1 } }, // vpalignd
1761       {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1762       {TTI::SK_Splice, MVT::v8f32,  { 1, 1, 1, 1 } }, // vpalignd
1763       {TTI::SK_Splice, MVT::v8i64,  { 1, 1, 1, 1 } }, // vpalignd
1764       {TTI::SK_Splice, MVT::v4i64,  { 1, 1, 1, 1 } }, // vpalignd
1765       {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1766       {TTI::SK_Splice, MVT::v8i32,  { 1, 1, 1, 1 } }, // vpalignd
1767       {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1768       {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1769       {TTI::SK_Splice, MVT::v64i8,  { 4, 4, 4, 4 } }, // split + palignr
1770 
1771       {TTI::SK_PermuteSingleSrc, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermpd
1772       {TTI::SK_PermuteSingleSrc, MVT::v4f64,  { 1, 3, 1, 1 } }, // vpermpd
1773       {TTI::SK_PermuteSingleSrc, MVT::v2f64,  { 1, 3, 1, 1 } }, // vpermpd
1774       {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1775       {TTI::SK_PermuteSingleSrc, MVT::v8f32,  { 1, 3, 1, 1 } }, // vpermps
1776       {TTI::SK_PermuteSingleSrc, MVT::v4f32,  { 1, 3, 1, 1 } }, // vpermps
1777       {TTI::SK_PermuteSingleSrc, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermq
1778       {TTI::SK_PermuteSingleSrc, MVT::v4i64,  { 1, 3, 1, 1 } }, // vpermq
1779       {TTI::SK_PermuteSingleSrc, MVT::v2i64,  { 1, 3, 1, 1 } }, // vpermq
1780       {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1781       {TTI::SK_PermuteSingleSrc, MVT::v8i32,  { 1, 3, 1, 1 } }, // vpermd
1782       {TTI::SK_PermuteSingleSrc, MVT::v4i32,  { 1, 3, 1, 1 } }, // vpermd
1783       {TTI::SK_PermuteSingleSrc, MVT::v16i8,  { 1, 3, 1, 1 } }, // pshufb
1784 
1785       {TTI::SK_PermuteTwoSrc, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1786       {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1787       {TTI::SK_PermuteTwoSrc, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermt2q
1788       {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1789       {TTI::SK_PermuteTwoSrc, MVT::v4f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1790       {TTI::SK_PermuteTwoSrc, MVT::v8f32,  { 1, 3, 1, 1 } }, // vpermt2ps
1791       {TTI::SK_PermuteTwoSrc, MVT::v4i64,  { 1, 3, 1, 1 } }, // vpermt2q
1792       {TTI::SK_PermuteTwoSrc, MVT::v8i32,  { 1, 3, 1, 1 } }, // vpermt2d
1793       {TTI::SK_PermuteTwoSrc, MVT::v2f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1794       {TTI::SK_PermuteTwoSrc, MVT::v4f32,  { 1, 3, 1, 1 } }, // vpermt2ps
1795       {TTI::SK_PermuteTwoSrc, MVT::v2i64,  { 1, 3, 1, 1 } }, // vpermt2q
1796       {TTI::SK_PermuteTwoSrc, MVT::v4i32,  { 1, 3, 1, 1 } }, // vpermt2d
1797 
1798       // FIXME: This just applies the type legalization cost rules above
1799       // assuming these completely split.
1800       {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1801       {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1802       {TTI::SK_PermuteSingleSrc, MVT::v64i8,  { 14, 14, 14, 14 } },
1803       {TTI::SK_PermuteTwoSrc,    MVT::v32i16, { 42, 42, 42, 42 } },
1804       {TTI::SK_PermuteTwoSrc,    MVT::v32f16, { 42, 42, 42, 42 } },
1805       {TTI::SK_PermuteTwoSrc,    MVT::v64i8,  { 42, 42, 42, 42 } },
1806 
1807       {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1808       {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1809       {TTI::SK_Select, MVT::v64i8,  { 1, 1, 1, 1 } }, // vpternlogq
1810       {TTI::SK_Select, MVT::v8f64,  { 1, 1, 1, 1 } }, // vblendmpd
1811       {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1812       {TTI::SK_Select, MVT::v8i64,  { 1, 1, 1, 1 } }, // vblendmq
1813       {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1814   };
1815 
1816   if (ST->hasAVX512())
1817     if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1818       if (auto KindCost = Entry->Cost[CostKind])
1819         return LT.first * *KindCost;
1820 
1821   static const CostTblEntry AVX2ShuffleTbl[] = { // AVX2 costs keyed by shuffle kind and MVT
1822       {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastsd
1823       {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastss
1824       {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
1825       {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
1826       {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1827       {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1828       {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb
1829 
1830       {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
1831       {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
1832       {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
1833       {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
1834       {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1835       {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1836       {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb
1837 
1838       {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1839       {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1840       {TTI::SK_Select, MVT::v32i8,  1}, // vpblendvb
1841 
1842       {TTI::SK_Splice, MVT::v8i32,  2}, // vperm2i128 + vpalignr
1843       {TTI::SK_Splice, MVT::v8f32,  2}, // vperm2i128 + vpalignr
1844       {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1845       {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1846       {TTI::SK_Splice, MVT::v32i8,  2}, // vperm2i128 + vpalignr
1847 
1848       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
1849       {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
1850       {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
1851       {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
1852       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1853                                                   // + vpblendvb
1854       {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1855                                                   // + vpblendvb
1856       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
1857                                                   // + vpblendvb
1858 
1859       {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
1860       {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
1861       {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
1862       {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
1863       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1864                                                // + vpblendvb
1865       {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1866                                                // + vpblendvb
1867       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
1868                                                // + vpblendvb
1869   };
1870 
1871   if (ST->hasAVX2())
1872     if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1873       return LT.first * Entry->Cost;
1874 
1875   static const CostTblEntry XOPShuffleTbl[] = {
1876       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vpermil2pd
1877       {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2},  // vperm2f128 + vpermil2ps
1878       {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vpermil2pd
1879       {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2},  // vperm2f128 + vpermil2ps
1880       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1881                                                   // + vinsertf128
1882       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vextractf128 + 2*vpperm
1883                                                   // + vinsertf128
1884 
1885       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1886                                                // + vinsertf128
1887       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpperm
1888       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9},  // 2*vextractf128 + 6*vpperm
1889                                                // + vinsertf128
1890       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1},  // vpperm
1891   };
1892 
1893   if (ST->hasXOP())
1894     if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1895       return LT.first * Entry->Cost;
1896 
1897   static const CostTblEntry AVX1ShuffleTbl[] = {
1898       {TTI::SK_Broadcast, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
1899       {TTI::SK_Broadcast, MVT::v8f32, 2},  // vperm2f128 + vpermilps
1900       {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
1901       {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
1902       {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1903       {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1904       {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128
1905 
1906       {TTI::SK_Reverse, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
1907       {TTI::SK_Reverse, MVT::v8f32, 2},  // vperm2f128 + vpermilps
1908       {TTI::SK_Reverse, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
1909       {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
1910       {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1911                                          // + vinsertf128
1912       {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1913                                          // + vinsertf128
1914       {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
1915                                          // + vinsertf128
1916 
1917       {TTI::SK_Select, MVT::v4i64, 1},  // vblendpd
1918       {TTI::SK_Select, MVT::v4f64, 1},  // vblendpd
1919       {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
1920       {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
1921       {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1922       {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1923       {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor
1924 
1925       {TTI::SK_Splice, MVT::v4i64,  2}, // vperm2f128 + shufpd
1926       {TTI::SK_Splice, MVT::v4f64,  2}, // vperm2f128 + shufpd
1927       {TTI::SK_Splice, MVT::v8i32,  4}, // 2*vperm2f128 + 2*vshufps
1928       {TTI::SK_Splice, MVT::v8f32,  4}, // 2*vperm2f128 + 2*vshufps
1929       {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1930       {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1931       {TTI::SK_Splice, MVT::v32i8,  5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1932 
1933       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vshufpd
1934       {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vshufpd
1935       {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4},  // 2*vperm2f128 + 2*vshufps
1936       {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
1937       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1938                                                   // + 2*por + vinsertf128
1939       {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1940                                                   // + 2*por + vinsertf128
1941       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
1942                                                   // + 2*por + vinsertf128
1943 
1944       {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},   // 2*vperm2f128 + vshufpd
1945       {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},   // 2*vperm2f128 + vshufpd
1946       {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4},   // 2*vperm2f128 + 2*vshufps
1947       {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
1948       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1949                                                 // + 4*por + vinsertf128
1950       {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1951                                                 // + 4*por + vinsertf128
1952       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
1953                                                 // + 4*por + vinsertf128
1954   };
1955 
1956   if (ST->hasAVX())
1957     if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1958       return LT.first * Entry->Cost;
1959 
1960   static const CostTblEntry SSE41ShuffleTbl[] = {
1961       {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1962       {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1963       {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1964       {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1965       {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1966       {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1967       {TTI::SK_Select, MVT::v16i8, 1}  // pblendvb
1968   };
1969 
1970   if (ST->hasSSE41())
1971     if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1972       return LT.first * Entry->Cost;
1973 
1974   static const CostTblEntry SSSE3ShuffleTbl[] = {
1975       {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1976       {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1977       {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1978 
1979       {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1980       {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
1981       {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1982 
1983       {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1984       {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
1985       {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1986 
1987       {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
1988       {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
1989       {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
1990       {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
1991       {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
1992 
1993       {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1994       {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
1995       {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1996 
1997       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1998       {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
1999       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2000   };
2001 
2002   if (ST->hasSSSE3())
2003     if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2004       return LT.first * Entry->Cost;
2005 
2006   static const CostTblEntry SSE2ShuffleTbl[] = { // SSE2 costs keyed by shuffle kind and MVT
2007       {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2008       {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2009       {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2010       {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2011       {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2012       {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2013 
2014       {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2015       {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2016       {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2017       {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2018       {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2019       {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2020                                         // + 2*pshufd + 2*unpck + packus
2021 
2022       {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2023       {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2024       {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2025       {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2026       {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2027       {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2028 
2029       {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2030       {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2031       {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2032       {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2033       {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2034       {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2035 
2036       {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2037       {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2038       {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2039       {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2040                                                   // + pshufd/unpck
2041       {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2042                                                   // + pshufd/unpck
2043     { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2044                                                   // + 2*pshufd + 2*unpck + 2*packus
2045 
2046     { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // shufpd
2047     { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // shufpd
2048     { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  2 }, // 2*{unpck,movsd,pshufd}
2049     { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  8 }, // blend+permute
2050     { TTI::SK_PermuteTwoSrc,    MVT::v8f16,  8 }, // blend+permute
2051     { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 13 }, // blend+permute
2052   };
2053 
2054   static const CostTblEntry SSE3BroadcastLoadTbl[] = { // zero-cost broadcasts, used only when an operand is a load
2055       {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2056   };
2057 
2058   if (ST->hasSSE2()) {
2059     bool IsLoad =
2060         llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); }); // true if any entry of Args is a LoadInst
2061     if (ST->hasSSE3() && IsLoad) // checked first, so a match here returns before the SSE2 table is consulted
2062       if (const auto *Entry =
2063               CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2064         assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2065                                     LT.second.getVectorElementCount()) &&
2066                "Table entry missing from isLegalBroadcastLoad()");
2067         return LT.first * Entry->Cost; // Entry->Cost is 0 for every row of this table, so the result is 0
2068       }
2069 
2070     if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2071       return LT.first * Entry->Cost; // table cost scaled by LT.first (presumably the legalization split count; LT is set outside this view)
2072   }
2073 
2074   static const CostTblEntry SSE1ShuffleTbl[] = {
2075     { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
2076     { TTI::SK_Reverse,          MVT::v4f32, 1 }, // shufps
2077     { TTI::SK_Select,           MVT::v4f32, 2 }, // 2*shufps
2078     { TTI::SK_Splice,           MVT::v4f32, 2 }, // 2*shufps
2079     { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2080     { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
2081   };
2082 
2083   if (ST->hasSSE1())
2084     if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2085       return LT.first * Entry->Cost;
2086 
2087   return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2088 }
2089 
2090 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2091                                              Type *Src,
2092                                              TTI::CastContextHint CCH,
2093                                              TTI::TargetCostKind CostKind,
2094                                              const Instruction *I) {
2095   int ISD = TLI->InstructionOpcodeToISD(Opcode);
2096   assert(ISD && "Invalid opcode");
2097 
2098   // TODO: Allow non-throughput costs that aren't binary.
2099   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2100     if (CostKind != TTI::TCK_RecipThroughput)
2101       return Cost == 0 ? 0 : 1;
2102     return Cost;
2103   };
2104 
2105   // The cost tables include both specific, custom (non-legal) src/dst type
2106   // conversions and generic, legalized types. We test for customs first, before
2107   // falling back to legalization.
2108   // FIXME: Need a better design of the cost table to handle non-simple types of
2109   // potential massive combinations (elem_num x src_type x dst_type).
2110   static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2111     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2112     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2113 
2114     // Mask sign extend has an instruction.
2115     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   1 },
2116     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   1 },
2117     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   1 },
2118     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   1 },
2119     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   1 },
2120     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   1 },
2121     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   1 },
2122     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   1 },
2123     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   1 },
2124     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   1 },
2125     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   1 },
2126     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  1 },
2127     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2128     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  1 },
2129     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1,  1 },
2130     { ISD::SIGN_EXTEND, MVT::v64i8,  MVT::v64i1,  1 },
2131     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1,  1 },
2132 
2133     // Mask zero extend is a sext + shift.
2134     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   2 },
2135     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   2 },
2136     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   2 },
2137     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   2 },
2138     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   2 },
2139     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   2 },
2140     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   2 },
2141     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   2 },
2142     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   2 },
2143     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   2 },
2144     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   2 },
2145     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  2 },
2146     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  2 },
2147     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  2 },
2148     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1,  2 },
2149     { ISD::ZERO_EXTEND, MVT::v64i8,  MVT::v64i1,  2 },
2150     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1,  2 },
2151 
2152     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 },
2153     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  2 },
2154     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 },
2155     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  2 },
2156     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 },
2157     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  2 },
2158     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 },
2159     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  2 },
2160     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 },
2161     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  2 },
2162     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 },
2163     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 },
2164     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 },
2165     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 },
2166     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i16, 2 },
2167     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v64i8,  2 },
2168     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i16, 2 },
2169 
2170     { ISD::TRUNCATE,    MVT::v32i8,  MVT::v32i16, 2 },
2171     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // widen to zmm
2172     { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i16,  2 }, // vpmovwb
2173     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 }, // vpmovwb
2174     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 }, // vpmovwb
2175   };
2176 
2177   static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2178     // Mask sign extend has an instruction.
2179     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 },
2180     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   1 },
2181     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 },
2182     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 },
2183     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 },
2184     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i1,  1 },
2185     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   1 },
2186     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  1 },
2187 
2188     // Mask zero extend is a sext + shift.
2189     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 },
2190     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   2 },
2191     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 },
2192     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 },
2193     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 },
2194     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i1,  2 },
2195     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   2 },
2196     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
2197 
2198     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  2 },
2199     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  2 },
2200     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  2 },
2201     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  2 },
2202     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
2203     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  2 },
2204     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i32, 2 },
2205     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i64,  2 },
2206 
2207     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
2208     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
2209 
2210     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
2211     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
2212 
2213     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
2214     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
2215 
2216     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
2217     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
2218   };
2219 
2220   // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2221   // 256-bit wide vectors.
2222 
2223   static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2224     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
2225     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
2226     { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
2227 
2228     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
2229     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
2230     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
2231     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  3 }, // sext+vpslld+vptestmd
2232     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
2233     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
2234     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
2235     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2236     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // zmm vpslld+vptestmd
2237     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // zmm vpslld+vptestmd
2238     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // zmm vpslld+vptestmd
2239     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i32, 2 }, // vpslld+vptestmd
2240     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // zmm vpsllq+vptestmq
2241     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // zmm vpsllq+vptestmq
2242     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i64,  2 }, // vpsllq+vptestmq
2243     { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i32,  2 }, // vpmovdb
2244     { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i32,  2 }, // vpmovdb
2245     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 2 }, // vpmovdb
2246     { ISD::TRUNCATE,  MVT::v32i8,   MVT::v16i32, 2 }, // vpmovdb
2247     { ISD::TRUNCATE,  MVT::v64i8,   MVT::v16i32, 2 }, // vpmovdb
2248     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 2 }, // vpmovdw
2249     { ISD::TRUNCATE,  MVT::v32i16,  MVT::v16i32, 2 }, // vpmovdw
2250     { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i64,  2 }, // vpmovqb
2251     { ISD::TRUNCATE,  MVT::v2i16,   MVT::v2i64,  1 }, // vpshufb
2252     { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i64,  2 }, // vpmovqb
2253     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v8i64,  2 }, // vpmovqb
2254     { ISD::TRUNCATE,  MVT::v32i8,   MVT::v8i64,  2 }, // vpmovqb
2255     { ISD::TRUNCATE,  MVT::v64i8,   MVT::v8i64,  2 }, // vpmovqb
2256     { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  2 }, // vpmovqw
2257     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v8i64,  2 }, // vpmovqw
2258     { ISD::TRUNCATE,  MVT::v32i16,  MVT::v8i64,  2 }, // vpmovqw
2259     { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 }, // vpmovqd
2260     { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // zmm vpmovqd
2261     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2262 
2263     { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i16,  3 }, // extend to v16i32
2264     { ISD::TRUNCATE,  MVT::v32i8,  MVT::v32i16,  8 },
2265     { ISD::TRUNCATE,  MVT::v64i8,  MVT::v32i16,  8 },
2266 
2267     // Sign extend is zmm vpternlogd+vptruncdb.
2268     // Zero extend is zmm broadcast load+vptruncdw.
2269     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   3 },
2270     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   4 },
2271     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   3 },
2272     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   4 },
2273     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   3 },
2274     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   4 },
2275     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  3 },
2276     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  4 },
2277 
2278     // Sign extend is zmm vpternlogd+vptruncdw.
2279     // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2280     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   3 },
2281     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
2282     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   3 },
2283     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
2284     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   3 },
2285     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
2286     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  3 },
2287     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
2288 
2289     { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // zmm vpternlogd
2290     { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // zmm vpternlogd+psrld
2291     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // zmm vpternlogd
2292     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // zmm vpternlogd+psrld
2293     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // zmm vpternlogd
2294     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // zmm vpternlogd+psrld
2295     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // zmm vpternlogq
2296     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // zmm vpternlogq+psrlq
2297     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // zmm vpternlogq
2298     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // zmm vpternlogq+psrlq
2299 
2300     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  1 }, // vpternlogd
2301     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 }, // vpternlogd+psrld
2302     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   1 }, // vpternlogq
2303     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   2 }, // vpternlogq+psrlq
2304 
2305     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
2306     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
2307     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2308     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2309     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
2310     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
2311     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
2312     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
2313     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
2314     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
2315 
2316     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8,  3 }, // FIXME: May not be right
2317     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8,  3 }, // FIXME: May not be right
2318 
2319     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
2320     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
2321     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  2 },
2322     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  1 },
2323     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
2324     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 1 },
2325     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
2326     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
2327 
2328     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
2329     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
2330     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  2 },
2331     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  1 },
2332     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
2333     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 1 },
2334     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
2335     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
2336     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
2337     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  5 },
2338 
2339     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 2 },
2340     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f64, 7 },
2341     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f64,15 },
2342     { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f32,11 },
2343     { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f64,31 },
2344     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f64,  3 },
2345     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f64, 7 },
2346     { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f32, 5 },
2347     { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f64,15 },
2348     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  1 },
2349     { ISD::FP_TO_SINT,  MVT::v16i32, MVT::v16f64, 3 },
2350 
2351     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
2352     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  3 },
2353     { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  3 },
2354     { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
2355     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 3 },
2356     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v16f32, 3 },
2357   };
2358 
2359   static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2360     // Mask sign extend has an instruction.
2361     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   1 },
2362     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   1 },
2363     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   1 },
2364     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   1 },
2365     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   1 },
2366     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   1 },
2367     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   1 },
2368     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   1 },
2369     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   1 },
2370     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   1 },
2371     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   1 },
2372     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  1 },
2373     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2374     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  1 },
2375     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1,  1 },
2376     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v64i1,  1 },
2377     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1,  1 },
2378 
2379     // Mask zero extend is a sext + shift.
2380     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   2 },
2381     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   2 },
2382     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   2 },
2383     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   2 },
2384     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   2 },
2385     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   2 },
2386     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   2 },
2387     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   2 },
2388     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   2 },
2389     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   2 },
2390     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   2 },
2391     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  2 },
2392     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  2 },
2393     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  2 },
2394     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1,  2 },
2395     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v64i1,  2 },
2396     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1,  2 },
2397 
2398     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 },
2399     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  2 },
2400     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 },
2401     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  2 },
2402     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 },
2403     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  2 },
2404     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 },
2405     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  2 },
2406     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 },
2407     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  2 },
2408     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 },
2409     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 },
2410     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 },
2411     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 },
2412     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v16i16, 2 },
2413     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i8,  2 },
2414     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v16i16, 2 },
2415 
2416     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 },
2417   };
2418 
2419   static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
         // In the simple-type lookup below this table is searched when
         // ST->hasDQI() holds, after the BWI table and before the AVX512VL,
         // AVX2, AVX and SSE tables; the first table with a matching entry
         // supplies the cost.
         //
2420     // Mask sign extend has an instruction.
2421     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 },
2422     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   1 },
2423     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 },
2424     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i1,  1 },
2425     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 },
2426     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i1,   1 },
2427     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  1 },
2428     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 },
2429 
2430     // Mask zero extend is a sext + shift.
2431     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 },
2432     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   2 },
2433     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 },
2434     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i1,  2 },
2435     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 },
2436     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i1,   2 },
2437     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  2 },
2438     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 },
2439 
2440     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v4i64,  2 },
2441     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i32,  2 },
2442     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  2 },
2443     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  2 },
2444     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  2 },
2445     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  2 },
2446     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v4i64,  2 },
2447     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
2448 
2449     { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
2450     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
2451     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
2452     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
2453 
2454     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
2455     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
2456     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
2457     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
2458 
2459     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v4f32,  1 },
2460     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
2461     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
2462     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
2463 
2464     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v4f32,  1 },
2465     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
2466     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
2467     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
2468   };
2469 
2470   static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
         // In the simple-type lookup below this table is searched when
         // ST->hasAVX512() holds, after the BWI and DQI tables, so a matching
         // entry in either of those takes precedence over the cost listed here.
2471     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
2472     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
2473     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
2474     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  8 }, // split+2*v8i8
2475     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
2476     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
2477     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
2478     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 8 }, // split+2*v8i16
2479     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // vpslld+vptestmd
2480     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // vpslld+vptestmd
2481     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // vpslld+vptestmd
2482     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v8i32,  2 }, // vpslld+vptestmd
2483     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // vpsllq+vptestmq
2484     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // vpsllq+vptestmq
2485     { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // vpmovqd
2486     { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i64,  2 }, // vpmovqb
2487     { ISD::TRUNCATE,  MVT::v4i16,   MVT::v4i64,  2 }, // vpmovqw
2488     { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i32,  2 }, // vpmovdb
2489 
2490     // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2491     // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2492     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   5 },
2493     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   6 },
2494     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   5 },
2495     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   6 },
2496     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   5 },
2497     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   6 },
2498     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, 10 },
2499     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, 12 },
2500 
2501     // sign extend is vpcmpeq+maskedmove+vpmovdw
2502     // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2503     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
2504     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   5 },
2505     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
2506     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   5 },
2507     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
2508     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   5 },
2509     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2510     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2511 
2512     { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // vpternlogd
2513     { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // vpternlogd+psrld
2514     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // vpternlogd
2515     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // vpternlogd+psrld
2516     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // vpternlogd
2517     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // vpternlogd+psrld
2518     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  1 }, // vpternlogd
2519     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  2 }, // vpternlogd+psrld
2520 
2521     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // vpternlogq
2522     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // vpternlogq+psrlq
2523     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // vpternlogq
2524     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // vpternlogq+psrlq
2525 
2526     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  1 },
2527     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  1 },
2528     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  1 },
2529     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  1 },
2530     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
2531     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
2532     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  1 },
2533     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  1 },
2534     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
2535     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
2536     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
2537     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
2538 
2539     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2540     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  1 },
2541     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2542     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  1 },
2543 
2544     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
2545     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
2546     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2547     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  1 },
2548     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2549     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  1 },
2550     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  1 },
2551     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
2552     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
2553     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
2554     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
2555     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
2556     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  5 },
2557 
2558     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  2 },
2559     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 2 },
2560     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f32, 5 },
2561 
2562     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    1 },
2563     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    1 },
2564     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
2565     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  1 },
2566     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  1 },
2567     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
2568     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
2569   };
2570 
2571   static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
         // In the simple-type lookup below this table is searched when
         // ST->hasAVX2() holds, after every AVX-512 table and before the AVX,
         // SSE4.1 and SSE2 tables; the first table with a match supplies the cost.
2572     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
2573     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
2574     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
2575     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
2576     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2577     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2578 
2579     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  2 },
2580     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  2 },
2581     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  2 },
2582     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  2 },
2583     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
2584     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
2585     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  2 },
2586     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  2 },
2587     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
2588     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
2589     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2590     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2591     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  2 },
2592     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  2 },
2593 
2594     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
2595 
2596     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 4 },
2597     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 4 },
2598     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  1 },
2599     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  1 },
2600     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  1 },
2601     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  4 },
2602     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  4 },
2603     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  1 },
2604     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  1 },
2605     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  5 },
2606     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  1 },
2607     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
2608 
2609     { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
2610     { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
2611 
2612     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  1 },
2613     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  1 },
2614     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  1 },
2615     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  3 },
2616 
2617     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    3 },
2618     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    3 },
2619     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  1 },
2620     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
2621     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
2622     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  4 },
2623     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  3 },
2624     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  4 },
2625 
2626     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  2 },
2627     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  2 },
2628     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  2 },
2629     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
2630     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
2631     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
2632     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  3 },
2633 
2634     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  2 },
2635     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  2 },
2636     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  2 },
2637     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
2638     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
2639     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
2640     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  2 },
2641     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
2642     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  2 },
2643     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  4 },
2644   };
2645 
2646   static const TypeConversionCostTblEntry AVXConversionTbl[] = {
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
         // In the simple-type lookup below this table is searched when
         // ST->hasAVX() holds, after the AVX-512 and AVX2 tables and before the
         // SSE4.1 and SSE2 tables; the first table with a match supplies the cost.
2647     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   6 },
2648     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   4 },
2649     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   7 },
2650     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   4 },
2651     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
2652     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
2653 
2654     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  3 },
2655     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  3 },
2656     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  3 },
2657     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  3 },
2658     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
2659     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
2660     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  3 },
2661     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  3 },
2662     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
2663     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
2664     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
2665     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
2666 
2667     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  4 },
2668     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  5 },
2669     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 4 },
2670     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  9 },
2671     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i64, 11 },
2672 
2673     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
2674     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 6 },
2675     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // and+extract+packuswb
2676     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  5 },
2677     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
2678     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  5 },
2679     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  3 }, // and+extract+2*packusdw
2680     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
2681 
2682     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   3 },
2683     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   3 },
2684     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   8 },
2685     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  4 },
2686     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  2 },
2687     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  4 },
2688     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  2 },
2689     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
2690     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  2 },
2691     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  4 },
2692     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  5 },
2693     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  8 },
2694 
2695     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   7 },
2696     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   7 },
2697     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   6 },
2698     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  4 },
2699     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  2 },
2700     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  4 },
2701     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  2 },
2702     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  4 },
2703     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  4 },
2704     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  5 },
2705     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  6 },
2706     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
2707     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32, 10 },
2708     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64, 10 },
2709     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64, 18 },
2710     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
2711     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 10 },
2712 
2713     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  2 },
2714     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f64,  2 },
2715     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v8f32,  2 },
2716     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v4f64,  2 },
2717     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f32,  2 },
2718     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f64,  2 },
2719     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  2 },
2720     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v4f64,  2 },
2721     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  2 },
2722     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  2 },
2723     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  5 },
2724 
2725     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v8f32,  2 },
2726     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f64,  2 },
2727     { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v8f32,  2 },
2728     { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v4f64,  2 },
2729     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f32,  2 },
2730     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f64,  2 },
2731     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  2 },
2732     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v4f64,  2 },
2733     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
2734     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
2735     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  6 },
2736     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  7 },
2737     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  7 },
2738 
2739     { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
2740     { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
2741   };
2742 
2743   static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
         // In the simple-type lookup below this table is searched when
         // ST->hasSSE41() holds, after the AVX-512, AVX2 and AVX tables and
         // before the SSE2 table; the first table with a match supplies the cost.
2744     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8,   1 },
2745     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8,   1 },
2746     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8,   1 },
2747     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8,   1 },
2748     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8,   1 },
2749     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8,   1 },
2750     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16,   1 },
2751     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16,   1 },
2752     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16,   1 },
2753     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16,   1 },
2754     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32,   1 },
2755     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32,   1 },
2756 
2757     // These truncates end up widening elements.
2758     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   1 }, // PMOVZXBQ
2759     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  1 }, // PMOVZXWQ
2760     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   1 }, // PMOVZXBD
2761 
2762     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  2 },
2763     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  2 },
2764     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  2 },
2765 
2766     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    1 },
2767     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    1 },
2768     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
2769     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
2770     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  1 },
2771     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2772     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  1 },
2773     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2774     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
2775     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  1 },
2776     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
2777 
2778     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    1 },
2779     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    1 },
2780     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    4 },
2781     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
2782     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  1 },
2783     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2784     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  1 },
2785     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2786     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  3 },
2787     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  3 },
2788     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  2 },
2789     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 12 },
2790     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64, 22 },
2791     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  4 },
2792 
2793     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    1 },
2794     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    1 },
2795     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    1 },
2796     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    1 },
2797     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  2 },
2798     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  2 },
2799     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  1 },
2800     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  1 },
2801     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  1 },
2802     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  1 },
2803 
2804     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    1 },
2805     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
2806     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    1 },
2807     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    4 },
2808     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  2 },
2809     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  2 },
2810     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  1 },
2811     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  1 },
2812     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  4 },
2813     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
2814   };
2815 
2816   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
         // In the simple-type lookup below this table is searched when
         // ST->hasSSE2() holds; it is the last table tried in that cascade, so
         // a match in any AVX-512, AVX2, AVX or SSE4.1 table takes precedence.
         //
2817     // These are somewhat magic numbers justified by comparing the
2818     // output of llvm-mca for our various supported scheduler models
2819     // and basing it off the worst case scenario.
2820     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    3 },
2821     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    3 },
2822     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    3 },
2823     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    3 },
2824     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  3 },
2825     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
2826     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  3 },
2827     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
2828     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  3 },
2829     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  4 },
2830     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  8 },
2831     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  8 },
2832 
2833     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    3 },
2834     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    3 },
2835     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    8 },
2836     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    9 },
2837     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
2838     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  4 },
2839     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  4 },
2840     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
2841     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  7 },
2842     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  7 },
2843     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  5 },
2844     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64, 15 },
2845     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 18 },
2846 
2847     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    4 },
2848     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    4 },
2849     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    4 },
2850     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    4 },
2851     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  6 },
2852     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  6 },
2853     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  5 },
2854     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  5 },
2855     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  4 },
2856     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  4 },
2857 
2858     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    4 },
2859     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
2860     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    4 },
2861     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,   15 },
2862     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  6 },
2863     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  6 },
2864     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  5 },
2865     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  5 },
2866     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  8 },
2867     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  8 },
2868 
2869     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v16i8,  4 },
2870     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v16i8,  4 },
2871     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v16i8,  2 },
2872     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v16i8,  3 },
2873     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v16i8,  1 },
2874     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v16i8,  2 },
2875     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v8i16,  2 },
2876     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v8i16,  3 },
2877     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v8i16,  1 },
2878     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v8i16,  2 },
2879     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v4i32,  1 },
2880     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v4i32,  2 },
2881 
2882     // These truncates are really widening elements.
2883     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i32,  1 }, // PSHUFD
2884     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 }, // PUNPCKLWD+DQ
2885     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   3 }, // PUNPCKLBW+WD+PSHUFD
2886     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  1 }, // PUNPCKLWD
2887     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 }, // PUNPCKLBW+WD
2888     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   1 }, // PUNPCKLBW
2889 
2890     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  2 }, // PAND+PACKUSWB
2891     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
2892     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  3 }, // PAND+2*PACKUSWB
2893     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
2894     { ISD::TRUNCATE,    MVT::v2i16,  MVT::v2i32,  1 },
2895     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  3 },
2896     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
2897     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32,10 },
2898     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  4 }, // PAND+3*PACKUSWB
2899     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  2 }, // PSHUFD+PSHUFLW
2900     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v2i64,  1 }, // PSHUFD
2901   };
2902 
2903   // Attempt to map directly to (simple) MVT types to let us match custom entries.
2904   EVT SrcTy = TLI->getValueType(DL, Src);
2905   EVT DstTy = TLI->getValueType(DL, Dst);
2906 
2907   // The function getSimpleVT only handles simple value types.
2908   if (SrcTy.isSimple() && DstTy.isSimple()) {
2909     MVT SimpleSrcTy = SrcTy.getSimpleVT();
2910     MVT SimpleDstTy = DstTy.getSimpleVT();
2911 
2912     if (ST->useAVX512Regs()) {
2913       if (ST->hasBWI())
2914         if (const auto *Entry = ConvertCostTableLookup(
2915                 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2916           return AdjustCost(Entry->Cost);
2917 
2918       if (ST->hasDQI())
2919         if (const auto *Entry = ConvertCostTableLookup(
2920                 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2921           return AdjustCost(Entry->Cost);
2922 
2923       if (ST->hasAVX512())
2924         if (const auto *Entry = ConvertCostTableLookup(
2925                 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2926           return AdjustCost(Entry->Cost);
2927     }
2928 
2929     if (ST->hasBWI())
2930       if (const auto *Entry = ConvertCostTableLookup(
2931               AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2932         return AdjustCost(Entry->Cost);
2933 
2934     if (ST->hasDQI())
2935       if (const auto *Entry = ConvertCostTableLookup(
2936               AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2937         return AdjustCost(Entry->Cost);
2938 
2939     if (ST->hasAVX512())
2940       if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2941                                                      SimpleDstTy, SimpleSrcTy))
2942         return AdjustCost(Entry->Cost);
2943 
2944     if (ST->hasAVX2()) {
2945       if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2946                                                      SimpleDstTy, SimpleSrcTy))
2947         return AdjustCost(Entry->Cost);
2948     }
2949 
2950     if (ST->hasAVX()) {
2951       if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2952                                                      SimpleDstTy, SimpleSrcTy))
2953         return AdjustCost(Entry->Cost);
2954     }
2955 
2956     if (ST->hasSSE41()) {
2957       if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2958                                                      SimpleDstTy, SimpleSrcTy))
2959         return AdjustCost(Entry->Cost);
2960     }
2961 
2962     if (ST->hasSSE2()) {
2963       if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2964                                                      SimpleDstTy, SimpleSrcTy))
2965         return AdjustCost(Entry->Cost);
2966     }
2967   }
2968 
2969   // Fall back to legalized types.
2970   std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2971   std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2972 
  // If we're truncating to the same legalized type - just assume it's free.
2974   if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2975     return TTI::TCC_Free;
2976 
2977   if (ST->useAVX512Regs()) {
2978     if (ST->hasBWI())
2979       if (const auto *Entry = ConvertCostTableLookup(
2980               AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2981         return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2982 
2983     if (ST->hasDQI())
2984       if (const auto *Entry = ConvertCostTableLookup(
2985               AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2986         return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2987 
2988     if (ST->hasAVX512())
2989       if (const auto *Entry = ConvertCostTableLookup(
2990               AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2991         return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2992   }
2993 
2994   if (ST->hasBWI())
2995     if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2996                                                    LTDest.second, LTSrc.second))
2997       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2998 
2999   if (ST->hasDQI())
3000     if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3001                                                    LTDest.second, LTSrc.second))
3002       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3003 
3004   if (ST->hasAVX512())
3005     if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3006                                                    LTDest.second, LTSrc.second))
3007       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3008 
3009   if (ST->hasAVX2())
3010     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3011                                                    LTDest.second, LTSrc.second))
3012       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3013 
3014   if (ST->hasAVX())
3015     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3016                                                    LTDest.second, LTSrc.second))
3017       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3018 
3019   if (ST->hasSSE41())
3020     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3021                                                    LTDest.second, LTSrc.second))
3022       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3023 
3024   if (ST->hasSSE2())
3025     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3026                                                    LTDest.second, LTSrc.second))
3027       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3028 
3029   // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3030   // sitofp.
3031   if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3032       1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3033     Type *ExtSrc = Src->getWithNewBitWidth(32);
3034     unsigned ExtOpc =
3035         (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3036 
3037     // For scalar loads the extend would be free.
3038     InstructionCost ExtCost = 0;
3039     if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3040       ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3041 
3042     return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3043                                       TTI::CastContextHint::None, CostKind);
3044   }
3045 
3046   // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3047   // i32.
3048   if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3049       1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3050     Type *TruncDst = Dst->getWithNewBitWidth(32);
3051     return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3052            getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3053                             TTI::CastContextHint::None, CostKind);
3054   }
3055 
3056   return AdjustCost(
3057       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3058 }
3059 
3060 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3061                                                Type *CondTy,
3062                                                CmpInst::Predicate VecPred,
3063                                                TTI::TargetCostKind CostKind,
3064                                                const Instruction *I) {
3065   // Early out if this type isn't scalar/vector integer/float.
3066   if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3067     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3068                                      I);
3069 
3070   // Legalize the type.
3071   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3072 
3073   MVT MTy = LT.second;
3074 
3075   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3076   assert(ISD && "Invalid opcode");
3077 
3078   InstructionCost ExtraCost = 0;
3079   if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3080     // Some vector comparison predicates cost extra instructions.
3081     // TODO: Should we invert this and assume worst case cmp costs
3082     // and reduce for particular predicates?
3083     if (MTy.isVector() &&
3084         !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3085           (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3086           ST->hasBWI())) {
3087       // Fallback to I if a specific predicate wasn't specified.
3088       CmpInst::Predicate Pred = VecPred;
3089       if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3090                 Pred == CmpInst::BAD_FCMP_PREDICATE))
3091         Pred = cast<CmpInst>(I)->getPredicate();
3092 
3093       switch (Pred) {
3094       case CmpInst::Predicate::ICMP_NE:
3095         // xor(cmpeq(x,y),-1)
3096         ExtraCost = 1;
3097         break;
3098       case CmpInst::Predicate::ICMP_SGE:
3099       case CmpInst::Predicate::ICMP_SLE:
3100         // xor(cmpgt(x,y),-1)
3101         ExtraCost = 1;
3102         break;
3103       case CmpInst::Predicate::ICMP_ULT:
3104       case CmpInst::Predicate::ICMP_UGT:
3105         // cmpgt(xor(x,signbit),xor(y,signbit))
3106         // xor(cmpeq(pmaxu(x,y),x),-1)
3107         ExtraCost = 2;
3108         break;
3109       case CmpInst::Predicate::ICMP_ULE:
3110       case CmpInst::Predicate::ICMP_UGE:
3111         if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3112             (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3113           // cmpeq(psubus(x,y),0)
3114           // cmpeq(pminu(x,y),x)
3115           ExtraCost = 1;
3116         } else {
3117           // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3118           ExtraCost = 3;
3119         }
3120         break;
3121       case CmpInst::Predicate::FCMP_ONE:
3122       case CmpInst::Predicate::FCMP_UEQ:
3123         // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3124         // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3125         if (CondTy && !ST->hasAVX())
3126           return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3127                                     CmpInst::Predicate::FCMP_UNO, CostKind) +
3128                  getCmpSelInstrCost(Opcode, ValTy, CondTy,
3129                                     CmpInst::Predicate::FCMP_OEQ, CostKind) +
3130                  getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3131 
3132         break;
3133       case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3134       case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3135         // Assume worst case scenario and add the maximum extra cost.
3136         ExtraCost = 3;
3137         break;
3138       default:
3139         break;
3140       }
3141     }
3142   }
3143 
3144   static const CostKindTblEntry SLMCostTbl[] = {
3145     // slm pcmpeq/pcmpgt throughput is 2
3146     { ISD::SETCC,   MVT::v2i64,   { 2, 5, 1, 2 } },
3147     // slm pblendvb/blendvpd/blendvps throughput is 4
3148     { ISD::SELECT,  MVT::v2f64,   { 4, 4, 1, 3 } }, // vblendvpd
3149     { ISD::SELECT,  MVT::v4f32,   { 4, 4, 1, 3 } }, // vblendvps
3150     { ISD::SELECT,  MVT::v2i64,   { 4, 4, 1, 3 } }, // pblendvb
3151     { ISD::SELECT,  MVT::v8i32,   { 4, 4, 1, 3 } }, // pblendvb
3152     { ISD::SELECT,  MVT::v8i16,   { 4, 4, 1, 3 } }, // pblendvb
3153     { ISD::SELECT,  MVT::v16i8,   { 4, 4, 1, 3 } }, // pblendvb
3154   };
3155 
3156   static const CostKindTblEntry AVX512BWCostTbl[] = {
3157     { ISD::SETCC,   MVT::v32i16,  { 1, 1, 1, 1 } },
3158     { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 1 } },
3159     { ISD::SETCC,   MVT::v64i8,   { 1, 1, 1, 1 } },
3160     { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 1 } },
3161 
3162     { ISD::SELECT,  MVT::v32i16,  { 1, 1, 1, 1 } },
3163     { ISD::SELECT,  MVT::v64i8,   { 1, 1, 1, 1 } },
3164   };
3165 
3166   static const CostKindTblEntry AVX512CostTbl[] = {
3167     { ISD::SETCC,   MVT::v8f64,   { 1, 4, 1, 1 } },
3168     { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 1 } },
3169     { ISD::SETCC,   MVT::v16f32,  { 1, 4, 1, 1 } },
3170     { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 1 } },
3171 
3172     { ISD::SETCC,   MVT::v8i64,   { 1, 1, 1, 1 } },
3173     { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 1 } },
3174     { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
3175     { ISD::SETCC,   MVT::v16i32,  { 1, 1, 1, 1 } },
3176     { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 1 } },
3177     { ISD::SETCC,   MVT::v32i16,  { 3, 7, 5, 5 } },
3178     { ISD::SETCC,   MVT::v64i8,   { 3, 7, 5, 5 } },
3179 
3180     { ISD::SELECT,  MVT::v8i64,   { 1, 1, 1, 1 } },
3181     { ISD::SELECT,  MVT::v4i64,   { 1, 1, 1, 1 } },
3182     { ISD::SELECT,  MVT::v2i64,   { 1, 1, 1, 1 } },
3183     { ISD::SELECT,  MVT::v16i32,  { 1, 1, 1, 1 } },
3184     { ISD::SELECT,  MVT::v8i32,   { 1, 1, 1, 1 } },
3185     { ISD::SELECT,  MVT::v4i32,   { 1, 1, 1, 1 } },
3186     { ISD::SELECT,  MVT::v8f64,   { 1, 1, 1, 1 } },
3187     { ISD::SELECT,  MVT::v4f64,   { 1, 1, 1, 1 } },
3188     { ISD::SELECT,  MVT::v2f64,   { 1, 1, 1, 1 } },
3189     { ISD::SELECT,  MVT::f64,     { 1, 1, 1, 1 } },
3190     { ISD::SELECT,  MVT::v16f32,  { 1, 1, 1, 1 } },
3191     { ISD::SELECT,  MVT::v8f32 ,  { 1, 1, 1, 1 } },
3192     { ISD::SELECT,  MVT::v4f32,   { 1, 1, 1, 1 } },
3193     { ISD::SELECT,  MVT::f32  ,   { 1, 1, 1, 1 } },
3194 
3195     { ISD::SELECT,  MVT::v32i16,  { 2, 2, 4, 4 } },
3196     { ISD::SELECT,  MVT::v16i16,  { 1, 1, 1, 1 } },
3197     { ISD::SELECT,  MVT::v8i16,   { 1, 1, 1, 1 } },
3198     { ISD::SELECT,  MVT::v64i8,   { 2, 2, 4, 4 } },
3199     { ISD::SELECT,  MVT::v32i8,   { 1, 1, 1, 1 } },
3200     { ISD::SELECT,  MVT::v16i8,   { 1, 1, 1, 1 } },
3201   };
3202 
3203   static const CostKindTblEntry AVX2CostTbl[] = {
3204     { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 2 } },
3205     { ISD::SETCC,   MVT::v2f64,   { 1, 4, 1, 1 } },
3206     { ISD::SETCC,   MVT::f64,     { 1, 4, 1, 1 } },
3207     { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 2 } },
3208     { ISD::SETCC,   MVT::v4f32,   { 1, 4, 1, 1 } },
3209     { ISD::SETCC,   MVT::f32,     { 1, 4, 1, 1 } },
3210 
3211     { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 2 } },
3212     { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 2 } },
3213     { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 2 } },
3214     { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 2 } },
3215 
3216     { ISD::SELECT,  MVT::v4f64,   { 2, 2, 1, 2 } }, // vblendvpd
3217     { ISD::SELECT,  MVT::v8f32,   { 2, 2, 1, 2 } }, // vblendvps
3218     { ISD::SELECT,  MVT::v4i64,   { 2, 2, 1, 2 } }, // pblendvb
3219     { ISD::SELECT,  MVT::v8i32,   { 2, 2, 1, 2 } }, // pblendvb
3220     { ISD::SELECT,  MVT::v16i16,  { 2, 2, 1, 2 } }, // pblendvb
3221     { ISD::SELECT,  MVT::v32i8,   { 2, 2, 1, 2 } }, // pblendvb
3222   };
3223 
3224   static const CostKindTblEntry XOPCostTbl[] = {
3225     { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
3226     { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
3227   };
3228 
3229   static const CostKindTblEntry AVX1CostTbl[] = {
3230     { ISD::SETCC,   MVT::v4f64,   { 2, 3, 1, 2 } },
3231     { ISD::SETCC,   MVT::v2f64,   { 1, 3, 1, 1 } },
3232     { ISD::SETCC,   MVT::f64,     { 1, 3, 1, 1 } },
3233     { ISD::SETCC,   MVT::v8f32,   { 2, 3, 1, 2 } },
3234     { ISD::SETCC,   MVT::v4f32,   { 1, 3, 1, 1 } },
3235     { ISD::SETCC,   MVT::f32,     { 1, 3, 1, 1 } },
3236 
3237     // AVX1 does not support 8-wide integer compare.
3238     { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
3239     { ISD::SETCC,   MVT::v8i32,   { 4, 2, 5, 6 } },
3240     { ISD::SETCC,   MVT::v16i16,  { 4, 2, 5, 6 } },
3241     { ISD::SETCC,   MVT::v32i8,   { 4, 2, 5, 6 } },
3242 
3243     { ISD::SELECT,  MVT::v4f64,   { 3, 3, 1, 2 } }, // vblendvpd
3244     { ISD::SELECT,  MVT::v8f32,   { 3, 3, 1, 2 } }, // vblendvps
3245     { ISD::SELECT,  MVT::v4i64,   { 3, 3, 1, 2 } }, // vblendvpd
3246     { ISD::SELECT,  MVT::v8i32,   { 3, 3, 1, 2 } }, // vblendvps
3247     { ISD::SELECT,  MVT::v16i16,  { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3248     { ISD::SELECT,  MVT::v32i8,   { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3249   };
3250 
3251   static const CostKindTblEntry SSE42CostTbl[] = {
3252     { ISD::SETCC,   MVT::v2i64,   { 1, 2, 1, 2 } },
3253   };
3254 
3255   static const CostKindTblEntry SSE41CostTbl[] = {
3256     { ISD::SETCC,   MVT::v2f64,   { 1, 5, 1, 1 } },
3257     { ISD::SETCC,   MVT::v4f32,   { 1, 5, 1, 1 } },
3258 
3259     { ISD::SELECT,  MVT::v2f64,   { 2, 2, 1, 2 } }, // blendvpd
3260     { ISD::SELECT,  MVT::f64,     { 2, 2, 1, 2 } }, // blendvpd
3261     { ISD::SELECT,  MVT::v4f32,   { 2, 2, 1, 2 } }, // blendvps
3262     { ISD::SELECT,  MVT::f32  ,   { 2, 2, 1, 2 } }, // blendvps
3263     { ISD::SELECT,  MVT::v2i64,   { 2, 2, 1, 2 } }, // pblendvb
3264     { ISD::SELECT,  MVT::v4i32,   { 2, 2, 1, 2 } }, // pblendvb
3265     { ISD::SELECT,  MVT::v8i16,   { 2, 2, 1, 2 } }, // pblendvb
3266     { ISD::SELECT,  MVT::v16i8,   { 2, 2, 1, 2 } }, // pblendvb
3267   };
3268 
3269   static const CostKindTblEntry SSE2CostTbl[] = {
3270     { ISD::SETCC,   MVT::v2f64,   { 2, 5, 1, 1 } },
3271     { ISD::SETCC,   MVT::f64,     { 1, 5, 1, 1 } },
3272 
3273     { ISD::SETCC,   MVT::v2i64,   { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3274     { ISD::SETCC,   MVT::v4i32,   { 1, 1, 1, 1 } },
3275     { ISD::SETCC,   MVT::v8i16,   { 1, 1, 1, 1 } },
3276     { ISD::SETCC,   MVT::v16i8,   { 1, 1, 1, 1 } },
3277 
3278     { ISD::SELECT,  MVT::v2f64,   { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3279     { ISD::SELECT,  MVT::f64,     { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3280     { ISD::SELECT,  MVT::v2i64,   { 2, 2, 3, 3 } }, // pand + pandn + por
3281     { ISD::SELECT,  MVT::v4i32,   { 2, 2, 3, 3 } }, // pand + pandn + por
3282     { ISD::SELECT,  MVT::v8i16,   { 2, 2, 3, 3 } }, // pand + pandn + por
3283     { ISD::SELECT,  MVT::v16i8,   { 2, 2, 3, 3 } }, // pand + pandn + por
3284   };
3285 
3286   static const CostKindTblEntry SSE1CostTbl[] = {
3287     { ISD::SETCC,   MVT::v4f32,   { 2, 5, 1, 1 } },
3288     { ISD::SETCC,   MVT::f32,     { 1, 5, 1, 1 } },
3289 
3290     { ISD::SELECT,  MVT::v4f32,   { 2, 2, 3, 3 } }, // andps + andnps + orps
3291     { ISD::SELECT,  MVT::f32,     { 2, 2, 3, 3 } }, // andps + andnps + orps
3292   };
3293 
3294   if (ST->useSLMArithCosts())
3295     if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3296       if (auto KindCost = Entry->Cost[CostKind])
3297         return LT.first * (ExtraCost + *KindCost);
3298 
3299   if (ST->hasBWI())
3300     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3301       if (auto KindCost = Entry->Cost[CostKind])
3302         return LT.first * (ExtraCost + *KindCost);
3303 
3304   if (ST->hasAVX512())
3305     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3306       if (auto KindCost = Entry->Cost[CostKind])
3307         return LT.first * (ExtraCost + *KindCost);
3308 
3309   if (ST->hasAVX2())
3310     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3311       if (auto KindCost = Entry->Cost[CostKind])
3312         return LT.first * (ExtraCost + *KindCost);
3313 
3314   if (ST->hasXOP())
3315     if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3316       if (auto KindCost = Entry->Cost[CostKind])
3317         return LT.first * (ExtraCost + *KindCost);
3318 
3319   if (ST->hasAVX())
3320     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3321       if (auto KindCost = Entry->Cost[CostKind])
3322         return LT.first * (ExtraCost + *KindCost);
3323 
3324   if (ST->hasSSE42())
3325     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3326       if (auto KindCost = Entry->Cost[CostKind])
3327         return LT.first * (ExtraCost + *KindCost);
3328 
3329   if (ST->hasSSE41())
3330     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3331       if (auto KindCost = Entry->Cost[CostKind])
3332         return LT.first * (ExtraCost + *KindCost);
3333 
3334   if (ST->hasSSE2())
3335     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3336       if (auto KindCost = Entry->Cost[CostKind])
3337         return LT.first * (ExtraCost + *KindCost);
3338 
3339   if (ST->hasSSE1())
3340     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3341       if (auto KindCost = Entry->Cost[CostKind])
3342         return LT.first * (ExtraCost + *KindCost);
3343 
3344   // Assume a 3cy latency for fp select ops.
3345   if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3346     if (ValTy->getScalarType()->isFloatingPointTy())
3347       return 3;
3348 
3349   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3350 }
3351 
3352 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
3353 
3354 InstructionCost
3355 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3356                                   TTI::TargetCostKind CostKind) {
3357   // Costs should match the codegen from:
3358   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3359   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3360   // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3361   // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3362   // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3363 
3364   // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3365   //       specialized in these tables yet.
3366   static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3367     { ISD::FSHL,       MVT::v8i64,   {  1,  1,  1,  1 } },
3368     { ISD::FSHL,       MVT::v4i64,   {  1,  1,  1,  1 } },
3369     { ISD::FSHL,       MVT::v2i64,   {  1,  1,  1,  1 } },
3370     { ISD::FSHL,       MVT::v16i32,  {  1,  1,  1,  1 } },
3371     { ISD::FSHL,       MVT::v8i32,   {  1,  1,  1,  1 } },
3372     { ISD::FSHL,       MVT::v4i32,   {  1,  1,  1,  1 } },
3373     { ISD::FSHL,       MVT::v32i16,  {  1,  1,  1,  1 } },
3374     { ISD::FSHL,       MVT::v16i16,  {  1,  1,  1,  1 } },
3375     { ISD::FSHL,       MVT::v8i16,   {  1,  1,  1,  1 } },
3376     { ISD::ROTL,       MVT::v32i16,  {  1,  1,  1,  1 } },
3377     { ISD::ROTL,       MVT::v16i16,  {  1,  1,  1,  1 } },
3378     { ISD::ROTL,       MVT::v8i16,   {  1,  1,  1,  1 } },
3379     { ISD::ROTR,       MVT::v32i16,  {  1,  1,  1,  1 } },
3380     { ISD::ROTR,       MVT::v16i16,  {  1,  1,  1,  1 } },
3381     { ISD::ROTR,       MVT::v8i16,   {  1,  1,  1,  1 } },
3382   };
3383   static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3384     { ISD::CTPOP,      MVT::v32i16,  {  1,  1,  1,  1 } },
3385     { ISD::CTPOP,      MVT::v64i8,   {  1,  1,  1,  1 } },
3386     { ISD::CTPOP,      MVT::v16i16,  {  1,  1,  1,  1 } },
3387     { ISD::CTPOP,      MVT::v32i8,   {  1,  1,  1,  1 } },
3388     { ISD::CTPOP,      MVT::v8i16,   {  1,  1,  1,  1 } },
3389     { ISD::CTPOP,      MVT::v16i8,   {  1,  1,  1,  1 } },
3390   };
3391   static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3392     { ISD::CTPOP,      MVT::v8i64,   {  1,  1,  1,  1 } },
3393     { ISD::CTPOP,      MVT::v16i32,  {  1,  1,  1,  1 } },
3394     { ISD::CTPOP,      MVT::v4i64,   {  1,  1,  1,  1 } },
3395     { ISD::CTPOP,      MVT::v8i32,   {  1,  1,  1,  1 } },
3396     { ISD::CTPOP,      MVT::v2i64,   {  1,  1,  1,  1 } },
3397     { ISD::CTPOP,      MVT::v4i32,   {  1,  1,  1,  1 } },
3398   };
3399   static const CostKindTblEntry AVX512CDCostTbl[] = {
3400     { ISD::CTLZ,       MVT::v8i64,   {  1,  5,  1,  1 } },
3401     { ISD::CTLZ,       MVT::v16i32,  {  1,  5,  1,  1 } },
3402     { ISD::CTLZ,       MVT::v32i16,  { 18, 27, 23, 27 } },
3403     { ISD::CTLZ,       MVT::v64i8,   {  3, 16,  9, 11 } },
3404     { ISD::CTLZ,       MVT::v4i64,   {  1,  5,  1,  1 } },
3405     { ISD::CTLZ,       MVT::v8i32,   {  1,  5,  1,  1 } },
3406     { ISD::CTLZ,       MVT::v16i16,  {  8, 19, 11, 13 } },
3407     { ISD::CTLZ,       MVT::v32i8,   {  2, 11,  9, 10 } },
3408     { ISD::CTLZ,       MVT::v2i64,   {  1,  5,  1,  1 } },
3409     { ISD::CTLZ,       MVT::v4i32,   {  1,  5,  1,  1 } },
3410     { ISD::CTLZ,       MVT::v8i16,   {  3, 15,  4,  6 } },
3411     { ISD::CTLZ,       MVT::v16i8,   {  2, 10,  9, 10 } },
3412 
3413     { ISD::CTTZ,       MVT::v8i64,   {  2,  8,  6,  7 } },
3414     { ISD::CTTZ,       MVT::v16i32,  {  2,  8,  6,  7 } },
3415     { ISD::CTTZ,       MVT::v4i64,   {  1,  8,  6,  6 } },
3416     { ISD::CTTZ,       MVT::v8i32,   {  1,  8,  6,  6 } },
3417     { ISD::CTTZ,       MVT::v2i64,   {  1,  8,  6,  6 } },
3418     { ISD::CTTZ,       MVT::v4i32,   {  1,  8,  6,  6 } },
3419   };
3420   static const CostKindTblEntry AVX512BWCostTbl[] = {
3421     { ISD::ABS,        MVT::v32i16,  {  1,  1,  1,  1 } },
3422     { ISD::ABS,        MVT::v64i8,   {  1,  1,  1,  1 } },
3423     { ISD::BITREVERSE, MVT::v2i64,   {  3, 10, 10, 11 } },
3424     { ISD::BITREVERSE, MVT::v4i64,   {  3, 11, 10, 11 } },
3425     { ISD::BITREVERSE, MVT::v8i64,   {  3, 12, 10, 14 } },
3426     { ISD::BITREVERSE, MVT::v4i32,   {  3, 10, 10, 11 } },
3427     { ISD::BITREVERSE, MVT::v8i32,   {  3, 11, 10, 11 } },
3428     { ISD::BITREVERSE, MVT::v16i32,  {  3, 12, 10, 14 } },
3429     { ISD::BITREVERSE, MVT::v8i16,   {  3, 10, 10, 11 } },
3430     { ISD::BITREVERSE, MVT::v16i16,  {  3, 11, 10, 11 } },
3431     { ISD::BITREVERSE, MVT::v32i16,  {  3, 12, 10, 14 } },
3432     { ISD::BITREVERSE, MVT::v16i8,   {  2,  5,  9,  9 } },
3433     { ISD::BITREVERSE, MVT::v32i8,   {  2,  5,  9,  9 } },
3434     { ISD::BITREVERSE, MVT::v64i8,   {  2,  5,  9, 12 } },
3435     { ISD::BSWAP,      MVT::v2i64,   {  1,  1,  1,  2 } },
3436     { ISD::BSWAP,      MVT::v4i64,   {  1,  1,  1,  2 } },
3437     { ISD::BSWAP,      MVT::v8i64,   {  1,  1,  1,  2 } },
3438     { ISD::BSWAP,      MVT::v4i32,   {  1,  1,  1,  2 } },
3439     { ISD::BSWAP,      MVT::v8i32,   {  1,  1,  1,  2 } },
3440     { ISD::BSWAP,      MVT::v16i32,  {  1,  1,  1,  2 } },
3441     { ISD::BSWAP,      MVT::v8i16,   {  1,  1,  1,  2 } },
3442     { ISD::BSWAP,      MVT::v16i16,  {  1,  1,  1,  2 } },
3443     { ISD::BSWAP,      MVT::v32i16,  {  1,  1,  1,  2 } },
3444     { ISD::CTLZ,       MVT::v8i64,   {  8, 22, 23, 23 } },
3445     { ISD::CTLZ,       MVT::v16i32,  {  8, 23, 25, 25 } },
3446     { ISD::CTLZ,       MVT::v32i16,  {  4, 15, 15, 16 } },
3447     { ISD::CTLZ,       MVT::v64i8,   {  3, 12, 10,  9 } },
3448     { ISD::CTPOP,      MVT::v2i64,   {  3,  7, 10, 10 } },
3449     { ISD::CTPOP,      MVT::v4i64,   {  3,  7, 10, 10 } },
3450     { ISD::CTPOP,      MVT::v8i64,   {  3,  8, 10, 12 } },
3451     { ISD::CTPOP,      MVT::v4i32,   {  7, 11, 14, 14 } },
3452     { ISD::CTPOP,      MVT::v8i32,   {  7, 11, 14, 14 } },
3453     { ISD::CTPOP,      MVT::v16i32,  {  7, 12, 14, 16 } },
3454     { ISD::CTPOP,      MVT::v8i16,   {  2,  7, 11, 11 } },
3455     { ISD::CTPOP,      MVT::v16i16,  {  2,  7, 11, 11 } },
3456     { ISD::CTPOP,      MVT::v32i16,  {  3,  7, 11, 13 } },
3457     { ISD::CTPOP,      MVT::v16i8,   {  2,  4,  8,  8 } },
3458     { ISD::CTPOP,      MVT::v32i8,   {  2,  4,  8,  8 } },
3459     { ISD::CTPOP,      MVT::v64i8,   {  2,  5,  8, 10 } },
3460     { ISD::CTTZ,       MVT::v8i16,   {  3,  9, 14, 14 } },
3461     { ISD::CTTZ,       MVT::v16i16,  {  3,  9, 14, 14 } },
3462     { ISD::CTTZ,       MVT::v32i16,  {  3, 10, 14, 16 } },
3463     { ISD::CTTZ,       MVT::v16i8,   {  2,  6, 11, 11 } },
3464     { ISD::CTTZ,       MVT::v32i8,   {  2,  6, 11, 11 } },
3465     { ISD::CTTZ,       MVT::v64i8,   {  3,  7, 11, 13 } },
3466     { ISD::ROTL,       MVT::v32i16,  {  2,  8,  6,  8 } },
3467     { ISD::ROTL,       MVT::v16i16,  {  2,  8,  6,  7 } },
3468     { ISD::ROTL,       MVT::v8i16,   {  2,  7,  6,  7 } },
3469     { ISD::ROTL,       MVT::v64i8,   {  5,  6, 11, 12 } },
3470     { ISD::ROTL,       MVT::v32i8,   {  5, 15,  7, 10 } },
3471     { ISD::ROTL,       MVT::v16i8,   {  5, 15,  7, 10 } },
3472     { ISD::ROTR,       MVT::v32i16,  {  2,  8,  6,  8 } },
3473     { ISD::ROTR,       MVT::v16i16,  {  2,  8,  6,  7 } },
3474     { ISD::ROTR,       MVT::v8i16,   {  2,  7,  6,  7 } },
3475     { ISD::ROTR,       MVT::v64i8,   {  5,  6, 12, 14 } },
3476     { ISD::ROTR,       MVT::v32i8,   {  5, 14,  6,  9 } },
3477     { ISD::ROTR,       MVT::v16i8,   {  5, 14,  6,  9 } },
3478     { ISD::SADDSAT,    MVT::v32i16,  {  1 } },
3479     { ISD::SADDSAT,    MVT::v64i8,   {  1 } },
3480     { ISD::SMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
3481     { ISD::SMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
3482     { ISD::SMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
3483     { ISD::SMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
3484     { ISD::SSUBSAT,    MVT::v32i16,  {  1 } },
3485     { ISD::SSUBSAT,    MVT::v64i8,   {  1 } },
3486     { ISD::UADDSAT,    MVT::v32i16,  {  1 } },
3487     { ISD::UADDSAT,    MVT::v64i8,   {  1 } },
3488     { ISD::UMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
3489     { ISD::UMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
3490     { ISD::UMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
3491     { ISD::UMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
3492     { ISD::USUBSAT,    MVT::v32i16,  {  1 } },
3493     { ISD::USUBSAT,    MVT::v64i8,   {  1 } },
3494   };
3495   static const CostKindTblEntry AVX512CostTbl[] = {
3496     { ISD::ABS,        MVT::v8i64,   {  1,  1,  1,  1 } },
3497     { ISD::ABS,        MVT::v4i64,   {  1,  1,  1,  1 } },
3498     { ISD::ABS,        MVT::v2i64,   {  1,  1,  1,  1 } },
3499     { ISD::ABS,        MVT::v16i32,  {  1,  1,  1,  1 } },
3500     { ISD::ABS,        MVT::v8i32,   {  1,  1,  1,  1 } },
3501     { ISD::ABS,        MVT::v32i16,  {  2,  7,  4,  4 } },
3502     { ISD::ABS,        MVT::v16i16,  {  1,  1,  1,  1 } },
3503     { ISD::ABS,        MVT::v64i8,   {  2,  7,  4,  4 } },
3504     { ISD::ABS,        MVT::v32i8,   {  1,  1,  1,  1 } },
3505     { ISD::BITREVERSE, MVT::v8i64,   {  9, 13, 20, 20 } },
3506     { ISD::BITREVERSE, MVT::v16i32,  {  9, 13, 20, 20 } },
3507     { ISD::BITREVERSE, MVT::v32i16,  {  9, 13, 20, 20 } },
3508     { ISD::BITREVERSE, MVT::v64i8,   {  6, 11, 17, 17 } },
3509     { ISD::BSWAP,      MVT::v8i64,   {  4,  7,  5,  5 } },
3510     { ISD::BSWAP,      MVT::v16i32,  {  4,  7,  5,  5 } },
3511     { ISD::BSWAP,      MVT::v32i16,  {  4,  7,  5,  5 } },
3512     { ISD::CTLZ,       MVT::v8i64,   { 10, 28, 32, 32 } },
3513     { ISD::CTLZ,       MVT::v16i32,  { 12, 30, 38, 38 } },
3514     { ISD::CTLZ,       MVT::v32i16,  {  8, 15, 29, 29 } },
3515     { ISD::CTLZ,       MVT::v64i8,   {  6, 11, 19, 19 } },
3516     { ISD::CTPOP,      MVT::v8i64,   { 16, 16, 19, 19 } },
3517     { ISD::CTPOP,      MVT::v16i32,  { 24, 19, 27, 27 } },
3518     { ISD::CTPOP,      MVT::v32i16,  { 18, 15, 22, 22 } },
3519     { ISD::CTPOP,      MVT::v64i8,   { 12, 11, 16, 16 } },
3520     { ISD::CTTZ,       MVT::v8i64,   {  2,  8,  6,  7 } },
3521     { ISD::CTTZ,       MVT::v16i32,  {  2,  8,  6,  7 } },
3522     { ISD::CTTZ,       MVT::v32i16,  {  7, 17, 27, 27 } },
3523     { ISD::CTTZ,       MVT::v64i8,   {  6, 13, 21, 21 } },
3524     { ISD::ROTL,       MVT::v8i64,   {  1,  1,  1,  1 } },
3525     { ISD::ROTL,       MVT::v4i64,   {  1,  1,  1,  1 } },
3526     { ISD::ROTL,       MVT::v2i64,   {  1,  1,  1,  1 } },
3527     { ISD::ROTL,       MVT::v16i32,  {  1,  1,  1,  1 } },
3528     { ISD::ROTL,       MVT::v8i32,   {  1,  1,  1,  1 } },
3529     { ISD::ROTL,       MVT::v4i32,   {  1,  1,  1,  1 } },
3530     { ISD::ROTR,       MVT::v8i64,   {  1,  1,  1,  1 } },
3531     { ISD::ROTR,       MVT::v4i64,   {  1,  1,  1,  1 } },
3532     { ISD::ROTR,       MVT::v2i64,   {  1,  1,  1,  1 } },
3533     { ISD::ROTR,       MVT::v16i32,  {  1,  1,  1,  1 } },
3534     { ISD::ROTR,       MVT::v8i32,   {  1,  1,  1,  1 } },
3535     { ISD::ROTR,       MVT::v4i32,   {  1,  1,  1,  1 } },
3536     { ISD::SMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
3537     { ISD::SMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
3538     { ISD::SMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
3539     { ISD::SMAX,       MVT::v64i8,   {  3,  7,  5,  5 } },
3540     { ISD::SMAX,       MVT::v4i64,   {  1,  3,  1,  1 } },
3541     { ISD::SMAX,       MVT::v2i64,   {  1,  3,  1,  1 } },
3542     { ISD::SMIN,       MVT::v8i64,   {  1,  3,  1,  1 } },
3543     { ISD::SMIN,       MVT::v16i32,  {  1,  1,  1,  1 } },
3544     { ISD::SMIN,       MVT::v32i16,  {  3,  7,  5,  5 } },
3545     { ISD::SMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
3546     { ISD::SMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
3547     { ISD::SMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
3548     { ISD::UMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
3549     { ISD::UMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
3550     { ISD::UMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
3551     { ISD::UMAX,       MVT::v64i8,   {  3,  7,  5,  5 } },
3552     { ISD::UMAX,       MVT::v4i64,   {  1,  3,  1,  1 } },
3553     { ISD::UMAX,       MVT::v2i64,   {  1,  3,  1,  1 } },
3554     { ISD::UMIN,       MVT::v8i64,   {  1,  3,  1,  1 } },
3555     { ISD::UMIN,       MVT::v16i32,  {  1,  1,  1,  1 } },
3556     { ISD::UMIN,       MVT::v32i16,  {  3,  7,  5,  5 } },
3557     { ISD::UMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
3558     { ISD::UMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
3559     { ISD::UMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
3560     { ISD::USUBSAT,    MVT::v16i32,  {  2 } }, // pmaxud + psubd
3561     { ISD::USUBSAT,    MVT::v2i64,   {  2 } }, // pmaxuq + psubq
3562     { ISD::USUBSAT,    MVT::v4i64,   {  2 } }, // pmaxuq + psubq
3563     { ISD::USUBSAT,    MVT::v8i64,   {  2 } }, // pmaxuq + psubq
3564     { ISD::UADDSAT,    MVT::v16i32,  {  3 } }, // not + pminud + paddd
3565     { ISD::UADDSAT,    MVT::v2i64,   {  3 } }, // not + pminuq + paddq
3566     { ISD::UADDSAT,    MVT::v4i64,   {  3 } }, // not + pminuq + paddq
3567     { ISD::UADDSAT,    MVT::v8i64,   {  3 } }, // not + pminuq + paddq
3568     { ISD::SADDSAT,    MVT::v32i16,  {  2 } },
3569     { ISD::SADDSAT,    MVT::v64i8,   {  2 } },
3570     { ISD::SSUBSAT,    MVT::v32i16,  {  2 } },
3571     { ISD::SSUBSAT,    MVT::v64i8,   {  2 } },
3572     { ISD::UADDSAT,    MVT::v32i16,  {  2 } },
3573     { ISD::UADDSAT,    MVT::v64i8,   {  2 } },
3574     { ISD::USUBSAT,    MVT::v32i16,  {  2 } },
3575     { ISD::USUBSAT,    MVT::v64i8,   {  2 } },
3576     { ISD::FMAXNUM,    MVT::f32,     {  2,  2,  3,  3 } },
3577     { ISD::FMAXNUM,    MVT::v4f32,   {  1,  1,  3,  3 } },
3578     { ISD::FMAXNUM,    MVT::v8f32,   {  2,  2,  3,  3 } },
3579     { ISD::FMAXNUM,    MVT::v16f32,  {  4,  4,  3,  3 } },
3580     { ISD::FMAXNUM,    MVT::f64,     {  2,  2,  3,  3 } },
3581     { ISD::FMAXNUM,    MVT::v2f64,   {  1,  1,  3,  3 } },
3582     { ISD::FMAXNUM,    MVT::v4f64,   {  2,  2,  3,  3 } },
3583     { ISD::FMAXNUM,    MVT::v8f64,   {  3,  3,  3,  3 } },
3584     { ISD::FSQRT,      MVT::f32,     {  3, 12,  1,  1 } }, // Skylake from http://www.agner.org/
3585     { ISD::FSQRT,      MVT::v4f32,   {  3, 12,  1,  1 } }, // Skylake from http://www.agner.org/
3586     { ISD::FSQRT,      MVT::v8f32,   {  6, 12,  1,  1 } }, // Skylake from http://www.agner.org/
3587     { ISD::FSQRT,      MVT::v16f32,  { 12, 20,  1,  3 } }, // Skylake from http://www.agner.org/
3588     { ISD::FSQRT,      MVT::f64,     {  6, 18,  1,  1 } }, // Skylake from http://www.agner.org/
3589     { ISD::FSQRT,      MVT::v2f64,   {  6, 18,  1,  1 } }, // Skylake from http://www.agner.org/
3590     { ISD::FSQRT,      MVT::v4f64,   { 12, 18,  1,  1 } }, // Skylake from http://www.agner.org/
3591     { ISD::FSQRT,      MVT::v8f64,   { 24, 32,  1,  3 } }, // Skylake from http://www.agner.org/
3592   };
  // XOP cost table: BITREVERSE (vector and scalar) plus ROTL/ROTR on 128-bit
  // and 256-bit integer vectors. Each row is { ISD opcode, MVT, { costs } }.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view) - confirm before editing.
  static const CostKindTblEntry XOPCostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,   {  3,  6,  5,  6 } },
    { ISD::BITREVERSE, MVT::v8i32,   {  3,  6,  5,  6 } },
    { ISD::BITREVERSE, MVT::v16i16,  {  3,  6,  5,  6 } },
    { ISD::BITREVERSE, MVT::v32i8,   {  3,  6,  5,  6 } },
    { ISD::BITREVERSE, MVT::v2i64,   {  2,  7,  1,  1 } },
    { ISD::BITREVERSE, MVT::v4i32,   {  2,  7,  1,  1 } },
    { ISD::BITREVERSE, MVT::v8i16,   {  2,  7,  1,  1 } },
    { ISD::BITREVERSE, MVT::v16i8,   {  2,  7,  1,  1 } },
    { ISD::BITREVERSE, MVT::i64,     {  2,  2,  3,  4 } },
    { ISD::BITREVERSE, MVT::i32,     {  2,  2,  3,  4 } },
    { ISD::BITREVERSE, MVT::i16,     {  2,  2,  3,  4 } },
    { ISD::BITREVERSE, MVT::i8,      {  2,  2,  3,  4 } },
    // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
    // The ROTR rows below carry higher size costs than the matching ROTL rows.
    { ISD::ROTL,       MVT::v4i64,   {  4,  7,  5,  6 } },
    { ISD::ROTL,       MVT::v8i32,   {  4,  7,  5,  6 } },
    { ISD::ROTL,       MVT::v16i16,  {  4,  7,  5,  6 } },
    { ISD::ROTL,       MVT::v32i8,   {  4,  7,  5,  6 } },
    { ISD::ROTL,       MVT::v2i64,   {  1,  3,  1,  1 } },
    { ISD::ROTL,       MVT::v4i32,   {  1,  3,  1,  1 } },
    { ISD::ROTL,       MVT::v8i16,   {  1,  3,  1,  1 } },
    { ISD::ROTL,       MVT::v16i8,   {  1,  3,  1,  1 } },
    { ISD::ROTR,       MVT::v4i64,   {  4,  7,  8,  9 } },
    { ISD::ROTR,       MVT::v8i32,   {  4,  7,  8,  9 } },
    { ISD::ROTR,       MVT::v16i16,  {  4,  7,  8,  9 } },
    { ISD::ROTR,       MVT::v32i8,   {  4,  7,  8,  9 } },
    { ISD::ROTR,       MVT::v2i64,   {  1,  3,  3,  3 } },
    { ISD::ROTR,       MVT::v4i32,   {  1,  3,  3,  3 } },
    { ISD::ROTR,       MVT::v8i16,   {  1,  3,  3,  3 } },
    { ISD::ROTR,       MVT::v16i8,   {  1,  3,  3,  3 } }
  };
  // AVX2 cost table for 128-bit and 256-bit vector intrinsics (integer
  // bit-manipulation, min/max, saturating add/sub) plus scalar/vector
  // FMAXNUM and FSQRT. Each row is { ISD opcode, MVT, { costs } }.
  // The FMAXNUM rows are also used for minnum: the intrinsic switch in this
  // function maps both maxnum and minnum to ISD::FMAXNUM.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view); rows with a single value
  // specify only the first cost - confirm how the others are derived.
  static const CostKindTblEntry AVX2CostTbl[] = {
    { ISD::ABS,        MVT::v2i64,   {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS,        MVT::v4i64,   {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS,        MVT::v4i32,   {  1,  1,  1,  1 } },
    { ISD::ABS,        MVT::v8i32,   {  1,  1,  1,  2 } },
    { ISD::ABS,        MVT::v8i16,   {  1,  1,  1,  1 } },
    { ISD::ABS,        MVT::v16i16,  {  1,  1,  1,  2 } },
    { ISD::ABS,        MVT::v16i8,   {  1,  1,  1,  1 } },
    { ISD::ABS,        MVT::v32i8,   {  1,  1,  1,  2 } },
    { ISD::BITREVERSE, MVT::v2i64,   {  3, 11, 10, 11 } },
    { ISD::BITREVERSE, MVT::v4i64,   {  5, 11, 10, 17 } },
    { ISD::BITREVERSE, MVT::v4i32,   {  3, 11, 10, 11 } },
    { ISD::BITREVERSE, MVT::v8i32,   {  5, 11, 10, 17 } },
    { ISD::BITREVERSE, MVT::v8i16,   {  3, 11, 10, 11 } },
    { ISD::BITREVERSE, MVT::v16i16,  {  5, 11, 10, 17 } },
    { ISD::BITREVERSE, MVT::v16i8,   {  3,  6,  9,  9 } },
    { ISD::BITREVERSE, MVT::v32i8,   {  4,  5,  9, 15 } },
    { ISD::BSWAP,      MVT::v2i64,   {  1,  2,  1,  2 } },
    { ISD::BSWAP,      MVT::v4i64,   {  1,  3,  1,  2 } },
    { ISD::BSWAP,      MVT::v4i32,   {  1,  2,  1,  2 } },
    { ISD::BSWAP,      MVT::v8i32,   {  1,  3,  1,  2 } },
    { ISD::BSWAP,      MVT::v8i16,   {  1,  2,  1,  2 } },
    { ISD::BSWAP,      MVT::v16i16,  {  1,  3,  1,  2 } },
    { ISD::CTLZ,       MVT::v2i64,   {  7, 18, 24, 25 } },
    { ISD::CTLZ,       MVT::v4i64,   { 14, 18, 24, 44 } },
    { ISD::CTLZ,       MVT::v4i32,   {  5, 16, 19, 20 } },
    { ISD::CTLZ,       MVT::v8i32,   { 10, 16, 19, 34 } },
    { ISD::CTLZ,       MVT::v8i16,   {  4, 13, 14, 15 } },
    { ISD::CTLZ,       MVT::v16i16,  {  6, 14, 14, 24 } },
    { ISD::CTLZ,       MVT::v16i8,   {  3, 12,  9, 10 } },
    { ISD::CTLZ,       MVT::v32i8,   {  4, 12,  9, 14 } },
    { ISD::CTPOP,      MVT::v2i64,   {  3,  9, 10, 10 } },
    { ISD::CTPOP,      MVT::v4i64,   {  4,  9, 10, 14 } },
    { ISD::CTPOP,      MVT::v4i32,   {  7, 12, 14, 14 } },
    { ISD::CTPOP,      MVT::v8i32,   {  7, 12, 14, 18 } },
    { ISD::CTPOP,      MVT::v8i16,   {  3,  7, 11, 11 } },
    { ISD::CTPOP,      MVT::v16i16,  {  6,  8, 11, 18 } },
    { ISD::CTPOP,      MVT::v16i8,   {  2,  5,  8,  8 } },
    { ISD::CTPOP,      MVT::v32i8,   {  3,  5,  8, 12 } },
    { ISD::CTTZ,       MVT::v2i64,   {  4, 11, 13, 13 } },
    { ISD::CTTZ,       MVT::v4i64,   {  5, 11, 13, 20 } },
    { ISD::CTTZ,       MVT::v4i32,   {  7, 14, 17, 17 } },
    { ISD::CTTZ,       MVT::v8i32,   {  7, 15, 17, 24 } },
    { ISD::CTTZ,       MVT::v8i16,   {  4,  9, 14, 14 } },
    { ISD::CTTZ,       MVT::v16i16,  {  6,  9, 14, 24 } },
    { ISD::CTTZ,       MVT::v16i8,   {  3,  7, 11, 11 } },
    { ISD::CTTZ,       MVT::v32i8,   {  5,  7, 11, 18 } },
    { ISD::SADDSAT,    MVT::v16i16,  {  1 } },
    { ISD::SADDSAT,    MVT::v32i8,   {  1 } },
    { ISD::SMAX,       MVT::v2i64,   {  2,  7,  2,  3 } },
    { ISD::SMAX,       MVT::v4i64,   {  2,  7,  2,  3 } },
    { ISD::SMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
    { ISD::SMAX,       MVT::v16i16,  {  1,  1,  1,  2 } },
    { ISD::SMAX,       MVT::v32i8,   {  1,  1,  1,  2 } },
    { ISD::SMIN,       MVT::v2i64,   {  2,  7,  2,  3 } },
    { ISD::SMIN,       MVT::v4i64,   {  2,  7,  2,  3 } },
    { ISD::SMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
    { ISD::SMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
    { ISD::SMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
    { ISD::SSUBSAT,    MVT::v16i16,  {  1 } },
    { ISD::SSUBSAT,    MVT::v32i8,   {  1 } },
    { ISD::UADDSAT,    MVT::v16i16,  {  1 } },
    { ISD::UADDSAT,    MVT::v32i8,   {  1 } },
    { ISD::UADDSAT,    MVT::v8i32,   {  3 } }, // not + pminud + paddd
    { ISD::UMAX,       MVT::v2i64,   {  2,  8,  5,  6 } },
    { ISD::UMAX,       MVT::v4i64,   {  2,  8,  5,  8 } },
    { ISD::UMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
    { ISD::UMAX,       MVT::v16i16,  {  1,  1,  1,  2 } },
    { ISD::UMAX,       MVT::v32i8,   {  1,  1,  1,  2 } },
    { ISD::UMIN,       MVT::v2i64,   {  2,  8,  5,  6 } },
    { ISD::UMIN,       MVT::v4i64,   {  2,  8,  5,  8 } },
    { ISD::UMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
    { ISD::UMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
    { ISD::UMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
    { ISD::USUBSAT,    MVT::v16i16,  {  1 } },
    { ISD::USUBSAT,    MVT::v32i8,   {  1 } },
    { ISD::USUBSAT,    MVT::v8i32,   {  2 } }, // pmaxud + psubd
    { ISD::FMAXNUM,    MVT::f32,     {  2,  7,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
    { ISD::FMAXNUM,    MVT::v4f32,   {  2,  7,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM,    MVT::v8f32,   {  3,  7,  3,  6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM,    MVT::f64,     {  2,  7,  3,  5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
    { ISD::FMAXNUM,    MVT::v2f64,   {  2,  7,  3,  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FMAXNUM,    MVT::v4f64,   {  3,  7,  3,  6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FSQRT,      MVT::f32,     {  7, 15,  1,  1 } }, // vsqrtss
    { ISD::FSQRT,      MVT::v4f32,   {  7, 15,  1,  1 } }, // vsqrtps
    { ISD::FSQRT,      MVT::v8f32,   { 14, 21,  1,  3 } }, // vsqrtps
    { ISD::FSQRT,      MVT::f64,     { 14, 21,  1,  1 } }, // vsqrtsd
    { ISD::FSQRT,      MVT::v2f64,   { 14, 21,  1,  1 } }, // vsqrtpd
    { ISD::FSQRT,      MVT::v4f64,   { 28, 35,  1,  3 } }, // vsqrtpd
  };
  // AVX1 cost table. Per the row comments, 256-bit integer operations are
  // costed as two 128-bit operations plus extract/insert, so each 256-bit row
  // is listed next to the 128-bit row it is built from.
  // Each row is { ISD opcode, MVT, { costs } }. The FMAXNUM rows are also used
  // for minnum: the intrinsic switch in this function maps both maxnum and
  // minnum to ISD::FMAXNUM.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view); rows with a single value
  // specify only the first cost - confirm how the others are derived.
  static const CostKindTblEntry AVX1CostTbl[] = {
    { ISD::ABS,        MVT::v4i64,   {  6,  8,  6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS,        MVT::v8i32,   {  3,  6,  4,  5 } },
    { ISD::ABS,        MVT::v16i16,  {  3,  6,  4,  5 } },
    { ISD::ABS,        MVT::v32i8,   {  3,  6,  4,  5 } },
    { ISD::BITREVERSE, MVT::v4i64,   { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v2i64,   {  8, 13, 10, 16 } },
    { ISD::BITREVERSE, MVT::v8i32,   { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v4i32,   {  8, 13, 10, 16 } },
    { ISD::BITREVERSE, MVT::v16i16,  { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v8i16,   {  8, 13, 10, 16 } },
    { ISD::BITREVERSE, MVT::v32i8,   { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v16i8,   {  7,  7,  9, 13 } },
    { ISD::BSWAP,      MVT::v4i64,   {  5,  7,  5, 10 } },
    { ISD::BSWAP,      MVT::v2i64,   {  2,  3,  1,  3 } },
    { ISD::BSWAP,      MVT::v8i32,   {  5,  7,  5, 10 } },
    { ISD::BSWAP,      MVT::v4i32,   {  2,  3,  1,  3 } },
    { ISD::BSWAP,      MVT::v16i16,  {  5,  6,  5, 10 } },
    { ISD::BSWAP,      MVT::v8i16,   {  2,  2,  1,  3 } },
    { ISD::CTLZ,       MVT::v4i64,   { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v2i64,   { 14, 24, 24, 28 } },
    { ISD::CTLZ,       MVT::v8i32,   { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v4i32,   { 12, 20, 19, 23 } },
    { ISD::CTLZ,       MVT::v16i16,  { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v8i16,   {  9, 16, 14, 18 } },
    { ISD::CTLZ,       MVT::v32i8,   { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v16i8,   {  7, 12,  9, 13 } },
    { ISD::CTPOP,      MVT::v4i64,   { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v2i64,   {  7, 14, 10, 14 } },
    { ISD::CTPOP,      MVT::v8i32,   { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v4i32,   {  9, 20, 14, 18 } },
    { ISD::CTPOP,      MVT::v16i16,  { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v8i16,   {  8, 18, 11, 15 } },
    { ISD::CTPOP,      MVT::v32i8,   { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v16i8,   {  6, 12,  8, 12 } },
    { ISD::CTTZ,       MVT::v4i64,   { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v2i64,   {  9, 19, 13, 17 } },
    { ISD::CTTZ,       MVT::v8i32,   { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v4i32,   { 11, 24, 17, 21 } },
    { ISD::CTTZ,       MVT::v16i16,  { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v8i16,   {  9, 21, 14, 18 } },
    { ISD::CTTZ,       MVT::v32i8,   { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v16i8,   {  8, 16, 11, 15 } },
    { ISD::SADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  4 } },
    { ISD::SMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
    { ISD::SMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SSUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SSUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT,    MVT::v8i32,   {  8 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX,       MVT::v2i64,   {  4,  8,  5,  7 } },
    { ISD::UMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN,       MVT::v2i64,   {  4,  8,  5,  7 } },
    { ISD::UMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT,    MVT::v8i32,   {  6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::FMAXNUM,    MVT::f32,     {  3,  6,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
    { ISD::FMAXNUM,    MVT::v4f32,   {  3,  6,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM,    MVT::v8f32,   {  5,  7,  3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM,    MVT::f64,     {  3,  6,  3,  5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
    { ISD::FMAXNUM,    MVT::v2f64,   {  3,  6,  3,  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FMAXNUM,    MVT::v4f64,   {  5,  7,  3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FSQRT,      MVT::f32,     { 21, 21,  1,  1 } }, // vsqrtss
    { ISD::FSQRT,      MVT::v4f32,   { 21, 21,  1,  1 } }, // vsqrtps
    { ISD::FSQRT,      MVT::v8f32,   { 42, 42,  1,  3 } }, // vsqrtps
    { ISD::FSQRT,      MVT::f64,     { 27, 27,  1,  1 } }, // vsqrtsd
    { ISD::FSQRT,      MVT::v2f64,   { 27, 27,  1,  1 } }, // vsqrtpd
    { ISD::FSQRT,      MVT::v4f64,   { 54, 54,  1,  3 } }, // vsqrtpd
  };
  // GLM cost table: FSQRT only, for scalar and 128-bit f32/f64.
  // NOTE(review): "GLM" presumably denotes the Goldmont subtarget, and the
  // four cost values presumably follow the CostKindTblEntry field order -
  // both are defined outside this view; confirm.
  static const CostKindTblEntry GLMCostTbl[] = {
    { ISD::FSQRT,      MVT::f32,     { 19, 20, 1, 1 } }, // sqrtss
    { ISD::FSQRT,      MVT::v4f32,   { 37, 41, 1, 5 } }, // sqrtps
    { ISD::FSQRT,      MVT::f64,     { 34, 35, 1, 1 } }, // sqrtsd
    { ISD::FSQRT,      MVT::v2f64,   { 67, 71, 1, 5 } }, // sqrtpd
  };
  // SLM cost table: FSQRT only, for scalar and 128-bit f32/f64.
  // NOTE(review): "SLM" presumably denotes the Silvermont subtarget, and the
  // four cost values presumably follow the CostKindTblEntry field order -
  // both are defined outside this view; confirm.
  static const CostKindTblEntry SLMCostTbl[] = {
    { ISD::FSQRT,      MVT::f32,     { 20, 20, 1, 1 } }, // sqrtss
    { ISD::FSQRT,      MVT::v4f32,   { 40, 41, 1, 5 } }, // sqrtps
    { ISD::FSQRT,      MVT::f64,     { 35, 35, 1, 1 } }, // sqrtsd
    { ISD::FSQRT,      MVT::v2f64,   { 70, 71, 1, 5 } }, // sqrtpd
  };
  // SSE4.2 cost table: v4i32 unsigned saturating add/sub, FMAXNUM for
  // f32/f64 scalars and 128-bit vectors, and f32 FSQRT.
  // The FMAXNUM rows are also used for minnum: the intrinsic switch in this
  // function maps both maxnum and minnum to ISD::FMAXNUM.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view); rows with a single value
  // specify only the first cost - confirm how the others are derived.
  static const CostKindTblEntry SSE42CostTbl[] = {
    { ISD::USUBSAT,    MVT::v4i32,   {  2 } }, // pmaxud + psubd
    { ISD::UADDSAT,    MVT::v4i32,   {  3 } }, // not + pminud + paddd
    { ISD::FMAXNUM,    MVT::f32,     {  5,  5,  7,  7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
    { ISD::FMAXNUM,    MVT::v4f32,   {  4,  4,  4,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
    { ISD::FMAXNUM,    MVT::v2f64,   {  4,  4,  4,  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FSQRT,      MVT::f32,     { 18, 18,  1,  1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,   { 18, 18,  1,  1 } }, // Nehalem from http://www.agner.org/
  };
  // SSE4.1 cost table: v2i64 ABS and integer min/max on 128-bit vectors.
  // Signed min/max list v2i64/v4i32/v16i8 while unsigned min/max list
  // v2i64/v4i32/v8i16; the remaining element widths are not in this table.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view) - confirm before editing.
  static const CostKindTblEntry SSE41CostTbl[] = {
    { ISD::ABS,        MVT::v2i64,   {  3,  4,  3,  5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
    { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  3 } },
    { ISD::SMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
    { ISD::SMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
    { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
    { ISD::SMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
    { ISD::SMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
    { ISD::UMAX,       MVT::v2i64,   {  2, 11,  6,  7 } },
    { ISD::UMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
    { ISD::UMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
    { ISD::UMIN,       MVT::v2i64,   {  2, 11,  6,  7 } },
    { ISD::UMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
    { ISD::UMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
  };
  // SSSE3 cost table: ABS, BITREVERSE, BSWAP, CTLZ, CTPOP and CTTZ on
  // 128-bit integer vectors. Each row is { ISD opcode, MVT, { costs } }.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view) - confirm before editing.
  static const CostKindTblEntry SSSE3CostTbl[] = {
    { ISD::ABS,        MVT::v4i32,   {  1,  2,  1,  1 } },
    { ISD::ABS,        MVT::v8i16,   {  1,  2,  1,  1 } },
    { ISD::ABS,        MVT::v16i8,   {  1,  2,  1,  1 } },
    { ISD::BITREVERSE, MVT::v2i64,   { 16, 20, 11, 21 } },
    { ISD::BITREVERSE, MVT::v4i32,   { 16, 20, 11, 21 } },
    { ISD::BITREVERSE, MVT::v8i16,   { 16, 20, 11, 21 } },
    { ISD::BITREVERSE, MVT::v16i8,   { 11, 12, 10, 16 } },
    { ISD::BSWAP,      MVT::v2i64,   {  5,  5,  1,  5 } },
    { ISD::BSWAP,      MVT::v4i32,   {  5,  5,  1,  5 } },
    { ISD::BSWAP,      MVT::v8i16,   {  5,  5,  1,  5 } },
    { ISD::CTLZ,       MVT::v2i64,   { 18, 28, 28, 35 } },
    { ISD::CTLZ,       MVT::v4i32,   { 15, 20, 22, 28 } },
    { ISD::CTLZ,       MVT::v8i16,   { 13, 17, 16, 22 } },
    { ISD::CTLZ,       MVT::v16i8,   { 11, 15, 10, 16 } },
    { ISD::CTPOP,      MVT::v2i64,   { 13, 19, 12, 18 } },
    { ISD::CTPOP,      MVT::v4i32,   { 18, 24, 16, 22 } },
    { ISD::CTPOP,      MVT::v8i16,   { 13, 18, 14, 20 } },
    { ISD::CTPOP,      MVT::v16i8,   { 11, 12, 10, 16 } },
    { ISD::CTTZ,       MVT::v2i64,   { 13, 25, 15, 22 } },
    { ISD::CTTZ,       MVT::v4i32,   { 18, 26, 19, 25 } },
    { ISD::CTTZ,       MVT::v8i16,   { 13, 20, 17, 23 } },
    { ISD::CTTZ,       MVT::v16i8,   { 11, 16, 13, 19 } }
  };
  // SSE2 cost table: integer bit-manipulation, min/max and saturating
  // add/sub on 128-bit integer vectors, plus f64 FMAXNUM and FSQRT.
  // The FMAXNUM rows are also used for minnum: the intrinsic switch in this
  // function maps both maxnum and minnum to ISD::FMAXNUM.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view); rows with a single value
  // specify only the first cost - confirm how the others are derived.
  static const CostKindTblEntry SSE2CostTbl[] = {
    { ISD::ABS,        MVT::v2i64,   {  3,  6,  5,  5 } },
    { ISD::ABS,        MVT::v4i32,   {  1,  4,  4,  4 } },
    { ISD::ABS,        MVT::v8i16,   {  1,  2,  3,  3 } },
    { ISD::ABS,        MVT::v16i8,   {  1,  2,  3,  3 } },
    { ISD::BITREVERSE, MVT::v2i64,   { 16, 20, 32, 32 } },
    { ISD::BITREVERSE, MVT::v4i32,   { 16, 20, 30, 30 } },
    { ISD::BITREVERSE, MVT::v8i16,   { 16, 20, 25, 25 } },
    { ISD::BITREVERSE, MVT::v16i8,   { 11, 12, 21, 21 } },
    { ISD::BSWAP,      MVT::v2i64,   {  5,  6, 11, 11 } },
    { ISD::BSWAP,      MVT::v4i32,   {  5,  5,  9,  9 } },
    { ISD::BSWAP,      MVT::v8i16,   {  5,  5,  4,  5 } },
    { ISD::CTLZ,       MVT::v2i64,   { 10, 45, 36, 38 } },
    { ISD::CTLZ,       MVT::v4i32,   { 10, 45, 38, 40 } },
    { ISD::CTLZ,       MVT::v8i16,   {  9, 38, 32, 34 } },
    { ISD::CTLZ,       MVT::v16i8,   {  8, 39, 29, 32 } },
    { ISD::CTPOP,      MVT::v2i64,   { 12, 26, 16, 18 } },
    { ISD::CTPOP,      MVT::v4i32,   { 15, 29, 21, 23 } },
    { ISD::CTPOP,      MVT::v8i16,   { 13, 25, 18, 20 } },
    { ISD::CTPOP,      MVT::v16i8,   { 10, 21, 14, 16 } },
    { ISD::CTTZ,       MVT::v2i64,   { 14, 28, 19, 21 } },
    { ISD::CTTZ,       MVT::v4i32,   { 18, 31, 24, 26 } },
    { ISD::CTTZ,       MVT::v8i16,   { 16, 27, 21, 23 } },
    { ISD::CTTZ,       MVT::v16i8,   { 13, 23, 17, 19 } },
    { ISD::SADDSAT,    MVT::v8i16,   {  1 } },
    { ISD::SADDSAT,    MVT::v16i8,   {  1 } },
    { ISD::SMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
    { ISD::SMAX,       MVT::v4i32,   {  2,  4,  5,  5 } },
    { ISD::SMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
    { ISD::SMAX,       MVT::v16i8,   {  2,  4,  5,  5 } },
    { ISD::SMIN,       MVT::v2i64,   {  4,  8, 15, 15 } },
    { ISD::SMIN,       MVT::v4i32,   {  2,  4,  5,  5 } },
    { ISD::SMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
    { ISD::SMIN,       MVT::v16i8,   {  2,  4,  5,  5 } },
    { ISD::SSUBSAT,    MVT::v8i16,   {  1 } },
    { ISD::SSUBSAT,    MVT::v16i8,   {  1 } },
    { ISD::UADDSAT,    MVT::v8i16,   {  1 } },
    { ISD::UADDSAT,    MVT::v16i8,   {  1 } },
    { ISD::UMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
    { ISD::UMAX,       MVT::v4i32,   {  2,  5,  8,  8 } },
    { ISD::UMAX,       MVT::v8i16,   {  1,  3,  3,  3 } },
    { ISD::UMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
    { ISD::UMIN,       MVT::v2i64,   {  4,  8, 15, 15 } },
    { ISD::UMIN,       MVT::v4i32,   {  2,  5,  8,  8 } },
    { ISD::UMIN,       MVT::v8i16,   {  1,  3,  3,  3 } },
    { ISD::UMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
    { ISD::USUBSAT,    MVT::v8i16,   {  1 } },
    { ISD::USUBSAT,    MVT::v16i8,   {  1 } },
    { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } },
    { ISD::FMAXNUM,    MVT::v2f64,   {  4,  6,  6,  6 } },
    { ISD::FSQRT,      MVT::f64,     { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64,   { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
  };
  // SSE1 cost table: f32 scalar and v4f32 FMAXNUM and FSQRT only.
  // The FMAXNUM rows are also used for minnum: the intrinsic switch in this
  // function maps both maxnum and minnum to ISD::FMAXNUM.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view) - confirm before editing.
  static const CostKindTblEntry SSE1CostTbl[] = {
    { ISD::FMAXNUM,    MVT::f32,     {  5,  5,  7,  7 } },
    { ISD::FMAXNUM,    MVT::v4f32,   {  4,  6,  6,  6 } },
    { ISD::FSQRT,      MVT::f32,     { 28, 30,  1,  2 } }, // Pentium III from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,   { 56, 56,  1,  2 } }, // Pentium III from http://www.agner.org/
  };
  // BMI cost for 64-bit scalar CTTZ. Only a single cost value is given.
  // NOTE(review): presumably this models TZCNT - confirm against the lookup
  // code, which is outside this view.
  static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
    { ISD::CTTZ,       MVT::i64,     {  1 } },
  };
  // BMI costs for scalar CTTZ on i32/i16/i8. Only a single cost value is
  // given per row.
  // NOTE(review): presumably this models TZCNT - confirm against the lookup
  // code, which is outside this view.
  static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTTZ,       MVT::i32,     {  1 } },
    { ISD::CTTZ,       MVT::i16,     {  1 } },
    { ISD::CTTZ,       MVT::i8,      {  1 } },
  };
  // LZCNT cost for 64-bit scalar CTLZ. Only a single cost value is given.
  static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
    { ISD::CTLZ,       MVT::i64,     {  1 } },
  };
  // LZCNT costs for scalar CTLZ on i32/i16/i8. Only a single cost value is
  // given per row; the i16 and i8 rows cost 2 where the i32 row costs 1.
  // NOTE(review): the extra cost for narrow types is presumably for widening
  // the operand - the table itself does not say; confirm.
  static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTLZ,       MVT::i32,     {  1 } },
    { ISD::CTLZ,       MVT::i16,     {  2 } },
    { ISD::CTLZ,       MVT::i8,      {  2 } },
  };
  // POPCNT cost for 64-bit scalar CTPOP.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view) - confirm before editing.
  static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
    { ISD::CTPOP,      MVT::i64,     {  1, 1, 1, 1 } }, // popcnt
  };
  // POPCNT costs for scalar CTPOP on i32/i16/i8. Per the row comments, the
  // i16 and i8 forms zero-extend the operand before the popcnt.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view) - confirm before editing.
  static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTPOP,      MVT::i32,     {  1, 1, 1, 1 } }, // popcnt
    { ISD::CTPOP,      MVT::i16,     {  1, 1, 2, 2 } }, // popcnt(zext())
    { ISD::CTPOP,      MVT::i8,      {  1, 1, 2, 2 } }, // popcnt(zext())
  };
  // Baseline costs for i64 scalar intrinsics. Each row is
  // { opcode, MVT, { costs } }.
  // How the intrinsic switch in this function selects the rotate and
  // funnel-shift rows:
  //  - FSHL is used for both fshl and fshr.
  //  - ROTL/ROTR are used when the first two funnel-shift operands are the
  //    same value.
  //  - X86ISD::VROTLI is used when, additionally, the shift amount is a
  //    ConstantInt.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view); rows with a single value
  // specify only the first cost - confirm how the others are derived.
  static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ABS,        MVT::i64,     {  1,  2,  3,  4 } }, // SUB+CMOV
    { ISD::BITREVERSE, MVT::i64,     { 10, 12, 20, 22 } },
    { ISD::BSWAP,      MVT::i64,     {  1,  2,  1,  2 } },
    { ISD::CTLZ,       MVT::i64,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
    { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{  1,  1,  1,  1 } }, // BSR+XOR
    { ISD::CTTZ,       MVT::i64,     {  3 } }, // TEST+BSF+CMOV/BRANCH
    { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{  1,  1,  1,  1 } }, // BSF
    { ISD::CTPOP,      MVT::i64,     { 10,  6, 19, 19 } },
    { ISD::ROTL,       MVT::i64,     {  2, 3, 1, 3 } },
    { ISD::ROTR,       MVT::i64,     {  2, 3, 1, 3 } },
    { X86ISD::VROTLI,  MVT::i64,     {  1, 1, 1, 1 } },
    { ISD::FSHL,       MVT::i64,     {  4, 4, 1, 4 } },
    { ISD::SMAX,       MVT::i64,     {  1,  3,  2,  3 } },
    { ISD::SMIN,       MVT::i64,     {  1,  3,  2,  3 } },
    { ISD::UMAX,       MVT::i64,     {  1,  3,  2,  3 } },
    { ISD::UMIN,       MVT::i64,     {  1,  3,  2,  3 } },
    { ISD::SADDO,      MVT::i64,     {  1 } },
    { ISD::UADDO,      MVT::i64,     {  1 } },
    { ISD::UMULO,      MVT::i64,     {  2 } }, // mulq + seto
  };
  // Baseline costs for i32/i16/i8 scalar intrinsics. Each row is
  // { opcode, MVT, { costs } }.
  // How the intrinsic switch in this function selects the rotate and
  // funnel-shift rows:
  //  - FSHL is used for both fshl and fshr.
  //  - ROTL/ROTR are used when the first two funnel-shift operands are the
  //    same value.
  //  - X86ISD::VROTLI is used when, additionally, the shift amount is a
  //    ConstantInt.
  // There is no BSWAP row for i8.
  // NOTE(review): the four cost values presumably follow the field order of
  // CostKindTblEntry (declared outside this view); rows with a single value
  // specify only the first cost - confirm how the others are derived.
  static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ABS,        MVT::i32,     {  1,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
    { ISD::ABS,        MVT::i16,     {  2,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
    { ISD::ABS,        MVT::i8,      {  2,  4,  4,  4 } }, // SUB+XOR+SRA
    { ISD::BITREVERSE, MVT::i32,     {  9, 12, 17, 19 } },
    { ISD::BITREVERSE, MVT::i16,     {  9, 12, 17, 19 } },
    { ISD::BITREVERSE, MVT::i8,      {  7,  9, 13, 14 } },
    { ISD::BSWAP,      MVT::i32,     {  1,  1,  1,  1 } },
    { ISD::BSWAP,      MVT::i16,     {  1,  2,  1,  2 } }, // ROL
    { ISD::CTLZ,       MVT::i32,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
    { ISD::CTLZ,       MVT::i16,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
    { ISD::CTLZ,       MVT::i8,      {  4 } }, // BSR+XOR or BSR+XOR+CMOV
    { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{  1,  1,  1,  1 } }, // BSR+XOR
    { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{  2,  2,  3,  3 } }, // BSR+XOR
    { ISD::CTLZ_ZERO_UNDEF, MVT::i8, {  2,  2,  3,  3 } }, // BSR+XOR
    { ISD::CTTZ,       MVT::i32,     {  3 } }, // TEST+BSF+CMOV/BRANCH
    { ISD::CTTZ,       MVT::i16,     {  3 } }, // TEST+BSF+CMOV/BRANCH
    { ISD::CTTZ,       MVT::i8,      {  3 } }, // TEST+BSF+CMOV/BRANCH
    { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{  1,  1,  1,  1 } }, // BSF
    { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{  2,  2,  1,  1 } }, // BSF
    { ISD::CTTZ_ZERO_UNDEF, MVT::i8, {  2,  2,  1,  1 } }, // BSF
    { ISD::CTPOP,      MVT::i32,     {  8,  7, 15, 15 } },
    { ISD::CTPOP,      MVT::i16,     {  9,  8, 17, 17 } },
    { ISD::CTPOP,      MVT::i8,      {  7,  6, 13, 13 } },
    { ISD::ROTL,       MVT::i32,     {  2,  3,  1,  3 } },
    { ISD::ROTL,       MVT::i16,     {  2,  3,  1,  3 } },
    { ISD::ROTL,       MVT::i8,      {  2,  3,  1,  3 } },
    { ISD::ROTR,       MVT::i32,     {  2,  3,  1,  3 } },
    { ISD::ROTR,       MVT::i16,     {  2,  3,  1,  3 } },
    { ISD::ROTR,       MVT::i8,      {  2,  3,  1,  3 } },
    { X86ISD::VROTLI,  MVT::i32,     {  1,  1,  1,  1 } },
    { X86ISD::VROTLI,  MVT::i16,     {  1,  1,  1,  1 } },
    { X86ISD::VROTLI,  MVT::i8,      {  1,  1,  1,  1 } },
    { ISD::FSHL,       MVT::i32,     {  4,  4,  1,  4 } },
    { ISD::FSHL,       MVT::i16,     {  4,  4,  2,  5 } },
    { ISD::FSHL,       MVT::i8,      {  4,  4,  2,  5 } },
    { ISD::SMAX,       MVT::i32,     {  1,  2,  2,  3 } },
    { ISD::SMAX,       MVT::i16,     {  1,  4,  2,  4 } },
    { ISD::SMAX,       MVT::i8,      {  1,  4,  2,  4 } },
    { ISD::SMIN,       MVT::i32,     {  1,  2,  2,  3 } },
    { ISD::SMIN,       MVT::i16,     {  1,  4,  2,  4 } },
    { ISD::SMIN,       MVT::i8,      {  1,  4,  2,  4 } },
    { ISD::UMAX,       MVT::i32,     {  1,  2,  2,  3 } },
    { ISD::UMAX,       MVT::i16,     {  1,  4,  2,  4 } },
    { ISD::UMAX,       MVT::i8,      {  1,  4,  2,  4 } },
    { ISD::UMIN,       MVT::i32,     {  1,  2,  2,  3 } },
    { ISD::UMIN,       MVT::i16,     {  1,  4,  2,  4 } },
    { ISD::UMIN,       MVT::i8,      {  1,  4,  2,  4 } },
    { ISD::SADDO,      MVT::i32,     {  1 } },
    { ISD::SADDO,      MVT::i16,     {  1 } },
    { ISD::SADDO,      MVT::i8,      {  1 } },
    { ISD::UADDO,      MVT::i32,     {  1 } },
    { ISD::UADDO,      MVT::i16,     {  1 } },
    { ISD::UADDO,      MVT::i8,      {  1 } },
    { ISD::UMULO,      MVT::i32,     {  2 } }, // mul + seto
    { ISD::UMULO,      MVT::i16,     {  2 } },
    { ISD::UMULO,      MVT::i8,      {  2 } },
  };
4023 
4024   Type *RetTy = ICA.getReturnType();
4025   Type *OpTy = RetTy;
4026   Intrinsic::ID IID = ICA.getID();
4027   unsigned ISD = ISD::DELETED_NODE;
4028   switch (IID) {
4029   default:
4030     break;
4031   case Intrinsic::abs:
4032     ISD = ISD::ABS;
4033     break;
4034   case Intrinsic::bitreverse:
4035     ISD = ISD::BITREVERSE;
4036     break;
4037   case Intrinsic::bswap:
4038     ISD = ISD::BSWAP;
4039     break;
4040   case Intrinsic::ctlz:
4041     ISD = ISD::CTLZ;
4042     break;
4043   case Intrinsic::ctpop:
4044     ISD = ISD::CTPOP;
4045     break;
4046   case Intrinsic::cttz:
4047     ISD = ISD::CTTZ;
4048     break;
4049   case Intrinsic::fshl:
4050     ISD = ISD::FSHL;
4051     if (!ICA.isTypeBasedOnly()) {
4052       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4053       if (Args[0] == Args[1]) {
4054         ISD = ISD::ROTL;
4055         // Handle scalar constant rotation amounts.
4056         // TODO: Handle vector + funnel-shift cases.
4057         if (isa_and_nonnull<ConstantInt>(Args[2]))
4058           ISD = X86ISD::VROTLI;
4059       }
4060     }
4061     break;
4062   case Intrinsic::fshr:
4063     // FSHR has same costs so don't duplicate.
4064     ISD = ISD::FSHL;
4065     if (!ICA.isTypeBasedOnly()) {
4066       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4067       if (Args[0] == Args[1]) {
4068         // Handle scalar constant rotation amount.
4069         // TODO: Handle vector + funnel-shift cases.
4070         ISD = ISD::ROTR;
4071         if (isa_and_nonnull<ConstantInt>(Args[2]))
4072           ISD = X86ISD::VROTLI;
4073       }
4074     }
4075     break;
4076   case Intrinsic::maxnum:
4077   case Intrinsic::minnum:
4078     // FMINNUM has same costs so don't duplicate.
4079     ISD = ISD::FMAXNUM;
4080     break;
4081   case Intrinsic::sadd_sat:
4082     ISD = ISD::SADDSAT;
4083     break;
4084   case Intrinsic::smax:
4085     ISD = ISD::SMAX;
4086     break;
4087   case Intrinsic::smin:
4088     ISD = ISD::SMIN;
4089     break;
4090   case Intrinsic::ssub_sat:
4091     ISD = ISD::SSUBSAT;
4092     break;
4093   case Intrinsic::uadd_sat:
4094     ISD = ISD::UADDSAT;
4095     break;
4096   case Intrinsic::umax:
4097     ISD = ISD::UMAX;
4098     break;
4099   case Intrinsic::umin:
4100     ISD = ISD::UMIN;
4101     break;
4102   case Intrinsic::usub_sat:
4103     ISD = ISD::USUBSAT;
4104     break;
4105   case Intrinsic::sqrt:
4106     ISD = ISD::FSQRT;
4107     break;
4108   case Intrinsic::sadd_with_overflow:
4109   case Intrinsic::ssub_with_overflow:
4110     // SSUBO has same costs so don't duplicate.
4111     ISD = ISD::SADDO;
4112     OpTy = RetTy->getContainedType(0);
4113     break;
4114   case Intrinsic::uadd_with_overflow:
4115   case Intrinsic::usub_with_overflow:
4116     // USUBO has same costs so don't duplicate.
4117     ISD = ISD::UADDO;
4118     OpTy = RetTy->getContainedType(0);
4119     break;
4120   case Intrinsic::umul_with_overflow:
4121   case Intrinsic::smul_with_overflow:
4122     // SMULO has same costs so don't duplicate.
4123     ISD = ISD::UMULO;
4124     OpTy = RetTy->getContainedType(0);
4125     break;
4126   }
4127 
4128   if (ISD != ISD::DELETED_NODE) {
4129     // Legalize the type.
4130     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4131     MVT MTy = LT.second;
4132 
4133     // Attempt to lookup cost.
4134     if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
4135         MTy.isVector()) {
4136       // With PSHUFB the code is very similar for all types. If we have integer
4137       // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
4138       // we also need a PSHUFB.
4139       unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
4140 
4141       // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
4142       // instructions. We also need an extract and an insert.
4143       if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
4144             (ST->hasBWI() && MTy.is512BitVector())))
4145         Cost = Cost * 2 + 2;
4146 
4147       return LT.first * Cost;
4148     }
4149 
4150     // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4151     if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4152          (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4153         !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4154       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4155       if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4156         if (Cst->isAllOnesValue())
4157           ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4158     }
4159 
4160     // FSQRT is a single instruction.
4161     if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4162       return LT.first;
4163 
4164     auto adjustTableCost = [](int ISD, unsigned Cost,
4165                               InstructionCost LegalizationCost,
4166                               FastMathFlags FMF) {
4167       // If there are no NANs to deal with, then these are reduced to a
4168       // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4169       // assume is used in the non-fast case.
4170       if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4171         if (FMF.noNaNs())
4172           return LegalizationCost * 1;
4173       }
4174       return LegalizationCost * (int)Cost;
4175     };
4176 
4177     if (ST->useGLMDivSqrtCosts())
4178       if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4179         if (auto KindCost = Entry->Cost[CostKind])
4180           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4181                                  ICA.getFlags());
4182 
4183     if (ST->useSLMArithCosts())
4184       if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4185         if (auto KindCost = Entry->Cost[CostKind])
4186           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4187                                  ICA.getFlags());
4188 
4189     if (ST->hasVBMI2())
4190       if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4191         if (auto KindCost = Entry->Cost[CostKind])
4192           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4193                                  ICA.getFlags());
4194 
4195     if (ST->hasBITALG())
4196       if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4197         if (auto KindCost = Entry->Cost[CostKind])
4198           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4199                                  ICA.getFlags());
4200 
4201     if (ST->hasVPOPCNTDQ())
4202       if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4203         if (auto KindCost = Entry->Cost[CostKind])
4204           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4205                                  ICA.getFlags());
4206 
4207     if (ST->hasCDI())
4208       if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4209         if (auto KindCost = Entry->Cost[CostKind])
4210           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4211                                  ICA.getFlags());
4212 
4213     if (ST->hasBWI())
4214       if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4215         if (auto KindCost = Entry->Cost[CostKind])
4216           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4217                                  ICA.getFlags());
4218 
4219     if (ST->hasAVX512())
4220       if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4221         if (auto KindCost = Entry->Cost[CostKind])
4222           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4223                                  ICA.getFlags());
4224 
4225     if (ST->hasXOP())
4226       if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4227         if (auto KindCost = Entry->Cost[CostKind])
4228           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4229                                  ICA.getFlags());
4230 
4231     if (ST->hasAVX2())
4232       if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4233         if (auto KindCost = Entry->Cost[CostKind])
4234           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4235                                  ICA.getFlags());
4236 
4237     if (ST->hasAVX())
4238       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4239         if (auto KindCost = Entry->Cost[CostKind])
4240           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4241                                  ICA.getFlags());
4242 
4243     if (ST->hasSSE42())
4244       if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4245         if (auto KindCost = Entry->Cost[CostKind])
4246           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4247                                  ICA.getFlags());
4248 
4249     if (ST->hasSSE41())
4250       if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4251         if (auto KindCost = Entry->Cost[CostKind])
4252           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4253                                  ICA.getFlags());
4254 
4255     if (ST->hasSSSE3())
4256       if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4257         if (auto KindCost = Entry->Cost[CostKind])
4258           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4259                                  ICA.getFlags());
4260 
4261     if (ST->hasSSE2())
4262       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4263         if (auto KindCost = Entry->Cost[CostKind])
4264           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4265                                  ICA.getFlags());
4266 
4267     if (ST->hasSSE1())
4268       if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4269         if (auto KindCost = Entry->Cost[CostKind])
4270           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4271                                  ICA.getFlags());
4272 
4273     if (ST->hasBMI()) {
4274       if (ST->is64Bit())
4275         if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4276           if (auto KindCost = Entry->Cost[CostKind])
4277             return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4278                                    ICA.getFlags());
4279 
4280       if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4281         if (auto KindCost = Entry->Cost[CostKind])
4282           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4283                                  ICA.getFlags());
4284     }
4285 
4286     if (ST->hasLZCNT()) {
4287       if (ST->is64Bit())
4288         if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4289           if (auto KindCost = Entry->Cost[CostKind])
4290             return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4291                                    ICA.getFlags());
4292 
4293       if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4294         if (auto KindCost = Entry->Cost[CostKind])
4295           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4296                                  ICA.getFlags());
4297     }
4298 
4299     if (ST->hasPOPCNT()) {
4300       if (ST->is64Bit())
4301         if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4302           if (auto KindCost = Entry->Cost[CostKind])
4303             return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4304                                    ICA.getFlags());
4305 
4306       if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4307         if (auto KindCost = Entry->Cost[CostKind])
4308           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4309                                  ICA.getFlags());
4310     }
4311 
4312     if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4313       if (const Instruction *II = ICA.getInst()) {
4314         if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4315           return TTI::TCC_Free;
4316         if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4317           if (LI->hasOneUse())
4318             return TTI::TCC_Free;
4319         }
4320       }
4321     }
4322 
4323     if (ST->is64Bit())
4324       if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4325         if (auto KindCost = Entry->Cost[CostKind])
4326           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4327                                  ICA.getFlags());
4328 
4329     if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4330       if (auto KindCost = Entry->Cost[CostKind])
4331         return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4332   }
4333 
4334   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4335 }
4336 
4337 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4338                                                TTI::TargetCostKind CostKind,
4339                                                unsigned Index, Value *Op0,
4340                                                Value *Op1) {
4341   static const CostTblEntry SLMCostTbl[] = {
4342      { ISD::EXTRACT_VECTOR_ELT,       MVT::i8,      4 },
4343      { ISD::EXTRACT_VECTOR_ELT,       MVT::i16,     4 },
4344      { ISD::EXTRACT_VECTOR_ELT,       MVT::i32,     4 },
4345      { ISD::EXTRACT_VECTOR_ELT,       MVT::i64,     7 }
4346    };
4347 
4348   assert(Val->isVectorTy() && "This must be a vector type");
4349   Type *ScalarType = Val->getScalarType();
4350   InstructionCost RegisterFileMoveCost = 0;
4351 
4352   // Non-immediate extraction/insertion can be handled as a sequence of
4353   // aliased loads+stores via the stack.
4354   if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4355                        Opcode == Instruction::InsertElement)) {
4356     // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4357     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4358 
4359     // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4360     assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4361     Align VecAlign = DL.getPrefTypeAlign(Val);
4362     Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4363 
4364     // Extract - store vector to stack, load scalar.
4365     if (Opcode == Instruction::ExtractElement) {
4366       return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4367              getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4368                              CostKind);
4369     }
4370     // Insert - store vector to stack, store scalar, load vector.
4371     if (Opcode == Instruction::InsertElement) {
4372       return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4373              getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4374                              CostKind) +
4375              getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4376     }
4377   }
4378 
4379   if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4380                        Opcode == Instruction::InsertElement)) {
4381     // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
4382     if (Opcode == Instruction::ExtractElement &&
4383         ScalarType->getScalarSizeInBits() == 1 &&
4384         cast<FixedVectorType>(Val)->getNumElements() > 1)
4385       return 1;
4386 
4387     // Legalize the type.
4388     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4389 
4390     // This type is legalized to a scalar type.
4391     if (!LT.second.isVector())
4392       return 0;
4393 
4394     // The type may be split. Normalize the index to the new type.
4395     unsigned SizeInBits = LT.second.getSizeInBits();
4396     unsigned NumElts = LT.second.getVectorNumElements();
4397     unsigned SubNumElts = NumElts;
4398     Index = Index % NumElts;
4399 
4400     // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4401     // For inserts, we also need to insert the subvector back.
4402     if (SizeInBits > 128) {
4403       assert((SizeInBits % 128) == 0 && "Illegal vector");
4404       unsigned NumSubVecs = SizeInBits / 128;
4405       SubNumElts = NumElts / NumSubVecs;
4406       if (SubNumElts <= Index) {
4407         RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4408         Index %= SubNumElts;
4409       }
4410     }
4411 
4412     MVT MScalarTy = LT.second.getScalarType();
4413     auto IsCheapPInsrPExtrInsertPS = [&]() {
4414       // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4415       // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4416       return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4417              (MScalarTy.isInteger() && ST->hasSSE41()) ||
4418              (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4419               Opcode == Instruction::InsertElement);
4420     };
4421 
4422     if (Index == 0) {
4423       // Floating point scalars are already located in index #0.
4424       // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4425       // true for all.
4426       if (ScalarType->isFloatingPointTy() &&
4427           (Opcode != Instruction::InsertElement || !Op0 ||
4428            isa<UndefValue>(Op0)))
4429         return RegisterFileMoveCost;
4430 
4431       if (Opcode == Instruction::InsertElement &&
4432           isa_and_nonnull<UndefValue>(Op0)) {
4433         // Consider the gather cost to be cheap.
4434         if (isa_and_nonnull<LoadInst>(Op1))
4435           return RegisterFileMoveCost;
4436         if (!IsCheapPInsrPExtrInsertPS()) {
4437           // mov constant-to-GPR + movd/movq GPR -> XMM.
4438           if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4439             return 2 + RegisterFileMoveCost;
4440           // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4441           return 1 + RegisterFileMoveCost;
4442         }
4443       }
4444 
4445       // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4446       if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4447         return 1 + RegisterFileMoveCost;
4448     }
4449 
4450     int ISD = TLI->InstructionOpcodeToISD(Opcode);
4451     assert(ISD && "Unexpected vector opcode");
4452     if (ST->useSLMArithCosts())
4453       if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4454         return Entry->Cost + RegisterFileMoveCost;
4455 
4456     // Consider cheap cases.
4457     if (IsCheapPInsrPExtrInsertPS())
4458       return 1 + RegisterFileMoveCost;
4459 
4460     // For extractions we just need to shuffle the element to index 0, which
4461     // should be very cheap (assume cost = 1). For insertions we need to shuffle
4462     // the elements to its destination. In both cases we must handle the
4463     // subvector move(s).
4464     // If the vector type is already less than 128-bits then don't reduce it.
4465     // TODO: Under what circumstances should we shuffle using the full width?
4466     InstructionCost ShuffleCost = 1;
4467     if (Opcode == Instruction::InsertElement) {
4468       auto *SubTy = cast<VectorType>(Val);
4469       EVT VT = TLI->getValueType(DL, Val);
4470       if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4471         SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4472       ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4473                                    CostKind, 0, SubTy);
4474     }
4475     int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4476     return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4477   }
4478 
4479   return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4480          RegisterFileMoveCost;
4481 }
4482 
4483 InstructionCost
4484 X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4485                                      bool Insert, bool Extract,
4486                                      TTI::TargetCostKind CostKind) {
4487   assert(DemandedElts.getBitWidth() ==
4488              cast<FixedVectorType>(Ty)->getNumElements() &&
4489          "Vector size mismatch");
4490 
4491   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4492   MVT MScalarTy = LT.second.getScalarType();
4493   unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4494   InstructionCost Cost = 0;
4495 
4496   constexpr unsigned LaneBitWidth = 128;
4497   assert((LegalVectorBitWidth < LaneBitWidth ||
4498           (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4499          "Illegal vector");
4500 
4501   const int NumLegalVectors = *LT.first.getValue();
4502   assert(NumLegalVectors >= 0 && "Negative cost!");
4503 
4504   // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
4505   // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4506   if (Insert) {
4507     if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4508         (MScalarTy.isInteger() && ST->hasSSE41()) ||
4509         (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4510       // For types we can insert directly, insertion into 128-bit sub vectors is
4511       // cheap, followed by a cheap chain of concatenations.
4512       if (LegalVectorBitWidth <= LaneBitWidth) {
4513         Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4514                                                 /*Extract*/ false, CostKind);
4515       } else {
4516         // In each 128-lane, if at least one index is demanded but not all
4517         // indices are demanded and this 128-lane is not the first 128-lane of
4518         // the legalized-vector, then this 128-lane needs a extracti128; If in
4519         // each 128-lane, there is at least one demanded index, this 128-lane
4520         // needs a inserti128.
4521 
4522         // The following cases will help you build a better understanding:
4523         // Assume we insert several elements into a v8i32 vector in avx2,
4524         // Case#1: inserting into 1th index needs vpinsrd + inserti128.
4525         // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
4526         // inserti128.
4527         // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
4528         assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4529         unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4530         unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4531         unsigned NumLegalElts =
4532             LT.second.getVectorNumElements() * NumLegalVectors;
4533         assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4534                "Vector has been legalized to smaller element count");
4535         assert((NumLegalElts % NumLanesTotal) == 0 &&
4536                "Unexpected elts per lane");
4537         unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4538 
4539         APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4540         auto *LaneTy =
4541             FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4542 
4543         for (unsigned I = 0; I != NumLanesTotal; ++I) {
4544           APInt LaneEltMask = WidenedDemandedElts.extractBits(
4545               NumEltsPerLane, NumEltsPerLane * I);
4546           if (LaneEltMask.isZero())
4547             continue;
4548           // FIXME: we don't need to extract if all non-demanded elements
4549           //        are legalization-inserted padding.
4550           if (!LaneEltMask.isAllOnes())
4551             Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4552                                    CostKind, I * NumEltsPerLane, LaneTy);
4553           Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4554                                                   /*Extract*/ false, CostKind);
4555         }
4556 
4557         APInt AffectedLanes =
4558             APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4559         APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4560             AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4561         for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4562           for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4563             unsigned I = NumLegalLanes * LegalVec + Lane;
4564             // No need to insert unaffected lane; or lane 0 of each legal vector
4565             // iff ALL lanes of that vector were affected and will be inserted.
4566             if (!AffectedLanes[I] ||
4567                 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4568               continue;
4569             Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4570                                    CostKind, I * NumEltsPerLane, LaneTy);
4571           }
4572         }
4573       }
4574     } else if (LT.second.isVector()) {
4575       // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4576       // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4577       // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4578       // considered cheap.
4579       if (Ty->isIntOrIntVectorTy())
4580         Cost += DemandedElts.popcount();
4581 
4582       // Get the smaller of the legalized or original pow2-extended number of
4583       // vector elements, which represents the number of unpacks we'll end up
4584       // performing.
4585       unsigned NumElts = LT.second.getVectorNumElements();
4586       unsigned Pow2Elts =
4587           PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4588       Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4589     }
4590   }
4591 
4592   if (Extract) {
4593     // vXi1 can be efficiently extracted with MOVMSK.
4594     // TODO: AVX512 predicate mask handling.
4595     // NOTE: This doesn't work well for roundtrip scalarization.
4596     if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4597       unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4598       unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4599       unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4600       return MOVMSKCost;
4601     }
4602 
4603     if (LT.second.isVector()) {
4604       unsigned NumLegalElts =
4605           LT.second.getVectorNumElements() * NumLegalVectors;
4606       assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4607              "Vector has been legalized to smaller element count");
4608 
4609       // If we're extracting elements from a 128-bit subvector lane,
4610       // we only need to extract each lane once, not for every element.
4611       if (LegalVectorBitWidth > LaneBitWidth) {
4612         unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4613         unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4614         assert((NumLegalElts % NumLanesTotal) == 0 &&
4615                "Unexpected elts per lane");
4616         unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4617 
4618         // Add cost for each demanded 128-bit subvector extraction.
4619         // Luckily this is a lot easier than for insertion.
4620         APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4621         auto *LaneTy =
4622             FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4623 
4624         for (unsigned I = 0; I != NumLanesTotal; ++I) {
4625           APInt LaneEltMask = WidenedDemandedElts.extractBits(
4626               NumEltsPerLane, I * NumEltsPerLane);
4627           if (LaneEltMask.isZero())
4628             continue;
4629           Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4630                                  CostKind, I * NumEltsPerLane, LaneTy);
4631           Cost += BaseT::getScalarizationOverhead(
4632               LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4633         }
4634 
4635         return Cost;
4636       }
4637     }
4638 
4639     // Fallback to default extraction.
4640     Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4641                                             Extract, CostKind);
4642   }
4643 
4644   return Cost;
4645 }
4646 
4647 InstructionCost
4648 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4649                                       int VF, const APInt &DemandedDstElts,
4650                                       TTI::TargetCostKind CostKind) {
4651   const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4652   // We don't differentiate element types here, only element bit width.
4653   EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4654 
4655   auto bailout = [&]() {
4656     return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4657                                             DemandedDstElts, CostKind);
4658   };
4659 
4660   // For now, only deal with AVX512 cases.
4661   if (!ST->hasAVX512())
4662     return bailout();
4663 
4664   // Do we have a native shuffle for this element type, or should we promote?
4665   unsigned PromEltTyBits = EltTyBits;
4666   switch (EltTyBits) {
4667   case 32:
4668   case 64:
4669     break; // AVX512F.
4670   case 16:
4671     if (!ST->hasBWI())
4672       PromEltTyBits = 32; // promote to i32, AVX512F.
4673     break;                // AVX512BW
4674   case 8:
4675     if (!ST->hasVBMI())
4676       PromEltTyBits = 32; // promote to i32, AVX512F.
4677     break;                // AVX512VBMI
4678   case 1:
4679     // There is no support for shuffling i1 elements. We *must* promote.
4680     if (ST->hasBWI()) {
4681       if (ST->hasVBMI())
4682         PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4683       else
4684         PromEltTyBits = 16; // promote to i16, AVX512BW.
4685       break;
4686     }
4687     PromEltTyBits = 32; // promote to i32, AVX512F.
4688     break;
4689   default:
4690     return bailout();
4691   }
4692   auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4693 
4694   auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4695   auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4696 
4697   int NumDstElements = VF * ReplicationFactor;
4698   auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4699   auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4700 
4701   // Legalize the types.
4702   MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4703   MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4704   MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4705   MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4706   // They should have legalized into vector types.
4707   if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4708       !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4709     return bailout();
4710 
4711   if (PromEltTyBits != EltTyBits) {
4712     // If we have to perform the shuffle with wider elt type than our data type,
4713     // then we will first need to anyext (we don't care about the new bits)
4714     // the source elements, and then truncate Dst elements.
4715     InstructionCost PromotionCost;
4716     PromotionCost += getCastInstrCost(
4717         Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4718         TargetTransformInfo::CastContextHint::None, CostKind);
4719     PromotionCost +=
4720         getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4721                          /*Src=*/PromDstVecTy,
4722                          TargetTransformInfo::CastContextHint::None, CostKind);
4723     return PromotionCost + getReplicationShuffleCost(PromEltTy,
4724                                                      ReplicationFactor, VF,
4725                                                      DemandedDstElts, CostKind);
4726   }
4727 
4728   assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4729          LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4730          "We expect that the legalization doesn't affect the element width, "
4731          "doesn't coalesce/split elements.");
4732 
4733   unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4734   unsigned NumDstVectors =
4735       divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4736 
4737   auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4738 
4739   // Not all the produced Dst elements may be demanded. In our case,
4740   // given that a single Dst vector is formed by a single shuffle,
4741   // if all elements that will form a single Dst vector aren't demanded,
4742   // then we won't need to do that shuffle, so adjust the cost accordingly.
4743   APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4744       DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4745   unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4746 
4747   InstructionCost SingleShuffleCost = getShuffleCost(
4748       TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4749       /*Index=*/0, /*SubTp=*/nullptr);
4750   return NumDstVectorsDemanded * SingleShuffleCost;
4751 }
4752 
4753 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4754                                             MaybeAlign Alignment,
4755                                             unsigned AddressSpace,
4756                                             TTI::TargetCostKind CostKind,
4757                                             TTI::OperandValueInfo OpInfo,
4758                                             const Instruction *I) {
4759   // TODO: Handle other cost kinds.
4760   if (CostKind != TTI::TCK_RecipThroughput) {
4761     if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4762       // Store instruction with index and scale costs 2 Uops.
4763       // Check the preceding GEP to identify non-const indices.
4764       if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4765         if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4766           return TTI::TCC_Basic * 2;
4767       }
4768     }
4769     return TTI::TCC_Basic;
4770   }
4771 
4772   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4773          "Invalid Opcode");
4774   // Type legalization can't handle structs
4775   if (TLI->getValueType(DL, Src, true) == MVT::Other)
4776     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4777                                   CostKind);
4778 
4779   // Legalize the type.
4780   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4781 
4782   auto *VTy = dyn_cast<FixedVectorType>(Src);
4783 
4784   InstructionCost Cost = 0;
4785 
4786   // Add a cost for constant load to vector.
4787   if (Opcode == Instruction::Store && OpInfo.isConstant())
4788     Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4789                             /*AddressSpace=*/0, CostKind);
4790 
4791   // Handle the simple case of non-vectors.
4792   // NOTE: this assumes that legalization never creates vector from scalars!
4793   if (!VTy || !LT.second.isVector()) {
4794     // Each load/store unit costs 1.
4795     return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4796   }
4797 
4798   bool IsLoad = Opcode == Instruction::Load;
4799 
4800   Type *EltTy = VTy->getElementType();
4801 
4802   const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4803 
4804   // Source of truth: how many elements were there in the original IR vector?
4805   const unsigned SrcNumElt = VTy->getNumElements();
4806 
4807   // How far have we gotten?
4808   int NumEltRemaining = SrcNumElt;
4809   // Note that we intentionally capture by-reference, NumEltRemaining changes.
4810   auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4811 
4812   const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4813 
4814   // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4815   const unsigned XMMBits = 128;
4816   if (XMMBits % EltTyBits != 0)
4817     // Vector size must be a multiple of the element size. I.e. no padding.
4818     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4819                                   CostKind);
4820   const int NumEltPerXMM = XMMBits / EltTyBits;
4821 
4822   auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4823 
4824   for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4825        NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4826     // How many elements would a single op deal with at once?
4827     if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4828       // Vector size must be a multiple of the element size. I.e. no padding.
4829       return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4830                                     CostKind);
4831     int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4832 
4833     assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4834     assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4835             (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4836            "Unless we haven't halved the op size yet, "
4837            "we have less than two op's sized units of work left.");
4838 
4839     auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4840                           ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4841                           : XMMVecTy;
4842 
4843     assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4844            "After halving sizes, the vector elt count is no longer a multiple "
4845            "of number of elements per operation?");
4846     auto *CoalescedVecTy =
4847         CurrNumEltPerOp == 1
4848             ? CurrVecTy
4849             : FixedVectorType::get(
4850                   IntegerType::get(Src->getContext(),
4851                                    EltTyBits * CurrNumEltPerOp),
4852                   CurrVecTy->getNumElements() / CurrNumEltPerOp);
4853     assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4854                DL.getTypeSizeInBits(CurrVecTy) &&
4855            "coalesciing elements doesn't change vector width.");
4856 
4857     while (NumEltRemaining > 0) {
4858       assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
4859 
4860       // Can we use this vector size, as per the remaining element count?
4861       // Iff the vector is naturally aligned, we can do a wide load regardless.
4862       if (NumEltRemaining < CurrNumEltPerOp &&
4863           (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4864           CurrOpSizeBytes != 1)
4865         break; // Try smalled vector size.
4866 
4867       bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4868 
4869       // If we have fully processed the previous reg, we need to replenish it.
4870       if (SubVecEltsLeft == 0) {
4871         SubVecEltsLeft += CurrVecTy->getNumElements();
4872         // And that's free only for the 0'th subvector of a legalized vector.
4873         if (!Is0thSubVec)
4874           Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4875                                         : TTI::ShuffleKind::SK_ExtractSubvector,
4876                                  VTy, std::nullopt, CostKind, NumEltDone(),
4877                                  CurrVecTy);
4878       }
4879 
4880       // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4881       // for smaller widths (32/16/8) we have to insert/extract them separately.
4882       // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4883       // but let's pretend that it is also true for 16/8 bit wide ops...)
4884       if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4885         int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4886         assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4887         int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4888         APInt DemandedElts =
4889             APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4890                               CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4891         assert(DemandedElts.popcount() == 1 && "Inserting single value");
4892         Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4893                                          !IsLoad, CostKind);
4894       }
4895 
4896       // This isn't exactly right. We're using slow unaligned 32-byte accesses
4897       // as a proxy for a double-pumped AVX memory interface such as on
4898       // Sandybridge.
4899       // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
4900       // will be scalarized.
4901       if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4902         Cost += 2;
4903       else if (CurrOpSizeBytes < 4)
4904         Cost += 2;
4905       else
4906         Cost += 1;
4907 
4908       SubVecEltsLeft -= CurrNumEltPerOp;
4909       NumEltRemaining -= CurrNumEltPerOp;
4910       Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4911     }
4912   }
4913 
4914   assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4915 
4916   return Cost;
4917 }
4918 
4919 InstructionCost
4920 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
4921                                   unsigned AddressSpace,
4922                                   TTI::TargetCostKind CostKind) {
4923   bool IsLoad = (Instruction::Load == Opcode);
4924   bool IsStore = (Instruction::Store == Opcode);
4925 
4926   auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
4927   if (!SrcVTy)
4928     // To calculate scalar take the regular cost, without mask
4929     return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
4930 
4931   unsigned NumElem = SrcVTy->getNumElements();
4932   auto *MaskTy =
4933       FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
4934   if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
4935       (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
4936     // Scalarization
4937     APInt DemandedElts = APInt::getAllOnes(NumElem);
4938     InstructionCost MaskSplitCost = getScalarizationOverhead(
4939         MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
4940     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
4941         Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
4942         CmpInst::BAD_ICMP_PREDICATE, CostKind);
4943     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4944     InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
4945     InstructionCost ValueSplitCost = getScalarizationOverhead(
4946         SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
4947     InstructionCost MemopCost =
4948         NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4949                                          Alignment, AddressSpace, CostKind);
4950     return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
4951   }
4952 
4953   // Legalize the type.
4954   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
4955   auto VT = TLI->getValueType(DL, SrcVTy);
4956   InstructionCost Cost = 0;
4957   if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
4958       LT.second.getVectorNumElements() == NumElem)
4959     // Promotion requires extend/truncate for data and a shuffle for mask.
4960     Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
4961                            CostKind, 0, nullptr) +
4962             getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
4963                            CostKind, 0, nullptr);
4964 
4965   else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
4966     auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
4967                                            LT.second.getVectorNumElements());
4968     // Expanding requires fill mask with zeroes
4969     Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
4970                            CostKind, 0, MaskTy);
4971   }
4972 
4973   // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
4974   if (!ST->hasAVX512())
4975     return Cost + LT.first * (IsLoad ? 2 : 8);
4976 
4977   // AVX-512 masked load/store is cheaper
4978   return Cost + LT.first;
4979 }
4980 
4981 InstructionCost
4982 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
4983                                  const Value *Base,
4984                                  const TTI::PointersChainInfo &Info,
4985                                  Type *AccessTy, TTI::TargetCostKind CostKind) {
4986   if (Info.isSameBase() && Info.isKnownStride()) {
4987     // If all the pointers have known stride all the differences are translated
4988     // into constants. X86 memory addressing allows encoding it into
4989     // displacement. So we just need to take the base GEP cost.
4990     if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
4991       SmallVector<const Value *> Indices(BaseGEP->indices());
4992       return getGEPCost(BaseGEP->getSourceElementType(),
4993                         BaseGEP->getPointerOperand(), Indices, nullptr,
4994                         CostKind);
4995     }
4996     return TTI::TCC_Free;
4997   }
4998   return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
4999 }
5000 
5001 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5002                                                       ScalarEvolution *SE,
5003                                                       const SCEV *Ptr) {
5004   // Address computations in vectorized code with non-consecutive addresses will
5005   // likely result in more instructions compared to scalar code where the
5006   // computation can more often be merged into the index mode. The resulting
5007   // extra micro-ops can significantly decrease throughput.
5008   const unsigned NumVectorInstToHideOverhead = 10;
5009 
5010   // Cost modeling of Strided Access Computation is hidden by the indexing
5011   // modes of X86 regardless of the stride value. We dont believe that there
5012   // is a difference between constant strided access in gerenal and constant
5013   // strided value which is less than or equal to 64.
5014   // Even in the case of (loop invariant) stride whose value is not known at
5015   // compile time, the address computation will not incur more than one extra
5016   // ADD instruction.
5017   if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5018     // TODO: AVX2 is the current cut-off because we don't have correct
5019     //       interleaving costs for prior ISA's.
5020     if (!BaseT::isStridedAccess(Ptr))
5021       return NumVectorInstToHideOverhead;
5022     if (!BaseT::getConstantStrideStep(SE, Ptr))
5023       return 1;
5024   }
5025 
5026   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5027 }
5028 
5029 InstructionCost
5030 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5031                                        std::optional<FastMathFlags> FMF,
5032                                        TTI::TargetCostKind CostKind) {
5033   if (TTI::requiresOrderedReduction(FMF))
5034     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5035 
5036   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5037   // and make it as the cost.
5038 
5039   static const CostTblEntry SLMCostTbl[] = {
5040     { ISD::FADD,  MVT::v2f64,   3 },
5041     { ISD::ADD,   MVT::v2i64,   5 },
5042   };
5043 
5044   static const CostTblEntry SSE2CostTbl[] = {
5045     { ISD::FADD,  MVT::v2f64,   2 },
5046     { ISD::FADD,  MVT::v2f32,   2 },
5047     { ISD::FADD,  MVT::v4f32,   4 },
5048     { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
5049     { ISD::ADD,   MVT::v2i32,   2 }, // FIXME: chosen to be less than v4i32
5050     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
5051     { ISD::ADD,   MVT::v2i16,   2 },      // The data reported by the IACA tool is "4.3".
5052     { ISD::ADD,   MVT::v4i16,   3 },      // The data reported by the IACA tool is "4.3".
5053     { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
5054     { ISD::ADD,   MVT::v2i8,    2 },
5055     { ISD::ADD,   MVT::v4i8,    2 },
5056     { ISD::ADD,   MVT::v8i8,    2 },
5057     { ISD::ADD,   MVT::v16i8,   3 },
5058   };
5059 
5060   static const CostTblEntry AVX1CostTbl[] = {
5061     { ISD::FADD,  MVT::v4f64,   3 },
5062     { ISD::FADD,  MVT::v4f32,   3 },
5063     { ISD::FADD,  MVT::v8f32,   4 },
5064     { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
5065     { ISD::ADD,   MVT::v4i64,   3 },
5066     { ISD::ADD,   MVT::v8i32,   5 },
5067     { ISD::ADD,   MVT::v16i16,  5 },
5068     { ISD::ADD,   MVT::v32i8,   4 },
5069   };
5070 
5071   int ISD = TLI->InstructionOpcodeToISD(Opcode);
5072   assert(ISD && "Invalid opcode");
5073 
5074   // Before legalizing the type, give a chance to look up illegal narrow types
5075   // in the table.
5076   // FIXME: Is there a better way to do this?
5077   EVT VT = TLI->getValueType(DL, ValTy);
5078   if (VT.isSimple()) {
5079     MVT MTy = VT.getSimpleVT();
5080     if (ST->useSLMArithCosts())
5081       if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5082         return Entry->Cost;
5083 
5084     if (ST->hasAVX())
5085       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5086         return Entry->Cost;
5087 
5088     if (ST->hasSSE2())
5089       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5090         return Entry->Cost;
5091   }
5092 
5093   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5094 
5095   MVT MTy = LT.second;
5096 
5097   auto *ValVTy = cast<FixedVectorType>(ValTy);
5098 
5099   // Special case: vXi8 mul reductions are performed as vXi16.
5100   if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5101     auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5102     auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5103     return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5104                             TargetTransformInfo::CastContextHint::None,
5105                             CostKind) +
5106            getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5107   }
5108 
5109   InstructionCost ArithmeticCost = 0;
5110   if (LT.first != 1 && MTy.isVector() &&
5111       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5112     // Type needs to be split. We need LT.first - 1 arithmetic ops.
5113     auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5114                                             MTy.getVectorNumElements());
5115     ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5116     ArithmeticCost *= LT.first - 1;
5117   }
5118 
5119   if (ST->useSLMArithCosts())
5120     if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5121       return ArithmeticCost + Entry->Cost;
5122 
5123   if (ST->hasAVX())
5124     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5125       return ArithmeticCost + Entry->Cost;
5126 
5127   if (ST->hasSSE2())
5128     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5129       return ArithmeticCost + Entry->Cost;
5130 
5131   // FIXME: These assume a naive kshift+binop lowering, which is probably
5132   // conservative in most cases.
5133   static const CostTblEntry AVX512BoolReduction[] = {
5134     { ISD::AND,  MVT::v2i1,   3 },
5135     { ISD::AND,  MVT::v4i1,   5 },
5136     { ISD::AND,  MVT::v8i1,   7 },
5137     { ISD::AND,  MVT::v16i1,  9 },
5138     { ISD::AND,  MVT::v32i1, 11 },
5139     { ISD::AND,  MVT::v64i1, 13 },
5140     { ISD::OR,   MVT::v2i1,   3 },
5141     { ISD::OR,   MVT::v4i1,   5 },
5142     { ISD::OR,   MVT::v8i1,   7 },
5143     { ISD::OR,   MVT::v16i1,  9 },
5144     { ISD::OR,   MVT::v32i1, 11 },
5145     { ISD::OR,   MVT::v64i1, 13 },
5146   };
5147 
5148   static const CostTblEntry AVX2BoolReduction[] = {
5149     { ISD::AND,  MVT::v16i16,  2 }, // vpmovmskb + cmp
5150     { ISD::AND,  MVT::v32i8,   2 }, // vpmovmskb + cmp
5151     { ISD::OR,   MVT::v16i16,  2 }, // vpmovmskb + cmp
5152     { ISD::OR,   MVT::v32i8,   2 }, // vpmovmskb + cmp
5153   };
5154 
5155   static const CostTblEntry AVX1BoolReduction[] = {
5156     { ISD::AND,  MVT::v4i64,   2 }, // vmovmskpd + cmp
5157     { ISD::AND,  MVT::v8i32,   2 }, // vmovmskps + cmp
5158     { ISD::AND,  MVT::v16i16,  4 }, // vextractf128 + vpand + vpmovmskb + cmp
5159     { ISD::AND,  MVT::v32i8,   4 }, // vextractf128 + vpand + vpmovmskb + cmp
5160     { ISD::OR,   MVT::v4i64,   2 }, // vmovmskpd + cmp
5161     { ISD::OR,   MVT::v8i32,   2 }, // vmovmskps + cmp
5162     { ISD::OR,   MVT::v16i16,  4 }, // vextractf128 + vpor + vpmovmskb + cmp
5163     { ISD::OR,   MVT::v32i8,   4 }, // vextractf128 + vpor + vpmovmskb + cmp
5164   };
5165 
5166   static const CostTblEntry SSE2BoolReduction[] = {
5167     { ISD::AND,  MVT::v2i64,   2 }, // movmskpd + cmp
5168     { ISD::AND,  MVT::v4i32,   2 }, // movmskps + cmp
5169     { ISD::AND,  MVT::v8i16,   2 }, // pmovmskb + cmp
5170     { ISD::AND,  MVT::v16i8,   2 }, // pmovmskb + cmp
5171     { ISD::OR,   MVT::v2i64,   2 }, // movmskpd + cmp
5172     { ISD::OR,   MVT::v4i32,   2 }, // movmskps + cmp
5173     { ISD::OR,   MVT::v8i16,   2 }, // pmovmskb + cmp
5174     { ISD::OR,   MVT::v16i8,   2 }, // pmovmskb + cmp
5175   };
5176 
5177   // Handle bool allof/anyof patterns.
5178   if (ValVTy->getElementType()->isIntegerTy(1)) {
5179     InstructionCost ArithmeticCost = 0;
5180     if (LT.first != 1 && MTy.isVector() &&
5181         MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5182       // Type needs to be split. We need LT.first - 1 arithmetic ops.
5183       auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5184                                               MTy.getVectorNumElements());
5185       ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5186       ArithmeticCost *= LT.first - 1;
5187     }
5188 
5189     if (ST->hasAVX512())
5190       if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5191         return ArithmeticCost + Entry->Cost;
5192     if (ST->hasAVX2())
5193       if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5194         return ArithmeticCost + Entry->Cost;
5195     if (ST->hasAVX())
5196       if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5197         return ArithmeticCost + Entry->Cost;
5198     if (ST->hasSSE2())
5199       if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5200         return ArithmeticCost + Entry->Cost;
5201 
5202     return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5203   }
5204 
5205   unsigned NumVecElts = ValVTy->getNumElements();
5206   unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5207 
5208   // Special case power of 2 reductions where the scalar type isn't changed
5209   // by type legalization.
5210   if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5211     return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5212 
5213   InstructionCost ReductionCost = 0;
5214 
5215   auto *Ty = ValVTy;
5216   if (LT.first != 1 && MTy.isVector() &&
5217       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5218     // Type needs to be split. We need LT.first - 1 arithmetic ops.
5219     Ty = FixedVectorType::get(ValVTy->getElementType(),
5220                               MTy.getVectorNumElements());
5221     ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5222     ReductionCost *= LT.first - 1;
5223     NumVecElts = MTy.getVectorNumElements();
5224   }
5225 
5226   // Now handle reduction with the legal type, taking into account size changes
5227   // at each level.
5228   while (NumVecElts > 1) {
5229     // Determine the size of the remaining vector we need to reduce.
5230     unsigned Size = NumVecElts * ScalarSize;
5231     NumVecElts /= 2;
5232     // If we're reducing from 256/512 bits, use an extract_subvector.
5233     if (Size > 128) {
5234       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5235       ReductionCost +=
5236           getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5237                          NumVecElts, SubTy);
5238       Ty = SubTy;
5239     } else if (Size == 128) {
5240       // Reducing from 128 bits is a permute of v2f64/v2i64.
5241       FixedVectorType *ShufTy;
5242       if (ValVTy->isFloatingPointTy())
5243         ShufTy =
5244             FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5245       else
5246         ShufTy =
5247             FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5248       ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5249                                       std::nullopt, CostKind, 0, nullptr);
5250     } else if (Size == 64) {
5251       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5252       FixedVectorType *ShufTy;
5253       if (ValVTy->isFloatingPointTy())
5254         ShufTy =
5255             FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5256       else
5257         ShufTy =
5258             FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5259       ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5260                                       std::nullopt, CostKind, 0, nullptr);
5261     } else {
5262       // Reducing from smaller size is a shift by immediate.
5263       auto *ShiftTy = FixedVectorType::get(
5264           Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5265       ReductionCost += getArithmeticInstrCost(
5266           Instruction::LShr, ShiftTy, CostKind,
5267           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5268           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5269     }
5270 
5271     // Add the arithmetic op for this level.
5272     ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5273   }
5274 
5275   // Add the final extract element to the cost.
5276   return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5277                                             CostKind, 0, nullptr, nullptr);
5278 }
5279 
5280 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5281                                           TTI::TargetCostKind CostKind,
5282                                           FastMathFlags FMF) {
5283   IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5284   return getIntrinsicInstrCost(ICA, CostKind);
5285 }
5286 
5287 InstructionCost
5288 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5289                                    FastMathFlags FMF,
5290                                    TTI::TargetCostKind CostKind) {
5291   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5292 
5293   MVT MTy = LT.second;
5294 
5295   int ISD;
5296   if (ValTy->isIntOrIntVectorTy()) {
5297     ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5298                                                              : ISD::SMIN;
5299   } else {
5300     assert(ValTy->isFPOrFPVectorTy() &&
5301            "Expected float point or integer vector type.");
5302     ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5303               ? ISD::FMINNUM
5304               : ISD::FMINIMUM;
5305   }
5306 
5307   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5308   // and make it as the cost.
5309 
5310   static const CostTblEntry SSE2CostTbl[] = {
5311       {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5312       {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5313       {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5314   };
5315 
5316   static const CostTblEntry SSE41CostTbl[] = {
5317       {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5318       {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5319       {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5320       {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5321       {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5322       {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5323       {ISD::SMIN, MVT::v2i8,  3}, // pminsb
5324       {ISD::SMIN, MVT::v4i8,  5}, // pminsb
5325       {ISD::SMIN, MVT::v8i8,  7}, // pminsb
5326       {ISD::SMIN, MVT::v16i8, 6},
5327       {ISD::UMIN, MVT::v2i8,  3}, // same as sse2
5328       {ISD::UMIN, MVT::v4i8,  5}, // same as sse2
5329       {ISD::UMIN, MVT::v8i8,  7}, // same as sse2
5330       {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5331   };
5332 
5333   static const CostTblEntry AVX1CostTbl[] = {
5334       {ISD::SMIN, MVT::v16i16, 6},
5335       {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5336       {ISD::SMIN, MVT::v32i8, 8},
5337       {ISD::UMIN, MVT::v32i8, 8},
5338   };
5339 
5340   static const CostTblEntry AVX512BWCostTbl[] = {
5341       {ISD::SMIN, MVT::v32i16, 8},
5342       {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5343       {ISD::SMIN, MVT::v64i8, 10},
5344       {ISD::UMIN, MVT::v64i8, 10},
5345   };
5346 
5347   // Before legalizing the type, give a chance to look up illegal narrow types
5348   // in the table.
5349   // FIXME: Is there a better way to do this?
5350   EVT VT = TLI->getValueType(DL, ValTy);
5351   if (VT.isSimple()) {
5352     MVT MTy = VT.getSimpleVT();
5353     if (ST->hasBWI())
5354       if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5355         return Entry->Cost;
5356 
5357     if (ST->hasAVX())
5358       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5359         return Entry->Cost;
5360 
5361     if (ST->hasSSE41())
5362       if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5363         return Entry->Cost;
5364 
5365     if (ST->hasSSE2())
5366       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5367         return Entry->Cost;
5368   }
5369 
5370   auto *ValVTy = cast<FixedVectorType>(ValTy);
5371   unsigned NumVecElts = ValVTy->getNumElements();
5372 
5373   auto *Ty = ValVTy;
5374   InstructionCost MinMaxCost = 0;
5375   if (LT.first != 1 && MTy.isVector() &&
5376       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5377     // Type needs to be split. We need LT.first - 1 operations ops.
5378     Ty = FixedVectorType::get(ValVTy->getElementType(),
5379                               MTy.getVectorNumElements());
5380     MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5381     MinMaxCost *= LT.first - 1;
5382     NumVecElts = MTy.getVectorNumElements();
5383   }
5384 
5385   if (ST->hasBWI())
5386     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5387       return MinMaxCost + Entry->Cost;
5388 
5389   if (ST->hasAVX())
5390     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5391       return MinMaxCost + Entry->Cost;
5392 
5393   if (ST->hasSSE41())
5394     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5395       return MinMaxCost + Entry->Cost;
5396 
5397   if (ST->hasSSE2())
5398     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5399       return MinMaxCost + Entry->Cost;
5400 
5401   unsigned ScalarSize = ValTy->getScalarSizeInBits();
5402 
5403   // Special case power of 2 reductions where the scalar type isn't changed
5404   // by type legalization.
5405   if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5406       ScalarSize != MTy.getScalarSizeInBits())
5407     return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5408 
5409   // Now handle reduction with the legal type, taking into account size changes
5410   // at each level.
5411   while (NumVecElts > 1) {
5412     // Determine the size of the remaining vector we need to reduce.
5413     unsigned Size = NumVecElts * ScalarSize;
5414     NumVecElts /= 2;
5415     // If we're reducing from 256/512 bits, use an extract_subvector.
5416     if (Size > 128) {
5417       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5418       MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5419                                    CostKind, NumVecElts, SubTy);
5420       Ty = SubTy;
5421     } else if (Size == 128) {
5422       // Reducing from 128 bits is a permute of v2f64/v2i64.
5423       VectorType *ShufTy;
5424       if (ValTy->isFloatingPointTy())
5425         ShufTy =
5426             FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5427       else
5428         ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5429       MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5430                                    std::nullopt, CostKind, 0, nullptr);
5431     } else if (Size == 64) {
5432       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5433       FixedVectorType *ShufTy;
5434       if (ValTy->isFloatingPointTy())
5435         ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5436       else
5437         ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5438       MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5439                                    std::nullopt, CostKind, 0, nullptr);
5440     } else {
5441       // Reducing from smaller size is a shift by immediate.
5442       auto *ShiftTy = FixedVectorType::get(
5443           Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5444       MinMaxCost += getArithmeticInstrCost(
5445           Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5446           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5447           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5448     }
5449 
5450     // Add the arithmetic op for this level.
5451     MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5452   }
5453 
5454   // Add the final extract element to the cost.
5455   return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5456                                          CostKind, 0, nullptr, nullptr);
5457 }
5458 
5459 /// Calculate the cost of materializing a 64-bit value. This helper
5460 /// method might only calculate a fraction of a larger immediate. Therefore it
5461 /// is valid to return a cost of ZERO.
5462 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5463   if (Val == 0)
5464     return TTI::TCC_Free;
5465 
5466   if (isInt<32>(Val))
5467     return TTI::TCC_Basic;
5468 
5469   return 2 * TTI::TCC_Basic;
5470 }
5471 
5472 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5473                                           TTI::TargetCostKind CostKind) {
5474   assert(Ty->isIntegerTy());
5475 
5476   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5477   if (BitSize == 0)
5478     return ~0U;
5479 
5480   // Never hoist constants larger than 128bit, because this might lead to
5481   // incorrect code generation or assertions in codegen.
5482   // Fixme: Create a cost model for types larger than i128 once the codegen
5483   // issues have been fixed.
5484   if (BitSize > 128)
5485     return TTI::TCC_Free;
5486 
5487   if (Imm == 0)
5488     return TTI::TCC_Free;
5489 
5490   // Sign-extend all constants to a multiple of 64-bit.
5491   APInt ImmVal = Imm;
5492   if (BitSize % 64 != 0)
5493     ImmVal = Imm.sext(alignTo(BitSize, 64));
5494 
5495   // Split the constant into 64-bit chunks and calculate the cost for each
5496   // chunk.
5497   InstructionCost Cost = 0;
5498   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5499     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5500     int64_t Val = Tmp.getSExtValue();
5501     Cost += getIntImmCost(Val);
5502   }
5503   // We need at least one instruction to materialize the constant.
5504   return std::max<InstructionCost>(1, Cost);
5505 }
5506 
5507 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5508                                               const APInt &Imm, Type *Ty,
5509                                               TTI::TargetCostKind CostKind,
5510                                               Instruction *Inst) {
5511   assert(Ty->isIntegerTy());
5512 
5513   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5514   // There is no cost model for constants with a bit size of 0. Return TCC_Free
5515   // here, so that constant hoisting will ignore this constant.
5516   if (BitSize == 0)
5517     return TTI::TCC_Free;
5518 
5519   unsigned ImmIdx = ~0U;
5520   switch (Opcode) {
5521   default:
5522     return TTI::TCC_Free;
5523   case Instruction::GetElementPtr:
5524     // Always hoist the base address of a GetElementPtr. This prevents the
5525     // creation of new constants for every base constant that gets constant
5526     // folded with the offset.
5527     if (Idx == 0)
5528       return 2 * TTI::TCC_Basic;
5529     return TTI::TCC_Free;
5530   case Instruction::Store:
5531     ImmIdx = 0;
5532     break;
5533   case Instruction::ICmp:
5534     // This is an imperfect hack to prevent constant hoisting of
5535     // compares that might be trying to check if a 64-bit value fits in
5536     // 32-bits. The backend can optimize these cases using a right shift by 32.
5537     // Ideally we would check the compare predicate here. There also other
5538     // similar immediates the backend can use shifts for.
5539     if (Idx == 1 && Imm.getBitWidth() == 64) {
5540       uint64_t ImmVal = Imm.getZExtValue();
5541       if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5542         return TTI::TCC_Free;
5543     }
5544     ImmIdx = 1;
5545     break;
5546   case Instruction::And:
5547     // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5548     // by using a 32-bit operation with implicit zero extension. Detect such
5549     // immediates here as the normal path expects bit 31 to be sign extended.
5550     if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5551       return TTI::TCC_Free;
5552     ImmIdx = 1;
5553     break;
5554   case Instruction::Add:
5555   case Instruction::Sub:
5556     // For add/sub, we can use the opposite instruction for INT32_MIN.
5557     if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5558       return TTI::TCC_Free;
5559     ImmIdx = 1;
5560     break;
5561   case Instruction::UDiv:
5562   case Instruction::SDiv:
5563   case Instruction::URem:
5564   case Instruction::SRem:
5565     // Division by constant is typically expanded later into a different
5566     // instruction sequence. This completely changes the constants.
5567     // Report them as "free" to stop ConstantHoist from marking them as opaque.
5568     return TTI::TCC_Free;
5569   case Instruction::Mul:
5570   case Instruction::Or:
5571   case Instruction::Xor:
5572     ImmIdx = 1;
5573     break;
5574   // Always return TCC_Free for the shift value of a shift instruction.
5575   case Instruction::Shl:
5576   case Instruction::LShr:
5577   case Instruction::AShr:
5578     if (Idx == 1)
5579       return TTI::TCC_Free;
5580     break;
5581   case Instruction::Trunc:
5582   case Instruction::ZExt:
5583   case Instruction::SExt:
5584   case Instruction::IntToPtr:
5585   case Instruction::PtrToInt:
5586   case Instruction::BitCast:
5587   case Instruction::PHI:
5588   case Instruction::Call:
5589   case Instruction::Select:
5590   case Instruction::Ret:
5591   case Instruction::Load:
5592     break;
5593   }
5594 
5595   if (Idx == ImmIdx) {
5596     uint64_t NumConstants = divideCeil(BitSize, 64);
5597     InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5598     return (Cost <= NumConstants * TTI::TCC_Basic)
5599                ? static_cast<int>(TTI::TCC_Free)
5600                : Cost;
5601   }
5602 
5603   return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5604 }
5605 
5606 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5607                                                 const APInt &Imm, Type *Ty,
5608                                                 TTI::TargetCostKind CostKind) {
5609   assert(Ty->isIntegerTy());
5610 
5611   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5612   // There is no cost model for constants with a bit size of 0. Return TCC_Free
5613   // here, so that constant hoisting will ignore this constant.
5614   if (BitSize == 0)
5615     return TTI::TCC_Free;
5616 
5617   switch (IID) {
5618   default:
5619     return TTI::TCC_Free;
5620   case Intrinsic::sadd_with_overflow:
5621   case Intrinsic::uadd_with_overflow:
5622   case Intrinsic::ssub_with_overflow:
5623   case Intrinsic::usub_with_overflow:
5624   case Intrinsic::smul_with_overflow:
5625   case Intrinsic::umul_with_overflow:
5626     if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5627       return TTI::TCC_Free;
5628     break;
5629   case Intrinsic::experimental_stackmap:
5630     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5631       return TTI::TCC_Free;
5632     break;
5633   case Intrinsic::experimental_patchpoint_void:
5634   case Intrinsic::experimental_patchpoint_i64:
5635     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5636       return TTI::TCC_Free;
5637     break;
5638   }
5639   return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5640 }
5641 
5642 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5643                                            TTI::TargetCostKind CostKind,
5644                                            const Instruction *I) {
5645   if (CostKind != TTI::TCK_RecipThroughput)
5646     return Opcode == Instruction::PHI ? 0 : 1;
5647   // Branches are assumed to be predicted.
5648   return 0;
5649 }
5650 
5651 int X86TTIImpl::getGatherOverhead() const {
5652   // Some CPUs have more overhead for gather. The specified overhead is relative
5653   // to the Load operation. "2" is the number provided by Intel architects. This
5654   // parameter is used for cost estimation of Gather Op and comparison with
5655   // other alternatives.
5656   // TODO: Remove the explicit hasAVX512()?, That would mean we would only
5657   // enable gather with a -march.
5658   if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5659     return 2;
5660 
5661   return 1024;
5662 }
5663 
5664 int X86TTIImpl::getScatterOverhead() const {
5665   if (ST->hasAVX512())
5666     return 2;
5667 
5668   return 1024;
5669 }
5670 
5671 // Return an average cost of Gather / Scatter instruction, maybe improved later.
5672 // FIXME: Add TargetCostKind support.
5673 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
5674                                             const Value *Ptr, Align Alignment,
5675                                             unsigned AddressSpace) {
5676 
5677   assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5678   unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5679 
5680   // Try to reduce index size from 64 bit (default for GEP)
5681   // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5682   // operation will use 16 x 64 indices which do not fit in a zmm and needs
5683   // to split. Also check that the base pointer is the same for all lanes,
5684   // and that there's at most one variable index.
5685   auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5686     unsigned IndexSize = DL.getPointerSizeInBits();
5687     const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5688     if (IndexSize < 64 || !GEP)
5689       return IndexSize;
5690 
5691     unsigned NumOfVarIndices = 0;
5692     const Value *Ptrs = GEP->getPointerOperand();
5693     if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5694       return IndexSize;
5695     for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5696       if (isa<Constant>(GEP->getOperand(I)))
5697         continue;
5698       Type *IndxTy = GEP->getOperand(I)->getType();
5699       if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5700         IndxTy = IndexVTy->getElementType();
5701       if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5702            !isa<SExtInst>(GEP->getOperand(I))) ||
5703           ++NumOfVarIndices > 1)
5704         return IndexSize; // 64
5705     }
5706     return (unsigned)32;
5707   };
5708 
5709   // Trying to reduce IndexSize to 32 bits for vector 16.
5710   // By default the IndexSize is equal to pointer size.
5711   unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5712                            ? getIndexSizeInBits(Ptr, DL)
5713                            : DL.getPointerSizeInBits();
5714 
5715   auto *IndexVTy = FixedVectorType::get(
5716       IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5717   std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5718   std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5719   InstructionCost::CostType SplitFactor =
5720       *std::max(IdxsLT.first, SrcLT.first).getValue();
5721   if (SplitFactor > 1) {
5722     // Handle splitting of vector of pointers
5723     auto *SplitSrcTy =
5724         FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5725     return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
5726                                          AddressSpace);
5727   }
5728 
5729   // The gather / scatter cost is given by Intel architects. It is a rough
5730   // number since we are looking at one instruction in a time.
5731   const int GSOverhead = (Opcode == Instruction::Load)
5732                              ? getGatherOverhead()
5733                              : getScatterOverhead();
5734   return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5735                                            MaybeAlign(Alignment), AddressSpace,
5736                                            TTI::TCK_RecipThroughput);
5737 }
5738 
5739 /// Return the cost of full scalarization of gather / scatter operation.
5740 ///
5741 /// Opcode - Load or Store instruction.
5742 /// SrcVTy - The type of the data vector that should be gathered or scattered.
5743 /// VariableMask - The mask is non-constant at compile time.
5744 /// Alignment - Alignment for one element.
5745 /// AddressSpace - pointer[s] address space.
5746 ///
5747 /// FIXME: Add TargetCostKind support.
5748 InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
5749                                             bool VariableMask, Align Alignment,
5750                                             unsigned AddressSpace) {
5751   Type *ScalarTy = SrcVTy->getScalarType();
5752   unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5753   APInt DemandedElts = APInt::getAllOnes(VF);
5754   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5755 
5756   InstructionCost MaskUnpackCost = 0;
5757   if (VariableMask) {
5758     auto *MaskTy =
5759         FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5760     MaskUnpackCost = getScalarizationOverhead(
5761         MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5762     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5763         Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5764         CmpInst::BAD_ICMP_PREDICATE, CostKind);
5765     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5766     MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5767   }
5768 
5769   InstructionCost AddressUnpackCost = getScalarizationOverhead(
5770       FixedVectorType::get(PointerType::getUnqual(ScalarTy->getContext()), VF),
5771       DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5772 
5773   // The cost of the scalar loads/stores.
5774   InstructionCost MemoryOpCost =
5775       VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5776                            AddressSpace, CostKind);
5777 
5778   // The cost of forming the vector from loaded scalars/
5779   // scalarizing the vector to perform scalar stores.
5780   InstructionCost InsertExtractCost = getScalarizationOverhead(
5781       cast<FixedVectorType>(SrcVTy), DemandedElts,
5782       /*Insert=*/Opcode == Instruction::Load,
5783       /*Extract=*/Opcode == Instruction::Store, CostKind);
5784 
5785   return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5786 }
5787 
5788 /// Calculate the cost of Gather / Scatter operation
5789 InstructionCost X86TTIImpl::getGatherScatterOpCost(
5790     unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5791     Align Alignment, TTI::TargetCostKind CostKind,
5792     const Instruction *I = nullptr) {
5793   if (CostKind != TTI::TCK_RecipThroughput) {
5794     if ((Opcode == Instruction::Load &&
5795          isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5796          !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5797                                      Align(Alignment))) ||
5798         (Opcode == Instruction::Store &&
5799          isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5800          !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5801                                       Align(Alignment))))
5802       return 1;
5803     return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5804                                          Alignment, CostKind, I);
5805   }
5806 
5807   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5808   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5809   if (!PtrTy && Ptr->getType()->isVectorTy())
5810     PtrTy = dyn_cast<PointerType>(
5811         cast<VectorType>(Ptr->getType())->getElementType());
5812   assert(PtrTy && "Unexpected type for Ptr argument");
5813   unsigned AddressSpace = PtrTy->getAddressSpace();
5814 
5815   if ((Opcode == Instruction::Load &&
5816        (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5817         forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5818                                    Align(Alignment)))) ||
5819       (Opcode == Instruction::Store &&
5820        (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5821         forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5822                                     Align(Alignment)))))
5823     return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
5824                            AddressSpace);
5825 
5826   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
5827 }
5828 
5829 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5830                                const TargetTransformInfo::LSRCost &C2) {
5831     // X86 specific here are "instruction number 1st priority".
5832     return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5833                     C1.NumIVMuls, C1.NumBaseAdds,
5834                     C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5835            std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5836                     C2.NumIVMuls, C2.NumBaseAdds,
5837                     C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5838 }
5839 
5840 bool X86TTIImpl::canMacroFuseCmp() {
5841   return ST->hasMacroFusion() || ST->hasBranchFusion();
5842 }
5843 
5844 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5845   if (!ST->hasAVX())
5846     return false;
5847 
5848   // The backend can't handle a single element vector.
5849   if (isa<VectorType>(DataTy) &&
5850       cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5851     return false;
5852   Type *ScalarTy = DataTy->getScalarType();
5853 
5854   if (ScalarTy->isPointerTy())
5855     return true;
5856 
5857   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5858     return true;
5859 
5860   if (ScalarTy->isHalfTy() && ST->hasBWI())
5861     return true;
5862 
5863   if (ScalarTy->isBFloatTy() && ST->hasBF16())
5864     return true;
5865 
5866   if (!ScalarTy->isIntegerTy())
5867     return false;
5868 
5869   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5870   return IntWidth == 32 || IntWidth == 64 ||
5871          ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5872 }
5873 
5874 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5875   return isLegalMaskedLoad(DataType, Alignment);
5876 }
5877 
5878 bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5879   unsigned DataSize = DL.getTypeStoreSize(DataType);
5880   // The only supported nontemporal loads are for aligned vectors of 16 or 32
5881   // bytes.  Note that 32-byte nontemporal vector loads are supported by AVX2
5882   // (the equivalent stores only require AVX).
5883   if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5884     return DataSize == 16 ?  ST->hasSSE1() : ST->hasAVX2();
5885 
5886   return false;
5887 }
5888 
5889 bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5890   unsigned DataSize = DL.getTypeStoreSize(DataType);
5891 
5892   // SSE4A supports nontemporal stores of float and double at arbitrary
5893   // alignment.
5894   if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5895     return true;
5896 
5897   // Besides the SSE4A subtarget exception above, only aligned stores are
5898   // available nontemporaly on any other subtarget.  And only stores with a size
5899   // of 4..32 bytes (powers of 2, only) are permitted.
5900   if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5901       !isPowerOf2_32(DataSize))
5902     return false;
5903 
5904   // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5905   // loads require AVX2).
5906   if (DataSize == 32)
5907     return ST->hasAVX();
5908   if (DataSize == 16)
5909     return ST->hasSSE1();
5910   return true;
5911 }
5912 
5913 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5914                                       ElementCount NumElements) const {
5915   // movddup
5916   return ST->hasSSE3() && !NumElements.isScalable() &&
5917          NumElements.getFixedValue() == 2 &&
5918          ElementTy == Type::getDoubleTy(ElementTy->getContext());
5919 }
5920 
5921 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
5922   if (!isa<VectorType>(DataTy))
5923     return false;
5924 
5925   if (!ST->hasAVX512())
5926     return false;
5927 
5928   // The backend can't handle a single element vector.
5929   if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5930     return false;
5931 
5932   Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5933 
5934   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5935     return true;
5936 
5937   if (!ScalarTy->isIntegerTy())
5938     return false;
5939 
5940   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5941   return IntWidth == 32 || IntWidth == 64 ||
5942          ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
5943 }
5944 
5945 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
5946   return isLegalMaskedExpandLoad(DataTy);
5947 }
5948 
5949 bool X86TTIImpl::supportsGather() const {
5950   // Some CPUs have better gather performance than others.
5951   // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
5952   // enable gather with a -march.
5953   return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
5954 }
5955 
5956 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
5957   // Gather / Scatter for vector 2 is not profitable on KNL / SKX
5958   // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
5959   // it to 8 elements, but zeroing upper bits of the mask vector will add more
5960   // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
5961   // Check, maybe the gather/scatter instruction is better in the VariableMask
5962   // case.
5963   unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
5964   return NumElts == 1 ||
5965          (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
5966 }
5967 
5968 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
5969   Type *ScalarTy = DataTy->getScalarType();
5970   if (ScalarTy->isPointerTy())
5971     return true;
5972 
5973   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5974     return true;
5975 
5976   if (!ScalarTy->isIntegerTy())
5977     return false;
5978 
5979   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5980   return IntWidth == 32 || IntWidth == 64;
5981 }
5982 
5983 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
5984   if (!supportsGather() || !ST->preferGather())
5985     return false;
5986   return isLegalMaskedGatherScatter(DataTy, Alignment);
5987 }
5988 
5989 bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
5990                                  unsigned Opcode1,
5991                                  const SmallBitVector &OpcodeMask) const {
5992   // ADDSUBPS  4xf32 SSE3
5993   // VADDSUBPS 4xf32 AVX
5994   // VADDSUBPS 8xf32 AVX2
5995   // ADDSUBPD  2xf64 SSE3
5996   // VADDSUBPD 2xf64 AVX
5997   // VADDSUBPD 4xf64 AVX2
5998 
5999   unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6000   assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6001   if (!isPowerOf2_32(NumElements))
6002     return false;
6003   // Check the opcode pattern. We apply the mask on the opcode arguments and
6004   // then check if it is what we expect.
6005   for (int Lane : seq<int>(0, NumElements)) {
6006     unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6007     // We expect FSub for even lanes and FAdd for odd lanes.
6008     if (Lane % 2 == 0 && Opc != Instruction::FSub)
6009       return false;
6010     if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6011       return false;
6012   }
6013   // Now check that the pattern is supported by the target ISA.
6014   Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6015   if (ElemTy->isFloatTy())
6016     return ST->hasSSE3() && NumElements % 4 == 0;
6017   if (ElemTy->isDoubleTy())
6018     return ST->hasSSE3() && NumElements % 2 == 0;
6019   return false;
6020 }
6021 
6022 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6023   // AVX2 doesn't support scatter
6024   if (!ST->hasAVX512() || !ST->preferScatter())
6025     return false;
6026   return isLegalMaskedGatherScatter(DataType, Alignment);
6027 }
6028 
6029 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6030   EVT VT = TLI->getValueType(DL, DataType);
6031   return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6032 }
6033 
6034 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) {
6035   // FDIV is always expensive, even if it has a very low uop count.
6036   // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6037   if (I->getOpcode() == Instruction::FDiv)
6038     return true;
6039 
6040   return BaseT::isExpensiveToSpeculativelyExecute(I);
6041 }
6042 
/// Report whether an ordered floating-point compare is cheaper than a compare
/// against zero. This implementation answers false for every \p Ty.
bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
  return false;
}
6046 
6047 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6048                                      const Function *Callee) const {
6049   const TargetMachine &TM = getTLI()->getTargetMachine();
6050 
6051   // Work this as a subsetting of subtarget features.
6052   const FeatureBitset &CallerBits =
6053       TM.getSubtargetImpl(*Caller)->getFeatureBits();
6054   const FeatureBitset &CalleeBits =
6055       TM.getSubtargetImpl(*Callee)->getFeatureBits();
6056 
6057   // Check whether features are the same (apart from the ignore list).
6058   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6059   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6060   if (RealCallerBits == RealCalleeBits)
6061     return true;
6062 
6063   // If the features are a subset, we need to additionally check for calls
6064   // that may become ABI-incompatible as a result of inlining.
6065   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6066     return false;
6067 
6068   for (const Instruction &I : instructions(Callee)) {
6069     if (const auto *CB = dyn_cast<CallBase>(&I)) {
6070       SmallVector<Type *, 8> Types;
6071       for (Value *Arg : CB->args())
6072         Types.push_back(Arg->getType());
6073       if (!CB->getType()->isVoidTy())
6074         Types.push_back(CB->getType());
6075 
6076       // Simple types are always ABI compatible.
6077       auto IsSimpleTy = [](Type *Ty) {
6078         return !Ty->isVectorTy() && !Ty->isAggregateType();
6079       };
6080       if (all_of(Types, IsSimpleTy))
6081         continue;
6082 
6083       if (Function *NestedCallee = CB->getCalledFunction()) {
6084         // Assume that intrinsics are always ABI compatible.
6085         if (NestedCallee->isIntrinsic())
6086           continue;
6087 
6088         // Do a precise compatibility check.
6089         if (!areTypesABICompatible(Caller, NestedCallee, Types))
6090           return false;
6091       } else {
6092         // We don't know the target features of the callee,
6093         // assume it is incompatible.
6094         return false;
6095       }
6096     }
6097   }
6098   return true;
6099 }
6100 
6101 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6102                                        const Function *Callee,
6103                                        const ArrayRef<Type *> &Types) const {
6104   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6105     return false;
6106 
6107   // If we get here, we know the target features match. If one function
6108   // considers 512-bit vectors legal and the other does not, consider them
6109   // incompatible.
6110   const TargetMachine &TM = getTLI()->getTargetMachine();
6111 
6112   if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6113       TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6114     return true;
6115 
6116   // Consider the arguments compatible if they aren't vectors or aggregates.
6117   // FIXME: Look at the size of vectors.
6118   // FIXME: Look at the element types of aggregates to see if there are vectors.
6119   return llvm::none_of(Types,
6120       [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6121 }
6122 
6123 X86TTIImpl::TTI::MemCmpExpansionOptions
6124 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6125   TTI::MemCmpExpansionOptions Options;
6126   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6127   Options.NumLoadsPerBlock = 2;
6128   // All GPR and vector loads can be unaligned.
6129   Options.AllowOverlappingLoads = true;
6130   if (IsZeroCmp) {
6131     // Only enable vector loads for equality comparison. Right now the vector
6132     // version is not as fast for three way compare (see #33329).
6133     const unsigned PreferredWidth = ST->getPreferVectorWidth();
6134     if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6135       Options.LoadSizes.push_back(64);
6136     if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6137     if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6138   }
6139   if (ST->is64Bit()) {
6140     Options.LoadSizes.push_back(8);
6141   }
6142   Options.LoadSizes.push_back(4);
6143   Options.LoadSizes.push_back(2);
6144   Options.LoadSizes.push_back(1);
6145   return Options;
6146 }
6147 
6148 bool X86TTIImpl::prefersVectorizedAddressing() const {
6149   return supportsGather();
6150 }
6151 
6152 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6153   return false;
6154 }
6155 
6156 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6157   // TODO: We expect this to be beneficial regardless of arch,
6158   // but there are currently some unexplained performance artifacts on Atom.
6159   // As a temporary solution, disable on Atom.
6160   return !(ST->isAtom());
6161 }
6162 
6163 // Get estimation for interleaved load/store operations and strided load.
6164 // \p Indices contains indices for strided load.
6165 // \p Factor - the factor of interleaving.
6166 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
6167 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6168     unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6169     ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6170     TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6171   // VecTy for interleave memop is <VF*Factor x Elt>.
6172   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6173   // VecTy = <12 x i32>.
6174 
6175   // Calculate the number of memory operations (NumOfMemOps), required
6176   // for load/store the VecTy.
6177   MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6178   unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6179   unsigned LegalVTSize = LegalVT.getStoreSize();
6180   unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6181 
6182   // Get the cost of one memory operation.
6183   auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6184                                              LegalVT.getVectorNumElements());
6185   InstructionCost MemOpCost;
6186   bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6187   if (UseMaskedMemOp)
6188     MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6189                                       AddressSpace, CostKind);
6190   else
6191     MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6192                                 AddressSpace, CostKind);
6193 
6194   unsigned VF = VecTy->getNumElements() / Factor;
6195   MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6196 
6197   InstructionCost MaskCost;
6198   if (UseMaskedMemOp) {
6199     APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6200     for (unsigned Index : Indices) {
6201       assert(Index < Factor && "Invalid index for interleaved memory op");
6202       for (unsigned Elm = 0; Elm < VF; Elm++)
6203         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6204     }
6205 
6206     Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6207 
6208     MaskCost = getReplicationShuffleCost(
6209         I1Type, Factor, VF,
6210         UseMaskForGaps ? DemandedLoadStoreElts
6211                        : APInt::getAllOnes(VecTy->getNumElements()),
6212         CostKind);
6213 
6214     // The Gaps mask is invariant and created outside the loop, therefore the
6215     // cost of creating it is not accounted for here. However if we have both
6216     // a MaskForGaps and some other mask that guards the execution of the
6217     // memory access, we need to account for the cost of And-ing the two masks
6218     // inside the loop.
6219     if (UseMaskForGaps) {
6220       auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6221       MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6222     }
6223   }
6224 
6225   if (Opcode == Instruction::Load) {
6226     // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6227     // contain the cost of the optimized shuffle sequence that the
6228     // X86InterleavedAccess pass will generate.
6229     // The cost of loads and stores are computed separately from the table.
6230 
6231     // X86InterleavedAccess support only the following interleaved-access group.
6232     static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6233         {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6234         {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6235         {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6236     };
6237 
6238     if (const auto *Entry =
6239             CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6240       return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6241     //If an entry does not exist, fallback to the default implementation.
6242 
6243     // Kind of shuffle depends on number of loaded values.
6244     // If we load the entire data in one register, we can use a 1-src shuffle.
6245     // Otherwise, we'll merge 2 sources in each operation.
6246     TTI::ShuffleKind ShuffleKind =
6247         (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6248 
6249     InstructionCost ShuffleCost = getShuffleCost(
6250         ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6251 
6252     unsigned NumOfLoadsInInterleaveGrp =
6253         Indices.size() ? Indices.size() : Factor;
6254     auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6255                                           VecTy->getNumElements() / Factor);
6256     InstructionCost NumOfResults =
6257         getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6258 
6259     // About a half of the loads may be folded in shuffles when we have only
6260     // one result. If we have more than one result, or the loads are masked,
6261     // we do not fold loads at all.
6262     unsigned NumOfUnfoldedLoads =
6263         UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6264 
6265     // Get a number of shuffle operations per result.
6266     unsigned NumOfShufflesPerResult =
6267         std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6268 
6269     // The SK_MergeTwoSrc shuffle clobbers one of src operands.
6270     // When we have more than one destination, we need additional instructions
6271     // to keep sources.
6272     InstructionCost NumOfMoves = 0;
6273     if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6274       NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6275 
6276     InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6277                            MaskCost + NumOfUnfoldedLoads * MemOpCost +
6278                            NumOfMoves;
6279 
6280     return Cost;
6281   }
6282 
6283   // Store.
6284   assert(Opcode == Instruction::Store &&
6285          "Expected Store Instruction at this  point");
6286   // X86InterleavedAccess support only the following interleaved-access group.
6287   static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6288       {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6289       {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6290       {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6291 
6292       {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
6293       {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
6294       {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6295       {4, MVT::v64i8, 24}  // interleave 4 x 32i8 into 256i8 (and store)
6296   };
6297 
6298   if (const auto *Entry =
6299           CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6300     return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6301   //If an entry does not exist, fallback to the default implementation.
6302 
6303   // There is no strided stores meanwhile. And store can't be folded in
6304   // shuffle.
6305   unsigned NumOfSources = Factor; // The number of values to be merged.
6306   InstructionCost ShuffleCost = getShuffleCost(
6307       TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6308   unsigned NumOfShufflesPerStore = NumOfSources - 1;
6309 
6310   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
6311   // We need additional instructions to keep sources.
6312   unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6313   InstructionCost Cost =
6314       MaskCost +
6315       NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6316       NumOfMoves;
6317   return Cost;
6318 }
6319 
6320 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6321     unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6322     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6323     bool UseMaskForCond, bool UseMaskForGaps) {
6324   auto *VecTy = cast<FixedVectorType>(BaseTy);
6325 
6326   auto isSupportedOnAVX512 = [&](Type *VecTy) {
6327     Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6328     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6329         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6330       return true;
6331     if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6332       return ST->hasBWI();
6333     if (EltTy->isBFloatTy())
6334       return ST->hasBF16();
6335     return false;
6336   };
6337   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6338     return getInterleavedMemoryOpCostAVX512(
6339         Opcode, VecTy, Factor, Indices, Alignment,
6340         AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6341 
6342   if (UseMaskForCond || UseMaskForGaps)
6343     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6344                                              Alignment, AddressSpace, CostKind,
6345                                              UseMaskForCond, UseMaskForGaps);
6346 
6347   // Get estimation for interleaved load/store operations for SSE-AVX2.
6348   // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6349   // computing the cost using a generic formula as a function of generic
6350   // shuffles. We therefore use a lookup table instead, filled according to
6351   // the instruction sequences that codegen currently generates.
6352 
6353   // VecTy for interleave memop is <VF*Factor x Elt>.
6354   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6355   // VecTy = <12 x i32>.
6356   MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6357 
6358   // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6359   // the VF=2, while v2i128 is an unsupported MVT vector type
6360   // (see MachineValueType.h::getVectorVT()).
6361   if (!LegalVT.isVector())
6362     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6363                                              Alignment, AddressSpace, CostKind);
6364 
6365   unsigned VF = VecTy->getNumElements() / Factor;
6366   Type *ScalarTy = VecTy->getElementType();
6367   // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6368   if (!ScalarTy->isIntegerTy())
6369     ScalarTy =
6370         Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6371 
6372   // Get the cost of all the memory operations.
6373   // FIXME: discount dead loads.
6374   InstructionCost MemOpCosts = getMemoryOpCost(
6375       Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6376 
6377   auto *VT = FixedVectorType::get(ScalarTy, VF);
6378   EVT ETy = TLI->getValueType(DL, VT);
6379   if (!ETy.isSimple())
6380     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6381                                              Alignment, AddressSpace, CostKind);
6382 
6383   // TODO: Complete for other data-types and strides.
6384   // Each combination of Stride, element bit width and VF results in a different
6385   // sequence; The cost tables are therefore accessed with:
6386   // Factor (stride) and VectorType=VFxiN.
6387   // The Cost accounts only for the shuffle sequence;
6388   // The cost of the loads/stores is accounted for separately.
6389   //
6390   static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6391       {2, MVT::v2i8, 2},  // (load 4i8 and) deinterleave into 2 x 2i8
6392       {2, MVT::v4i8, 2},  // (load 8i8 and) deinterleave into 2 x 4i8
6393       {2, MVT::v8i8, 2},  // (load 16i8 and) deinterleave into 2 x 8i8
6394       {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6395       {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6396 
6397       {2, MVT::v8i16, 6},   // (load 16i16 and) deinterleave into 2 x 8i16
6398       {2, MVT::v16i16, 9},  // (load 32i16 and) deinterleave into 2 x 16i16
6399       {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6400 
6401       {2, MVT::v8i32, 4},   // (load 16i32 and) deinterleave into 2 x 8i32
6402       {2, MVT::v16i32, 8},  // (load 32i32 and) deinterleave into 2 x 16i32
6403       {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6404 
6405       {2, MVT::v4i64, 4},   // (load 8i64 and) deinterleave into 2 x 4i64
6406       {2, MVT::v8i64, 8},   // (load 16i64 and) deinterleave into 2 x 8i64
6407       {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6408       {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6409 
6410       {3, MVT::v2i8, 3},   // (load 6i8 and) deinterleave into 3 x 2i8
6411       {3, MVT::v4i8, 3},   // (load 12i8 and) deinterleave into 3 x 4i8
6412       {3, MVT::v8i8, 6},   // (load 24i8 and) deinterleave into 3 x 8i8
6413       {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6414       {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6415 
6416       {3, MVT::v2i16, 5},   // (load 6i16 and) deinterleave into 3 x 2i16
6417       {3, MVT::v4i16, 7},   // (load 12i16 and) deinterleave into 3 x 4i16
6418       {3, MVT::v8i16, 9},   // (load 24i16 and) deinterleave into 3 x 8i16
6419       {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6420       {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6421 
6422       {3, MVT::v2i32, 3},   // (load 6i32 and) deinterleave into 3 x 2i32
6423       {3, MVT::v4i32, 3},   // (load 12i32 and) deinterleave into 3 x 4i32
6424       {3, MVT::v8i32, 7},   // (load 24i32 and) deinterleave into 3 x 8i32
6425       {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6426       {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6427 
6428       {3, MVT::v2i64, 1},   // (load 6i64 and) deinterleave into 3 x 2i64
6429       {3, MVT::v4i64, 5},   // (load 12i64 and) deinterleave into 3 x 4i64
6430       {3, MVT::v8i64, 10},  // (load 24i64 and) deinterleave into 3 x 8i64
6431       {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6432 
6433       {4, MVT::v2i8, 4},   // (load 8i8 and) deinterleave into 4 x 2i8
6434       {4, MVT::v4i8, 4},   // (load 16i8 and) deinterleave into 4 x 4i8
6435       {4, MVT::v8i8, 12},  // (load 32i8 and) deinterleave into 4 x 8i8
6436       {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6437       {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6438 
6439       {4, MVT::v2i16, 6},    // (load 8i16 and) deinterleave into 4 x 2i16
6440       {4, MVT::v4i16, 17},   // (load 16i16 and) deinterleave into 4 x 4i16
6441       {4, MVT::v8i16, 33},   // (load 32i16 and) deinterleave into 4 x 8i16
6442       {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
6443       {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6444 
6445       {4, MVT::v2i32, 4},   // (load 8i32 and) deinterleave into 4 x 2i32
6446       {4, MVT::v4i32, 8},   // (load 16i32 and) deinterleave into 4 x 4i32
6447       {4, MVT::v8i32, 16},  // (load 32i32 and) deinterleave into 4 x 8i32
6448       {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6449       {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6450 
6451       {4, MVT::v2i64, 6},  // (load 8i64 and) deinterleave into 4 x 2i64
6452       {4, MVT::v4i64, 8},  // (load 16i64 and) deinterleave into 4 x 4i64
6453       {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6454       {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6455 
6456       {6, MVT::v2i8, 6},   // (load 12i8 and) deinterleave into 6 x 2i8
6457       {6, MVT::v4i8, 14},  // (load 24i8 and) deinterleave into 6 x 4i8
6458       {6, MVT::v8i8, 18},  // (load 48i8 and) deinterleave into 6 x 8i8
6459       {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6460       {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6461 
6462       {6, MVT::v2i16, 13},   // (load 12i16 and) deinterleave into 6 x 2i16
6463       {6, MVT::v4i16, 9},    // (load 24i16 and) deinterleave into 6 x 4i16
6464       {6, MVT::v8i16, 39},   // (load 48i16 and) deinterleave into 6 x 8i16
6465       {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6466       {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6467 
6468       {6, MVT::v2i32, 6},   // (load 12i32 and) deinterleave into 6 x 2i32
6469       {6, MVT::v4i32, 15},  // (load 24i32 and) deinterleave into 6 x 4i32
6470       {6, MVT::v8i32, 31},  // (load 48i32 and) deinterleave into 6 x 8i32
6471       {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6472 
6473       {6, MVT::v2i64, 6},  // (load 12i64 and) deinterleave into 6 x 2i64
6474       {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6475       {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6476 
6477       {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6478   };
6479 
6480   static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6481       {2, MVT::v4i16, 2},   // (load 8i16 and) deinterleave into 2 x 4i16
6482   };
6483 
6484   static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6485       {2, MVT::v2i16, 2},   // (load 4i16 and) deinterleave into 2 x 2i16
6486       {2, MVT::v4i16, 7},   // (load 8i16 and) deinterleave into 2 x 4i16
6487 
6488       {2, MVT::v2i32, 2},   // (load 4i32 and) deinterleave into 2 x 2i32
6489       {2, MVT::v4i32, 2},   // (load 8i32 and) deinterleave into 2 x 4i32
6490 
6491       {2, MVT::v2i64, 2},   // (load 4i64 and) deinterleave into 2 x 2i64
6492   };
6493 
6494   static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6495       {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6496       {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6497 
6498       {2, MVT::v8i16, 3},  // interleave 2 x 8i16 into 16i16 (and store)
6499       {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6500       {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6501 
6502       {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
6503       {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
6504       {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
6505       {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6506 
6507       {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
6508       {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
6509       {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
6510       {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6511       {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6512 
6513       {3, MVT::v2i8, 4},   // interleave 3 x 2i8 into 6i8 (and store)
6514       {3, MVT::v4i8, 4},   // interleave 3 x 4i8 into 12i8 (and store)
6515       {3, MVT::v8i8, 6},   // interleave 3 x 8i8 into 24i8 (and store)
6516       {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6517       {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6518 
6519       {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
6520       {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
6521       {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
6522       {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6523       {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6524 
6525       {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
6526       {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
6527       {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
6528       {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6529       {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6530 
6531       {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
6532       {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
6533       {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
6534       {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6535 
6536       {4, MVT::v2i8, 4},   // interleave 4 x 2i8 into 8i8 (and store)
6537       {4, MVT::v4i8, 4},   // interleave 4 x 4i8 into 16i8 (and store)
6538       {4, MVT::v8i8, 4},   // interleave 4 x 8i8 into 32i8 (and store)
6539       {4, MVT::v16i8, 8},  // interleave 4 x 16i8 into 64i8 (and store)
6540       {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6541 
6542       {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
6543       {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
6544       {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
6545       {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6546       {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6547 
6548       {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
6549       {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
6550       {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
6551       {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6552       {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6553 
6554       {4, MVT::v2i64, 6},  // interleave 4 x 2i64 into 8i64 (and store)
6555       {4, MVT::v4i64, 8},  // interleave 4 x 4i64 into 16i64 (and store)
6556       {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6557       {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6558 
6559       {6, MVT::v2i8, 7},   // interleave 6 x 2i8 into 12i8 (and store)
6560       {6, MVT::v4i8, 9},   // interleave 6 x 4i8 into 24i8 (and store)
6561       {6, MVT::v8i8, 16},  // interleave 6 x 8i8 into 48i8 (and store)
6562       {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6563       {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6564 
6565       {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
6566       {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
6567       {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
6568       {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6569       {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6570 
6571       {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
6572       {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
6573       {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
6574       {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6575 
6576       {6, MVT::v2i64, 8},  // interleave 6 x 2i64 into 12i64 (and store)
6577       {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6578       {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6579   };
6580 
6581   static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6582       {2, MVT::v2i8, 1},   // interleave 2 x 2i8 into 4i8 (and store)
6583       {2, MVT::v4i8, 1},   // interleave 2 x 4i8 into 8i8 (and store)
6584       {2, MVT::v8i8, 1},   // interleave 2 x 8i8 into 16i8 (and store)
6585 
6586       {2, MVT::v2i16, 1},  // interleave 2 x 2i16 into 4i16 (and store)
6587       {2, MVT::v4i16, 1},  // interleave 2 x 4i16 into 8i16 (and store)
6588 
6589       {2, MVT::v2i32, 1},  // interleave 2 x 2i32 into 4i32 (and store)
6590   };
6591 
6592   if (Opcode == Instruction::Load) {
6593     auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6594                               MemOpCosts](const CostTblEntry *Entry) {
6595       // NOTE: this is just an approximation!
6596       //       It can over/under -estimate the cost!
6597       return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6598     };
6599 
6600     if (ST->hasAVX2())
6601       if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6602                                               ETy.getSimpleVT()))
6603         return GetDiscountedCost(Entry);
6604 
6605     if (ST->hasSSSE3())
6606       if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6607                                               ETy.getSimpleVT()))
6608         return GetDiscountedCost(Entry);
6609 
6610     if (ST->hasSSE2())
6611       if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6612                                               ETy.getSimpleVT()))
6613         return GetDiscountedCost(Entry);
6614   } else {
6615     assert(Opcode == Instruction::Store &&
6616            "Expected Store Instruction at this point");
6617     assert((!Indices.size() || Indices.size() == Factor) &&
6618            "Interleaved store only supports fully-interleaved groups.");
6619     if (ST->hasAVX2())
6620       if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6621                                               ETy.getSimpleVT()))
6622         return MemOpCosts + Entry->Cost;
6623 
6624     if (ST->hasSSE2())
6625       if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6626                                               ETy.getSimpleVT()))
6627         return MemOpCosts + Entry->Cost;
6628   }
6629 
6630   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6631                                            Alignment, AddressSpace, CostKind,
6632                                            UseMaskForCond, UseMaskForGaps);
6633 }
6634 
6635 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6636                                                  int64_t BaseOffset,
6637                                                  bool HasBaseReg, int64_t Scale,
6638                                                  unsigned AddrSpace) const {
6639   // Scaling factors are not free at all.
6640   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6641   // will take 2 allocations in the out of order engine instead of 1
6642   // for plain addressing mode, i.e. inst (reg1).
6643   // E.g.,
6644   // vaddps (%rsi,%rdx), %ymm0, %ymm1
6645   // Requires two allocations (one for the load, one for the computation)
6646   // whereas:
6647   // vaddps (%rsi), %ymm0, %ymm1
6648   // Requires just 1 allocation, i.e., freeing allocations for other operations
6649   // and having less micro operations to execute.
6650   //
6651   // For some X86 architectures, this is even worse because for instance for
6652   // stores, the complex addressing mode forces the instruction to use the
6653   // "load" ports instead of the dedicated "store" port.
6654   // E.g., on Haswell:
6655   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6656   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6657   TargetLoweringBase::AddrMode AM;
6658   AM.BaseGV = BaseGV;
6659   AM.BaseOffs = BaseOffset;
6660   AM.HasBaseReg = HasBaseReg;
6661   AM.Scale = Scale;
6662   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6663     // Scale represents reg2 * scale, thus account for 1
6664     // as soon as we use a second register.
6665     return AM.Scale != 0;
6666   return -1;
6667 }
6668